1/* This file is part of the Kate project.
2 *
3 * Copyright (C) 2010 Christoph Cullmann <cullmann@kde.org>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21#ifndef KATE_TEXTLOADER_H
22#define KATE_TEXTLOADER_H
23
24#include <QtCore/QString>
25#include <QtCore/QFile>
26#include <QtCore/QCryptographicHash>
27
28// on the fly compression
29#include <kfilterdev.h>
30#include <kmimetype.h>
31
32namespace Kate {
33
34/**
35 * loader block size, load 256 kb at once per default
36 * if file size is smaller, fall back to file size
37 * must be a multiple of 2
38 */
39static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
40
41/**
42 * File Loader, will handle reading of files + detecting encoding
43 */
44class TextLoader
45{
46 public:
47 /**
48 * Construct file loader for given file.
49 * @param filename file to open
50 * @param proberType prober type
51 */
52 TextLoader (const QString &filename, KEncodingProber::ProberType proberType)
53 : m_codec (0)
54 , m_eof (false) // default to not eof
55 , m_lastWasEndOfLine (true) // at start of file, we had a virtual newline
56 , m_lastWasR (false) // we have not found a \r as last char
57 , m_position (0)
58 , m_lastLineStart (0)
59 , m_eol (TextBuffer::eolUnknown) // no eol type detected atm
60 , m_buffer (KATE_FILE_LOADER_BS, 0)
61 , m_digest (QCryptographicHash::Md5)
62 , m_converterState (0)
63 , m_bomFound (false)
64 , m_firstRead (true)
65 , m_proberType (proberType)
66 {
67 // try to get mimetype for on the fly decompression, don't rely on filename!
68 QFile testMime (filename);
69 if (testMime.open (QIODevice::ReadOnly))
70 m_mimeType = KMimeType::findByContent (&testMime)->name ();
71 else
72 m_mimeType = KMimeType::findByPath (filename, 0, false)->name ();
73
74 // construct filter device
75 m_file = KFilterDev::deviceForFile (filename, m_mimeType, false);
76 }
77
78 /**
79 * Destructor
80 */
81 ~TextLoader ()
82 {
83 delete m_file;
84 delete m_converterState;
85 }
86
87 /**
88 * open file with given codec
89 * @param codec codec to use, if 0, will do some auto-dectect or fallback
90 * @return success
91 */
92 bool open (QTextCodec *codec)
93 {
94 m_codec = codec;
95 m_eof = false;
96 m_lastWasEndOfLine = true;
97 m_lastWasR = false;
98 m_position = 0;
99 m_lastLineStart = 0;
100 m_eol = TextBuffer::eolUnknown;
101 m_text.clear ();
102 delete m_converterState;
103 m_converterState = new QTextCodec::ConverterState (QTextCodec::ConvertInvalidToNull);
104 m_bomFound = false;
105 m_firstRead = true;
106
107 // if already opened, close the file...
108 if (m_file->isOpen())
109 m_file->close ();
110
111 return m_file->open (QIODevice::ReadOnly);
112 }
113
114 /**
115 * end of file reached?
116 * @return end of file reached
117 */
118 bool eof () const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); }
119
120 /**
121 * Detected end of line mode for this file.
122 * Detected during reading, is valid after complete file is read.
123 * @return eol mode of this file
124 */
125 TextBuffer::EndOfLineMode eol () const { return m_eol; }
126
127 /**
128 * BOM found?
129 * @return byte order mark found?
130 */
131 bool byteOrderMarkFound () const { return m_bomFound; }
132
133 /**
134 * mime type used to create filter dev
135 * @return mime-type of filter device
136 */
137 const QString &mimeTypeForFilterDev () const { return m_mimeType; }
138
139 /**
140 * internal unicode data array
141 * @return internal unicode data
142 */
143 const QChar *unicode () const { return m_text.unicode(); }
144
145 /**
146 * Get codec for this loader
147 * @return currently in use codec of this loader
148 */
149 QTextCodec *textCodec () const { return m_codec; }
150
151 /**
152 * read a line, return length + offset in unicode data
153 * @param offset offset into internal unicode data for read line
154 * @param length length of read line
155 * @return true if no encoding errors occurred
156 */
157 bool readLine (int &offset, int &length)
158 {
159 length = 0;
160 offset = 0;
161 bool encodingError = false;
162
163 static const QLatin1Char cr(QLatin1Char('\r'));
164 static const QLatin1Char lf(QLatin1Char('\n'));
165
166 /**
167 * did we read two time but got no stuff? encoding error
168 * fixes problem with one character latin-1 files, which lead to crash otherwise!
169 * bug 272579
170 */
171 bool failedToConvertOnce = false;
172
173 /**
174 * reading loop
175 */
176 while (m_position <= m_text.length())
177 {
178 if (m_position == m_text.length())
179 {
180 // try to load more text if something is around
181 if (!m_eof)
182 {
183 // kill the old lines...
184 m_text.remove (0, m_lastLineStart);
185
186 // try to read new data
187 const int c = m_file->read(m_buffer.data(), m_buffer.size());
188
189 // if any text is there, append it....
190 if (c > 0)
191 {
192 // update md5 hash sum
193 m_digest.addData (m_buffer.data(), c);
194
195 // detect byte order marks & codec for byte order markers on first read
196 int bomBytes = 0;
197 if (m_firstRead) {
198 // use first 16 bytes max to allow BOM detection of codec
199 QByteArray bom (m_buffer.data(), qMin (16, c));
200 QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText (bom, 0);
201
202 // if codec != null, we found a BOM!
203 if (codecForByteOrderMark) {
204 m_bomFound = true;
205
206 // eat away the different boms!
207 int mib = codecForByteOrderMark->mibEnum ();
208 if (mib == 106) // utf8
209 bomBytes = 3;
210 if (mib == 1013 || mib == 1014 || mib == 1015) // utf16
211 bomBytes = 2;
212 if (mib == 1017 || mib == 1018 || mib == 1019) // utf32
213 bomBytes = 4;
214 }
215
216 /**
217 * if no codec given, do autodetection
218 */
219 if (!m_codec) {
220 /**
221 * byte order said something about encoding?
222 */
223 if (codecForByteOrderMark)
224 m_codec = codecForByteOrderMark;
225 else {
226 /**
227 * no unicode BOM found, trigger prober
228 */
229 KEncodingProber prober (m_proberType);
230 prober.feed (m_buffer.constData(), c);
231
232 // we found codec with some confidence?
233 if (prober.confidence() > 0.5)
234 m_codec = QTextCodec::codecForName(prober.encoding());
235
236 // no codec, no chance, encoding error
237 if (!m_codec)
238 return false;
239 }
240 }
241
242 m_firstRead = false;
243 }
244
245 Q_ASSERT (m_codec);
246 QString unicode = m_codec->toUnicode (m_buffer.constData() + bomBytes, c - bomBytes, m_converterState);
247
248 // detect broken encoding
249 for (int i = 0; i < unicode.size(); ++i) {
250 if (unicode[i] == 0) {
251 encodingError = true;
252 break;
253 }
254 }
255
256 m_text.append (unicode);
257 }
258
259 // is file completely read ?
260 m_eof = (c == -1) || (c == 0);
261
262 // recalc current pos and last pos
263 m_position -= m_lastLineStart;
264 m_lastLineStart = 0;
265 }
266
267 // oh oh, end of file, escape !
268 if (m_eof && (m_position == m_text.length()))
269 {
270 m_lastWasEndOfLine = false;
271
272 // line data
273 offset = m_lastLineStart;
274 length = m_position-m_lastLineStart;
275
276 m_lastLineStart = m_position;
277
278 return !encodingError && !failedToConvertOnce;
279 }
280
281 // empty? try again
282 if (m_position == m_text.length()) {
283 failedToConvertOnce = true;
284 continue;
285 }
286 }
287
288 if (m_text.at(m_position) == lf)
289 {
290 m_lastWasEndOfLine = true;
291
292 if (m_lastWasR)
293 {
294 m_lastLineStart++;
295 m_lastWasR = false;
296 m_eol = TextBuffer::eolDos;
297 }
298 else
299 {
300 // line data
301 offset = m_lastLineStart;
302 length = m_position-m_lastLineStart;
303
304 m_lastLineStart = m_position+1;
305 m_position++;
306
307 // only win, if not dos!
308 if (m_eol != TextBuffer::eolDos)
309 m_eol = TextBuffer::eolUnix;
310
311 return !encodingError;
312 }
313 }
314 else if (m_text.at(m_position) == cr)
315 {
316 m_lastWasEndOfLine = true;
317 m_lastWasR = true;
318
319 // line data
320 offset = m_lastLineStart;
321 length = m_position-m_lastLineStart;
322
323 m_lastLineStart = m_position+1;
324 m_position++;
325
326 // should only win of first time!
327 if (m_eol == TextBuffer::eolUnknown)
328 m_eol = TextBuffer::eolMac;
329
330 return !encodingError;
331 }
332 else if (m_text.at(m_position) == QChar::LineSeparator)
333 {
334 m_lastWasEndOfLine = true;
335
336 // line data
337 offset = m_lastLineStart;
338 length = m_position-m_lastLineStart;
339
340 m_lastLineStart = m_position+1;
341 m_position++;
342
343 return !encodingError;
344 }
345 else
346 {
347 m_lastWasEndOfLine = false;
348 m_lastWasR = false;
349 }
350
351 m_position++;
352 }
353
354 return !encodingError;
355 }
356
357 QByteArray digest ()
358 {
359 return m_digest.result ();
360 }
361
362 private:
363 QTextCodec *m_codec;
364 bool m_eof;
365 bool m_lastWasEndOfLine;
366 bool m_lastWasR;
367 int m_position;
368 int m_lastLineStart;
369 TextBuffer::EndOfLineMode m_eol;
370 QString m_mimeType;
371 QIODevice *m_file;
372 QByteArray m_buffer;
373 QCryptographicHash m_digest;
374 QString m_text;
375 QTextCodec::ConverterState *m_converterState;
376 bool m_bomFound;
377 bool m_firstRead;
378 KEncodingProber::ProberType m_proberType;
379};
380
381}
382
383#endif
384