1 | /* This file is part of the Kate project. |
2 | * |
3 | * Copyright (C) 2010 Christoph Cullmann <cullmann@kde.org> |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Library General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Library General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Library General Public License |
16 | * along with this library; see the file COPYING.LIB. If not, write to |
17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
18 | * Boston, MA 02110-1301, USA. |
19 | */ |
20 | |
21 | #ifndef KATE_TEXTLOADER_H |
22 | #define KATE_TEXTLOADER_H |
23 | |
24 | #include <QtCore/QString> |
25 | #include <QtCore/QFile> |
26 | #include <QtCore/QCryptographicHash> |
27 | |
28 | // on the fly compression |
29 | #include <kfilterdev.h> |
30 | #include <kmimetype.h> |
31 | |
32 | namespace Kate { |
33 | |
/**
 * Loader block size in bytes: 256 KiB are requested from the device per read.
 * TextLoader sizes its raw read buffer with this value; if less data is
 * available (e.g. the file is smaller), a read simply yields fewer bytes.
 * Must be a multiple of 2.
 * NOTE(review): presumably so that two-byte encoded text is not split in the
 * middle of a code unit at a block boundary -- confirm before changing.
 */
static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
40 | |
41 | /** |
42 | * File Loader, will handle reading of files + detecting encoding |
43 | */ |
44 | class TextLoader |
45 | { |
46 | public: |
47 | /** |
48 | * Construct file loader for given file. |
49 | * @param filename file to open |
50 | * @param proberType prober type |
51 | */ |
52 | TextLoader (const QString &filename, KEncodingProber::ProberType proberType) |
53 | : m_codec (0) |
54 | , m_eof (false) // default to not eof |
55 | , m_lastWasEndOfLine (true) // at start of file, we had a virtual newline |
56 | , m_lastWasR (false) // we have not found a \r as last char |
57 | , m_position (0) |
58 | , m_lastLineStart (0) |
59 | , m_eol (TextBuffer::eolUnknown) // no eol type detected atm |
60 | , m_buffer (KATE_FILE_LOADER_BS, 0) |
61 | , m_digest (QCryptographicHash::Md5) |
62 | , m_converterState (0) |
63 | , m_bomFound (false) |
64 | , m_firstRead (true) |
65 | , m_proberType (proberType) |
66 | { |
67 | // try to get mimetype for on the fly decompression, don't rely on filename! |
68 | QFile testMime (filename); |
69 | if (testMime.open (QIODevice::ReadOnly)) |
70 | m_mimeType = KMimeType::findByContent (&testMime)->name (); |
71 | else |
72 | m_mimeType = KMimeType::findByPath (filename, 0, false)->name (); |
73 | |
74 | // construct filter device |
75 | m_file = KFilterDev::deviceForFile (filename, m_mimeType, false); |
76 | } |
77 | |
78 | /** |
79 | * Destructor |
80 | */ |
81 | ~TextLoader () |
82 | { |
83 | delete m_file; |
84 | delete m_converterState; |
85 | } |
86 | |
87 | /** |
88 | * open file with given codec |
89 | * @param codec codec to use, if 0, will do some auto-dectect or fallback |
90 | * @return success |
91 | */ |
92 | bool open (QTextCodec *codec) |
93 | { |
94 | m_codec = codec; |
95 | m_eof = false; |
96 | m_lastWasEndOfLine = true; |
97 | m_lastWasR = false; |
98 | m_position = 0; |
99 | m_lastLineStart = 0; |
100 | m_eol = TextBuffer::eolUnknown; |
101 | m_text.clear (); |
102 | delete m_converterState; |
103 | m_converterState = new QTextCodec::ConverterState (QTextCodec::ConvertInvalidToNull); |
104 | m_bomFound = false; |
105 | m_firstRead = true; |
106 | |
107 | // if already opened, close the file... |
108 | if (m_file->isOpen()) |
109 | m_file->close (); |
110 | |
111 | return m_file->open (QIODevice::ReadOnly); |
112 | } |
113 | |
114 | /** |
115 | * end of file reached? |
116 | * @return end of file reached |
117 | */ |
118 | bool eof () const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); } |
119 | |
120 | /** |
121 | * Detected end of line mode for this file. |
122 | * Detected during reading, is valid after complete file is read. |
123 | * @return eol mode of this file |
124 | */ |
125 | TextBuffer::EndOfLineMode eol () const { return m_eol; } |
126 | |
127 | /** |
128 | * BOM found? |
129 | * @return byte order mark found? |
130 | */ |
131 | bool byteOrderMarkFound () const { return m_bomFound; } |
132 | |
133 | /** |
134 | * mime type used to create filter dev |
135 | * @return mime-type of filter device |
136 | */ |
137 | const QString &mimeTypeForFilterDev () const { return m_mimeType; } |
138 | |
139 | /** |
140 | * internal unicode data array |
141 | * @return internal unicode data |
142 | */ |
143 | const QChar *unicode () const { return m_text.unicode(); } |
144 | |
145 | /** |
146 | * Get codec for this loader |
147 | * @return currently in use codec of this loader |
148 | */ |
149 | QTextCodec *textCodec () const { return m_codec; } |
150 | |
151 | /** |
152 | * read a line, return length + offset in unicode data |
153 | * @param offset offset into internal unicode data for read line |
154 | * @param length length of read line |
155 | * @return true if no encoding errors occurred |
156 | */ |
157 | bool readLine (int &offset, int &length) |
158 | { |
159 | length = 0; |
160 | offset = 0; |
161 | bool encodingError = false; |
162 | |
163 | static const QLatin1Char cr(QLatin1Char('\r')); |
164 | static const QLatin1Char lf(QLatin1Char('\n')); |
165 | |
166 | /** |
167 | * did we read two time but got no stuff? encoding error |
168 | * fixes problem with one character latin-1 files, which lead to crash otherwise! |
169 | * bug 272579 |
170 | */ |
171 | bool failedToConvertOnce = false; |
172 | |
173 | /** |
174 | * reading loop |
175 | */ |
176 | while (m_position <= m_text.length()) |
177 | { |
178 | if (m_position == m_text.length()) |
179 | { |
180 | // try to load more text if something is around |
181 | if (!m_eof) |
182 | { |
183 | // kill the old lines... |
184 | m_text.remove (0, m_lastLineStart); |
185 | |
186 | // try to read new data |
187 | const int c = m_file->read(m_buffer.data(), m_buffer.size()); |
188 | |
189 | // if any text is there, append it.... |
190 | if (c > 0) |
191 | { |
192 | // update md5 hash sum |
193 | m_digest.addData (m_buffer.data(), c); |
194 | |
195 | // detect byte order marks & codec for byte order markers on first read |
196 | int bomBytes = 0; |
197 | if (m_firstRead) { |
198 | // use first 16 bytes max to allow BOM detection of codec |
199 | QByteArray bom (m_buffer.data(), qMin (16, c)); |
200 | QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText (bom, 0); |
201 | |
202 | // if codec != null, we found a BOM! |
203 | if (codecForByteOrderMark) { |
204 | m_bomFound = true; |
205 | |
206 | // eat away the different boms! |
207 | int mib = codecForByteOrderMark->mibEnum (); |
208 | if (mib == 106) // utf8 |
209 | bomBytes = 3; |
210 | if (mib == 1013 || mib == 1014 || mib == 1015) // utf16 |
211 | bomBytes = 2; |
212 | if (mib == 1017 || mib == 1018 || mib == 1019) // utf32 |
213 | bomBytes = 4; |
214 | } |
215 | |
216 | /** |
217 | * if no codec given, do autodetection |
218 | */ |
219 | if (!m_codec) { |
220 | /** |
221 | * byte order said something about encoding? |
222 | */ |
223 | if (codecForByteOrderMark) |
224 | m_codec = codecForByteOrderMark; |
225 | else { |
226 | /** |
227 | * no unicode BOM found, trigger prober |
228 | */ |
229 | KEncodingProber prober (m_proberType); |
230 | prober.feed (m_buffer.constData(), c); |
231 | |
232 | // we found codec with some confidence? |
233 | if (prober.confidence() > 0.5) |
234 | m_codec = QTextCodec::codecForName(prober.encoding()); |
235 | |
236 | // no codec, no chance, encoding error |
237 | if (!m_codec) |
238 | return false; |
239 | } |
240 | } |
241 | |
242 | m_firstRead = false; |
243 | } |
244 | |
245 | Q_ASSERT (m_codec); |
246 | QString unicode = m_codec->toUnicode (m_buffer.constData() + bomBytes, c - bomBytes, m_converterState); |
247 | |
248 | // detect broken encoding |
249 | for (int i = 0; i < unicode.size(); ++i) { |
250 | if (unicode[i] == 0) { |
251 | encodingError = true; |
252 | break; |
253 | } |
254 | } |
255 | |
256 | m_text.append (unicode); |
257 | } |
258 | |
259 | // is file completely read ? |
260 | m_eof = (c == -1) || (c == 0); |
261 | |
262 | // recalc current pos and last pos |
263 | m_position -= m_lastLineStart; |
264 | m_lastLineStart = 0; |
265 | } |
266 | |
267 | // oh oh, end of file, escape ! |
268 | if (m_eof && (m_position == m_text.length())) |
269 | { |
270 | m_lastWasEndOfLine = false; |
271 | |
272 | // line data |
273 | offset = m_lastLineStart; |
274 | length = m_position-m_lastLineStart; |
275 | |
276 | m_lastLineStart = m_position; |
277 | |
278 | return !encodingError && !failedToConvertOnce; |
279 | } |
280 | |
281 | // empty? try again |
282 | if (m_position == m_text.length()) { |
283 | failedToConvertOnce = true; |
284 | continue; |
285 | } |
286 | } |
287 | |
288 | if (m_text.at(m_position) == lf) |
289 | { |
290 | m_lastWasEndOfLine = true; |
291 | |
292 | if (m_lastWasR) |
293 | { |
294 | m_lastLineStart++; |
295 | m_lastWasR = false; |
296 | m_eol = TextBuffer::eolDos; |
297 | } |
298 | else |
299 | { |
300 | // line data |
301 | offset = m_lastLineStart; |
302 | length = m_position-m_lastLineStart; |
303 | |
304 | m_lastLineStart = m_position+1; |
305 | m_position++; |
306 | |
307 | // only win, if not dos! |
308 | if (m_eol != TextBuffer::eolDos) |
309 | m_eol = TextBuffer::eolUnix; |
310 | |
311 | return !encodingError; |
312 | } |
313 | } |
314 | else if (m_text.at(m_position) == cr) |
315 | { |
316 | m_lastWasEndOfLine = true; |
317 | m_lastWasR = true; |
318 | |
319 | // line data |
320 | offset = m_lastLineStart; |
321 | length = m_position-m_lastLineStart; |
322 | |
323 | m_lastLineStart = m_position+1; |
324 | m_position++; |
325 | |
326 | // should only win of first time! |
327 | if (m_eol == TextBuffer::eolUnknown) |
328 | m_eol = TextBuffer::eolMac; |
329 | |
330 | return !encodingError; |
331 | } |
332 | else if (m_text.at(m_position) == QChar::LineSeparator) |
333 | { |
334 | m_lastWasEndOfLine = true; |
335 | |
336 | // line data |
337 | offset = m_lastLineStart; |
338 | length = m_position-m_lastLineStart; |
339 | |
340 | m_lastLineStart = m_position+1; |
341 | m_position++; |
342 | |
343 | return !encodingError; |
344 | } |
345 | else |
346 | { |
347 | m_lastWasEndOfLine = false; |
348 | m_lastWasR = false; |
349 | } |
350 | |
351 | m_position++; |
352 | } |
353 | |
354 | return !encodingError; |
355 | } |
356 | |
357 | QByteArray digest () |
358 | { |
359 | return m_digest.result (); |
360 | } |
361 | |
362 | private: |
363 | QTextCodec *m_codec; |
364 | bool m_eof; |
365 | bool m_lastWasEndOfLine; |
366 | bool m_lastWasR; |
367 | int m_position; |
368 | int m_lastLineStart; |
369 | TextBuffer::EndOfLineMode m_eol; |
370 | QString m_mimeType; |
371 | QIODevice *m_file; |
372 | QByteArray m_buffer; |
373 | QCryptographicHash m_digest; |
374 | QString m_text; |
375 | QTextCodec::ConverterState *m_converterState; |
376 | bool m_bomFound; |
377 | bool m_firstRead; |
378 | KEncodingProber::ProberType m_proberType; |
379 | }; |
380 | |
381 | } |
382 | |
383 | #endif |
384 | |