katetextloader.h [applications/kate/part/buffer/katetextloader.h]

1	/* This file is part of the Kate project.
2	*
3	* Copyright (C) 2010 Christoph Cullmann <cullmann@kde.org>
4	*
5	* This library is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU Library General Public
7	* License as published by the Free Software Foundation; either
8	* version 2 of the License, or (at your option) any later version.
9	*
10	* This library is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	* Library General Public License for more details.
14	*
15	* You should have received a copy of the GNU Library General Public License
16	* along with this library; see the file COPYING.LIB. If not, write to
17	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18	* Boston, MA 02110-1301, USA.
19	*/
20
21	#ifndef KATE_TEXTLOADER_H
22	#define KATE_TEXTLOADER_H
23
24	#include <QtCore/QString>
25	#include <QtCore/QFile>
26	#include <QtCore/QCryptographicHash>
27
28	// on the fly compression
29	#include <kfilterdev.h>
30	#include <kmimetype.h>
31
32	namespace Kate {
33
34	/**
35	* loader block size, load 256 kb at once per default
36	* if file size is smaller, fall back to file size
37	* must be a multiple of 2
38	*/
39	static const qint64 KATE_FILE_LOADER_BS = `256` * `1024`;
40
41	/**
42	* File Loader, will handle reading of files + detecting encoding
43	*/
44	class TextLoader
45	{
46	public:
47	/**
48	* Construct file loader for given file.
49	* @param filename file to open
50	* @param proberType prober type
51	*/
52	TextLoader (const QString &filename, KEncodingProber::ProberType proberType)
53	: m_codec (`0`)
54	, m_eof (false) // default to not eof
55	, m_lastWasEndOfLine (true) // at start of file, we had a virtual newline
56	, m_lastWasR (false) // we have not found a \r as last char
57	, m_position (`0`)
58	, m_lastLineStart (`0`)
59	, m_eol (TextBuffer::eolUnknown) // no eol type detected atm
60	, m_buffer (KATE_FILE_LOADER_BS, `0`)
61	, m_digest (QCryptographicHash::Md5)
62	, m_converterState (`0`)
63	, m_bomFound (false)
64	, m_firstRead (true)
65	, m_proberType (proberType)
66	{
67	// try to get mimetype for on the fly decompression, don't rely on filename!
68	QFile testMime (filename);
69	if (testMime.open (QIODevice::ReadOnly))
70	m_mimeType = KMimeType::findByContent (&testMime)->name ();
71	else
72	m_mimeType = KMimeType::findByPath (filename, `0`, false)->name ();
73
74	// construct filter device
75	m_file = KFilterDev::deviceForFile (filename, m_mimeType, false);
76	}
77
78	/**
79	* Destructor
80	*/
81	~TextLoader ()
82	{
83	delete m_file;
84	delete m_converterState;
85	}
86
87	/**
88	* open file with given codec
89	* @param codec codec to use, if 0, will do some auto-dectect or fallback
90	* @return success
91	*/
92	bool open (QTextCodec *codec)
93	{
94	m_codec = codec;
95	m_eof = false;
96	m_lastWasEndOfLine = true;
97	m_lastWasR = false;
98	m_position = `0`;
99	m_lastLineStart = `0`;
100	m_eol = TextBuffer::eolUnknown;
101	m_text.clear ();
102	delete m_converterState;
103	m_converterState = new QTextCodec::ConverterState (QTextCodec::ConvertInvalidToNull);
104	m_bomFound = false;
105	m_firstRead = true;
106
107	// if already opened, close the file...
108	if (m_file->isOpen())
109	m_file->close ();
110
111	return m_file->open (QIODevice::ReadOnly);
112	}
113
114	/**
115	* end of file reached?
116	* @return end of file reached
117	*/
118	bool eof () const { return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); }
119
120	/**
121	* Detected end of line mode for this file.
122	* Detected during reading, is valid after complete file is read.
123	* @return eol mode of this file
124	*/
125	TextBuffer::EndOfLineMode eol () const { return m_eol; }
126
127	/**
128	* BOM found?
129	* @return byte order mark found?
130	*/
131	bool byteOrderMarkFound () const { return m_bomFound; }
132
133	/**
134	* mime type used to create filter dev
135	* @return mime-type of filter device
136	*/
137	const QString &mimeTypeForFilterDev () const { return m_mimeType; }
138
139	/**
140	* internal unicode data array
141	* @return internal unicode data
142	*/
143	const QChar unicode () const* { return m_text.unicode(); }
144
145	/**
146	* Get codec for this loader
147	* @return currently in use codec of this loader
148	*/
149	QTextCodec textCodec () const* { return m_codec; }
150
151	/**
152	* read a line, return length + offset in unicode data
153	* @param offset offset into internal unicode data for read line
154	* @param length length of read line
155	* @return true if no encoding errors occurred
156	*/
157	bool readLine (int &offset, int &length)
158	{
159	length = `0`;
160	offset = `0`;
161	bool encodingError = false;
162
163	static const QLatin1Char cr(QLatin1Char ('\r'));
164	static const QLatin1Char lf(QLatin1Char ('\n'));
165
166	/**
167	* did we read two time but got no stuff? encoding error
168	* fixes problem with one character latin-1 files, which lead to crash otherwise!
169	* bug 272579
170	*/
171	bool failedToConvertOnce = false;
172
173	/**
174	* reading loop
175	*/
176	while (m_position <= m_text.length())
177	{
178	if (m_position == m_text.length())
179	{
180	// try to load more text if something is around
181	if (!m_eof)
182	{
183	// kill the old lines...
184	m_text.remove (`0`, m_lastLineStart);
185
186	// try to read new data
187	const int c = m_file->read(m_buffer.data(), m_buffer.size());
188
189	// if any text is there, append it....
190	if (c > `0`)
191	{
192	// update md5 hash sum
193	m_digest.addData (m_buffer.data(), c);
194
195	// detect byte order marks & codec for byte order markers on first read
196	int bomBytes = `0`;
197	if (m_firstRead) {
198	// use first 16 bytes max to allow BOM detection of codec
199	QByteArray bom (m_buffer.data(), qMin (`16`, c));
200	QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText (bom, `0`);
201
202	// if codec != null, we found a BOM!
203	if (codecForByteOrderMark) {
204	m_bomFound = true;
205
206	// eat away the different boms!
207	int mib = codecForByteOrderMark->mibEnum ();
208	if (mib == `106`) // utf8
209	bomBytes = `3`;
210	if (mib == `1013` \|\| mib == `1014` \|\| mib == `1015`) // utf16
211	bomBytes = `2`;
212	if (mib == `1017` \|\| mib == `1018` \|\| mib == `1019`) // utf32
213	bomBytes = `4`;
214	}
215
216	/**
217	* if no codec given, do autodetection
218	*/
219	if (!m_codec) {
220	/**
221	* byte order said something about encoding?
222	*/
223	if (codecForByteOrderMark)
224	m_codec = codecForByteOrderMark;
225	else {
226	/**
227	* no unicode BOM found, trigger prober
228	*/
229	KEncodingProber prober (m_proberType);
230	prober.feed (m_buffer.constData(), c);
231
232	// we found codec with some confidence?
233	if (prober.confidence() > `0.5`)
234	m_codec = QTextCodec::codecForName(prober.encoding());
235
236	// no codec, no chance, encoding error
237	if (!m_codec)
238	return false;
239	}
240	}
241
242	m_firstRead = false;
243	}
244
245	Q_ASSERT (m_codec);
246	QString unicode = m_codec->toUnicode (m_buffer.constData() + bomBytes, c - bomBytes, m_converterState);
247
248	// detect broken encoding
249	for (int i = `0`; i < unicode.size(); ++i) {
250	if (unicode [i] == `0`) {
251	encodingError = true;
252	break;
253	}
254	}
255
256	m_text.append (unicode);
257	}
258
259	// is file completely read ?
260	m_eof = (c == -`1`) \|\| (c == `0`);
261
262	// recalc current pos and last pos
263	m_position -= m_lastLineStart;
264	m_lastLineStart = `0`;
265	}
266
267	// oh oh, end of file, escape !
268	if (m_eof && (m_position == m_text.length()))
269	{
270	m_lastWasEndOfLine = false;
271
272	// line data
273	offset = m_lastLineStart;
274	length = m_position-m_lastLineStart;
275
276	m_lastLineStart = m_position;
277
278	return !encodingError && !failedToConvertOnce;
279	}
280
281	// empty? try again
282	if (m_position == m_text.length()) {
283	failedToConvertOnce = true;
284	continue;
285	}
286	}
287
288	if (m_text.at(m_position) == lf)
289	{
290	m_lastWasEndOfLine = true;
291
292	if (m_lastWasR)
293	{
294	m_lastLineStart++;
295	m_lastWasR = false;
296	m_eol = TextBuffer::eolDos;
297	}
298	else
299	{
300	// line data
301	offset = m_lastLineStart;
302	length = m_position-m_lastLineStart;
303
304	m_lastLineStart = m_position+`1`;
305	m_position++;
306
307	// only win, if not dos!
308	if (m_eol != TextBuffer::eolDos)
309	m_eol = TextBuffer::eolUnix;
310
311	return !encodingError;
312	}
313	}
314	else if (m_text.at(m_position) == cr)
315	{
316	m_lastWasEndOfLine = true;
317	m_lastWasR = true;
318
319	// line data
320	offset = m_lastLineStart;
321	length = m_position-m_lastLineStart;
322
323	m_lastLineStart = m_position+`1`;
324	m_position++;
325
326	// should only win of first time!
327	if (m_eol == TextBuffer::eolUnknown)
328	m_eol = TextBuffer::eolMac;
329
330	return !encodingError;
331	}
332	else if (m_text.at(m_position) == QChar::LineSeparator)
333	{
334	m_lastWasEndOfLine = true;
335
336	// line data
337	offset = m_lastLineStart;
338	length = m_position-m_lastLineStart;
339
340	m_lastLineStart = m_position+`1`;
341	m_position++;
342
343	return !encodingError;
344	}
345	else
346	{
347	m_lastWasEndOfLine = false;
348	m_lastWasR = false;
349	}
350
351	m_position++;
352	}
353
354	return !encodingError;
355	}
356
357	QByteArray digest ()
358	{
359	return m_digest.result ();
360	}
361
362	private:
363	QTextCodec *m_codec;
364	bool m_eof;
365	bool m_lastWasEndOfLine;
366	bool m_lastWasR;
367	int m_position;
368	int m_lastLineStart;
369	TextBuffer::EndOfLineMode m_eol;
370	QString m_mimeType;
371	QIODevice *m_file;
372	QByteArray m_buffer;
373	QCryptographicHash m_digest;
374	QString m_text;
375	QTextCodec::ConverterState *m_converterState;
376	bool m_bomFound;
377	bool m_firstRead;
378	KEncodingProber::ProberType m_proberType;
379	};
380
381	}
382
383	#endif
384