1/*
2 This file is part of the KDE libraries
3
4 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
5 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA.
21
22*/
23#ifndef KENCODINGDETECTOR_H
24#define KENCODINGDETECTOR_H
25
26#include <kdecore_export.h>
27#include <QtCore/QString>
28
29class QTextCodec;
30class QTextDecoder;
31class KEncodingDetectorPrivate;
32
33/**
34 * @short Provides encoding detection capabilities.
35 *
36 * Searches for an encoding declaration inside the raw data -- meta and xml tags.
37 * If it cannot find one, it uses heuristics for the specified language.
38 *
39 * If it finds Unicode BOM marks, it changes the encoding regardless of what the user has specified.
40 *
41 * Intended lifetime of the object: one instance per document.
42 *
43 * Typical use:
44 * \code
45 * QByteArray data;
46 * ...
47 * KEncodingDetector detector;
48 * detector.setAutoDetectLanguage(KEncodingDetector::Cyrillic);
49 * QString out=detector.decode(data);
50 * \endcode
51 *
52 *
53 * Do not mix decode() with decodeWithBuffering().
54 *
55 * In short: guesses the encoding of a char array.
56 *
57 */
58class KDECORE_EXPORT KEncodingDetector
59{
60public:
61 enum EncodingChoiceSource // Identifies the source of an encoding choice
62 {
63 DefaultEncoding,
64 AutoDetectedEncoding,
65 BOM, ///< Unicode byte order mark found in the data
66 EncodingFromXMLHeader, ///< Encoding declaration found in an xml tag
67 EncodingFromMetaTag, ///< Encoding declaration found in a meta tag
68 EncodingFromHTTPHeader,
69 UserChosenEncoding
70 };
71
72 enum AutoDetectScript // Script/language whose heuristics are used when no declaration is found
73 {
74 None,
75 SemiautomaticDetection,
76 Arabic,
77 Baltic,
78 CentralEuropean,
79 ChineseSimplified,
80 ChineseTraditional,
81 Cyrillic,
82 Greek,
83 Hebrew,
84 Japanese,
85 Korean,
86 NorthernSaami,
87 SouthEasternEurope,
88 Thai,
89 Turkish,
90 Unicode,
91 WesternEuropean
92 };
93
94 /**
95 * Default codec is latin1 (as the HTML spec says), EncodingChoiceSource is DefaultEncoding, AutoDetectScript is SemiautomaticDetection
96 */
97 KEncodingDetector();
98
99 /**
100 * Allows setting the default codec, the EncodingChoiceSource and the AutoDetectScript
101 */
102 KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script=None);
103 ~KEncodingDetector();
104
105 //const QTextCodec* codec() const;
106
107 /**
108 * @returns true if the specified encoding was recognized
109 */
110 bool setEncoding(const char *encoding, EncodingChoiceSource type);
111
112 /**
113 * Convenience method.
114 * @returns mime name of detected encoding
115 */
116 const char* encoding() const;
117
118 bool visuallyOrdered() const; // NOTE(review): undocumented -- presumably whether the text is visually ordered; confirm against the implementation
119
120// void setAutoDetectLanguage( const QString& );
121// const QString& autoDetectLanguage() const;
122
123 void setAutoDetectLanguage( AutoDetectScript ); ///< Selects the script used by the detection heuristics
124 AutoDetectScript autoDetectLanguage() const; ///< @returns the script used by the detection heuristics
125
126 EncodingChoiceSource encodingChoiceSource() const; ///< @returns the source of the current encoding choice
127
128 /**
129 * The main class method
130 *
131 * Calls protected analyze() only the first time during the whole lifetime of the object
132 *
133 * Replaces all null chars with spaces.
134 */
135 QString decode(const char *data, int len);
136 QString decode(const QByteArray &data); ///< Overload of decode() taking the data as a QByteArray
137
138 // Note: you don't need to call analyze() yourself if you use this method.
139 /**
140 * Convenience method that uses buffering. It waits for full html head to be buffered
141 * (i.e. calls analyze() every time until it returns true).
142 *
143 * Replaces all null chars with spaces.
144 *
145 * @returns Decoded data, or empty string, if there was not enough data for accurate detection
146 * @see flush()
147 */
148 QString decodeWithBuffering(const char *data, int len);
149
150 /**
151 * This method checks whether invalid characters were found
152 * during a decoding operation.
153 *
154 * Note that this bit is never reset once invalid characters have been found.
155 * To force a reset, either change the encoding using setEncoding() or call
156 * resetDecoder()
157 *
158 * @returns a boolean reflecting said state.
159 * @since 4.3
160 * @see resetDecoder() setEncoding()
161 */
162 bool decodedInvalidCharacters() const;
163
164 /**
165 * Resets the decoder. Any stateful decoding information (such as resulting from previous calls
166 * to decodeWithBuffering()) will be lost.
167 * Will reset the state of decodedInvalidCharacters() as a side effect.
168 *
169 * @since 4.3
170 * @see decodeWithBuffering() decodedInvalidCharacters()
171 *
172 */
173 void resetDecoder();
174
175 /**
176 * Convenience method to be used with decodeWithBuffering(). Flushes buffer.
177 * @see decodeWithBuffering()
178 */
179 QString flush();
180
181 /**
182 * Takes the lang name _after_ it was i18n()'ed
183 */
184 static AutoDetectScript scriptForName(const QString& lang);
185 static QString nameForScript(AutoDetectScript); // NOTE(review): presumably the inverse of scriptForName() -- confirm
186 static bool hasAutoDetectionForScript(AutoDetectScript);
187
188protected:
189 /**
190 * This nice method will kill all 0 bytes (or double bytes)
191 * and remember if this was a binary or not ;)
192 */
193 bool processNull(char* data,int length);
194
195 /**
196 * Check if we are really utf8. Taken from kate
197 *
198 * @returns true if the current encoding is utf8 and the text cannot be in this encoding
199 *
200 * Please somebody read http://de.wikipedia.org/wiki/UTF-8 and check this code...
201 */
202 bool errorsIfUtf8 (const char* data, int length);
203
204 /**
205 * Analyze text data.
206 * @returns true if there was enough data for accurate detection
207 */
208 bool analyze (const char *data, int len);
209
210 /**
211 * @returns QTextDecoder for detected encoding
212 */
213 QTextDecoder* decoder();
214
215private:
216 KEncodingDetectorPrivate* const d; ///< Pointer to the private implementation class (only forward-declared in this header)
217};
218
219#endif
220