1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) |
5 | Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) |
6 | |
7 | This library is free software; you can redistribute it and/or |
8 | modify it under the terms of the GNU Library General Public |
9 | License as published by the Free Software Foundation; either |
10 | version 2 of the License, or (at your option) any later version. |
11 | |
12 | This library is distributed in the hope that it will be useful, |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | Library General Public License for more details. |
16 | |
17 | You should have received a copy of the GNU Library General Public License |
18 | along with this library; see the file COPYING.LIB. If not, write to |
19 | the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
20 | Boston, MA 02110-1301, USA. |
21 | |
22 | */ |
23 | #ifndef KENCODINGDETECTOR_H |
24 | #define KENCODINGDETECTOR_H |
25 | |
26 | #include <kdecore_export.h> |
27 | #include <QtCore/QString> |
28 | |
29 | class QTextCodec; |
30 | class QTextDecoder; |
31 | class KEncodingDetectorPrivate; |
32 | |
33 | /** |
34 | * @short Provides encoding detection capabilities. |
35 | * |
36 | * Searches for encoding declaration inside raw data -- meta and xml tags. |
37 | * In the case it can't find it, uses heuristics for specified language. |
38 | * |
39 | * If it finds unicode BOM marks, it changes encoding regardless of what the user has told |
40 | * |
41 | * Intended lifetime of the object: one instance per document. |
42 | * |
43 | * Typical use: |
44 | * \code |
45 | * QByteArray data; |
46 | * ... |
47 | * KEncodingDetector detector; |
48 | * detector.setAutoDetectLanguage(KEncodingDetector::Cyrillic); |
49 | * QString out=detector.decode(data); |
50 | * \endcode |
51 | * |
52 | * |
53 | * Do not mix decode() with decodeWithBuffering() |
54 | * |
55 | * @short Guess encoding of char array |
56 | * |
57 | */ |
58 | class KDECORE_EXPORT KEncodingDetector |
59 | { |
60 | public: |
61 | enum EncodingChoiceSource |
62 | { |
63 | DefaultEncoding, |
64 | AutoDetectedEncoding, |
65 | BOM, |
66 | , |
67 | EncodingFromMetaTag, |
68 | , |
69 | UserChosenEncoding |
70 | }; |
71 | |
72 | enum AutoDetectScript |
73 | { |
74 | None, |
75 | SemiautomaticDetection, |
76 | Arabic, |
77 | Baltic, |
78 | CentralEuropean, |
79 | ChineseSimplified, |
80 | ChineseTraditional, |
81 | Cyrillic, |
82 | Greek, |
83 | Hebrew, |
84 | Japanese, |
85 | Korean, |
86 | NorthernSaami, |
87 | SouthEasternEurope, |
88 | Thai, |
89 | Turkish, |
90 | Unicode, |
91 | WesternEuropean |
92 | }; |
93 | |
94 | /** |
95 | * Default codec is latin1 (as html spec says), EncodingChoiceSource is default, AutoDetectScript=Semiautomatic |
96 | */ |
97 | KEncodingDetector(); |
98 | |
99 | /** |
100 | * Allows to set Default codec, EncodingChoiceSource, AutoDetectScript |
101 | */ |
102 | KEncodingDetector(QTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script=None); |
103 | ~KEncodingDetector(); |
104 | |
105 | //const QTextCodec* codec() const; |
106 | |
107 | /** |
108 | * @returns true if specified encoding was recognized |
109 | */ |
110 | bool setEncoding(const char *encoding, EncodingChoiceSource type); |
111 | |
112 | /** |
113 | * Convenience method. |
114 | * @returns mime name of detected encoding |
115 | */ |
116 | const char* encoding() const; |
117 | |
118 | bool visuallyOrdered() const; |
119 | |
120 | // void setAutoDetectLanguage( const QString& ); |
121 | // const QString& autoDetectLanguage() const; |
122 | |
123 | void setAutoDetectLanguage( AutoDetectScript ); |
124 | AutoDetectScript autoDetectLanguage() const; |
125 | |
126 | EncodingChoiceSource encodingChoiceSource() const; |
127 | |
128 | /** |
129 | * The main class method |
130 | * |
131 | * Calls protected analyze() only the first time of the whole object life |
132 | * |
133 | * Replaces all null chars with spaces. |
134 | */ |
135 | QString decode(const char *data, int len); |
136 | QString decode(const QByteArray &data); |
137 | |
138 | //* You don't need to call analyze() if you use this method. |
139 | /** |
140 | * Convenience method that uses buffering. It waits for full html head to be buffered |
141 | * (i.e. calls analyze every time until it returns true). |
142 | * |
143 | * Replaces all null chars with spaces. |
144 | * |
145 | * @returns Decoded data, or empty string, if there was not enough data for accurate detection |
146 | * @see flush() |
147 | */ |
148 | QString decodeWithBuffering(const char *data, int len); |
149 | |
150 | /** |
151 | * This method checks whether invalid characters were found |
152 | * during a decoding operation. |
153 | * |
154 | * Note that this bit is never reset once invalid characters have been found. |
155 | * To force a reset, either change the encoding using setEncoding() or call |
156 | * resetDecoder() |
157 | * |
158 | * @returns a boolean reflecting said state. |
159 | * @since 4.3 |
160 | * @see resetDecoder() setEncoding() |
161 | */ |
162 | bool decodedInvalidCharacters() const; |
163 | |
164 | /** |
165 | * Resets the decoder. Any stateful decoding information (such as resulting from previous calls |
166 | * to decodeWithBuffering()) will be lost. |
167 | * Will Reset the state of decodedInvalidCharacters() as a side effect. |
168 | * |
169 | * @since 4.3 |
170 | * @see decodeWithBuffering() decodedInvalidCharacters() |
171 | * |
172 | */ |
173 | void resetDecoder(); |
174 | |
175 | /** |
176 | * Convenience method to be used with decodeForHtml. Flushes buffer. |
177 | * @see decodeForHtml() |
178 | */ |
179 | QString flush(); |
180 | |
181 | /** |
182 | * Takes lang name _after_ it were i18n()'ed |
183 | */ |
184 | static AutoDetectScript scriptForName(const QString& lang); |
185 | static QString nameForScript(AutoDetectScript); |
186 | static bool hasAutoDetectionForScript(AutoDetectScript); |
187 | |
188 | protected: |
189 | /** |
190 | * This nice method will kill all 0 bytes (or double bytes) |
191 | * and remember if this was a binary or not ;) |
192 | */ |
193 | bool processNull(char* data,int length); |
194 | |
195 | /** |
196 | * Check if we are really utf8. Taken from kate |
197 | * |
198 | * @returns true if current encoding is utf8 and the text cannot be in this encoding |
199 | * |
200 | * Please somebody read http://de.wikipedia.org/wiki/UTF-8 and check this code... |
201 | */ |
202 | bool errorsIfUtf8 (const char* data, int length); |
203 | |
204 | /** |
205 | * Analyze text data. |
206 | * @returns true if there was enough data for accurate detection |
207 | */ |
208 | bool analyze (const char *data, int len); |
209 | |
210 | /** |
211 | * @returns QTextDecoder for detected encoding |
212 | */ |
213 | QTextDecoder* decoder(); |
214 | |
215 | private: |
216 | KEncodingDetectorPrivate* const d; |
217 | }; |
218 | |
219 | #endif |
220 | |