1 | /* |
2 | This file is part of the KDE libraries |
3 | |
4 | Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com) |
5 | |
6 | This library is free software; you can redistribute it and/or |
7 | modify it under the terms of the GNU Library General Public |
8 | License as published by the Free Software Foundation; either |
9 | version 2 of the License, or (at your option) any later version. |
10 | |
11 | This library is distributed in the hope that it will be useful, |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | Library General Public License for more details. |
15 | |
16 | You should have received a copy of the GNU Library General Public License |
17 | along with this library; see the file COPYING.LIB. If not, write to |
18 | the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
19 | Boston, MA 02110-1301, USA. |
20 | |
21 | */ |
22 | #ifndef KENCODINGPROBER_H |
23 | #define KENCODINGPROBER_H |
24 | |
25 | // enable debug of private probers |
26 | // #define DEBUG_PROBE |
27 | |
28 | #include <kdecore_export.h> |
29 | #ifdef DEBUG_PROBE |
30 | #include <kdebug.h> |
31 | #endif |
32 | #include <QtCore/QString> |
33 | |
34 | class KEncodingProberPrivate; |
35 | |
36 | /** |
37 | * @short Provides encoding detection(probe) capabilities. |
38 | * |
39 | * Probe the encoding of raw data only. |
40 | * In the case it can't find it, return the most possible encoding it guessed. |
41 | * |
42 | * Always do Unicode probe regardless the ProberType |
43 | * |
44 | * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe, |
45 | * or confidence() returns a value you find acceptable. |
46 | * |
47 | * Intended lifetime of the object: one instance per ProberType. |
48 | * |
49 | * Typical use: |
50 | * \code |
51 | * QByteArray data, moredata; |
52 | * ... |
53 | * KEncodingProber prober(KEncodingProber::Chinese); |
54 | * prober.feed(data); |
55 | * prober.feed(moredata); |
56 | * if (prober.confidence() > 0.6) |
57 | * QString out = QTextCodec::codecForName(prober.encoding())->toUnicode(data); |
58 | * \endcode |
59 | * |
60 | * At least 256 characters are needed to change the ProberState from Probing to FoundIt. |
61 | * If you don't have so many characters to probe, |
62 | * decide whether to accept the encoding it guessed so far according to the Confidence by yourself. |
63 | * |
64 | * @short Guess encoding of char array |
65 | * |
66 | */ |
67 | class KDECORE_EXPORT KEncodingProber |
68 | { |
69 | public: |
70 | |
71 | enum ProberState { |
72 | FoundIt, /**< Sure find the encoding */ |
73 | NotMe, /**< Sure not included in current ProberType's all supported encodings */ |
74 | Probing /**< Need more data to make a decision */ |
75 | }; |
76 | |
77 | enum ProberType { |
78 | None, |
79 | Universal, |
80 | Arabic, |
81 | Baltic, |
82 | CentralEuropean, |
83 | ChineseSimplified, |
84 | ChineseTraditional, |
85 | Cyrillic, |
86 | Greek, |
87 | Hebrew, |
88 | Japanese, |
89 | Korean, |
90 | NorthernSaami, |
91 | Other, |
92 | SouthEasternEurope, |
93 | Thai, |
94 | Turkish, |
95 | Unicode, |
96 | WesternEuropean |
97 | }; |
98 | |
99 | /** |
100 | * Default ProberType is Universal(detect all possibe encodings) |
101 | */ |
102 | KEncodingProber(ProberType proberType=Universal); |
103 | |
104 | ~KEncodingProber(); |
105 | |
106 | /** |
107 | * reset the prober's internal state and data. |
108 | */ |
109 | void reset(); |
110 | |
111 | /** |
112 | * The main class method |
113 | * |
114 | * feed data to the prober |
115 | * |
116 | * @returns the ProberState after probing the fed data. |
117 | */ |
118 | ProberState feed(const QByteArray &data); |
119 | ProberState feed(const char* data, int len); |
120 | |
121 | /** |
122 | * @returns the prober's current ProberState |
123 | * |
124 | */ |
125 | ProberState state() const; |
126 | |
127 | /** |
128 | * @returns the name of the best encoding it has guessed so far |
129 | * @warning The returned string is allocated with strdup, so some memory is leaked with every call. |
130 | * @deprecated Use encoding() instead, which returns a QByteArray. |
131 | */ |
132 | #ifndef KDE_NO_DEPRECATED |
133 | KDE_DEPRECATED const char* encodingName() const; |
134 | #endif |
135 | |
136 | /** |
137 | * @returns a QByteArray with the name of the best encoding it has guessed so far |
138 | * @since 4.2.2 |
139 | */ |
140 | QByteArray encoding() const; |
141 | |
142 | /** |
143 | * @returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings |
144 | */ |
145 | float confidence() const; |
146 | |
147 | ProberType proberType() const; |
148 | |
149 | /** |
150 | * change current prober's ProberType and reset the prober |
151 | */ |
152 | void setProberType(ProberType proberType); |
153 | |
154 | /** |
155 | * @return the ProberType for lang (eg. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified |
156 | */ |
157 | static ProberType proberTypeForName(const QString& lang); |
158 | |
159 | /** |
160 | * map ProberType to language string |
161 | */ |
162 | static QString nameForProberType(ProberType proberType); |
163 | |
164 | private: |
165 | KEncodingProberPrivate* const d; |
166 | }; |
167 | |
168 | #endif |
169 | |