1/*
2 This file is part of the KDE libraries
3
4 Copyright (C) 2008 Wang Hoi (zealot.hoi@gmail.com)
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
15
16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA.
20
21*/
22#ifndef KENCODINGPROBER_H
23#define KENCODINGPROBER_H
24
25// enable debug of private probers
26// #define DEBUG_PROBE
27
28#include <kdecore_export.h>
29#ifdef DEBUG_PROBE
30#include <kdebug.h>
31#endif
32#include <QtCore/QString>
33
34class KEncodingProberPrivate;
35
36/**
37 * @short Provides encoding detection(probe) capabilities.
38 *
39 * Probe the encoding of raw data only.
40 * In the case it can't find it, return the most possible encoding it guessed.
41 *
42 * Always do Unicode probe regardless the ProberType
43 *
44 * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe,
45 * or confidence() returns a value you find acceptable.
46 *
47 * Intended lifetime of the object: one instance per ProberType.
48 *
49 * Typical use:
50 * \code
51 * QByteArray data, moredata;
52 * ...
53 * KEncodingProber prober(KEncodingProber::Chinese);
54 * prober.feed(data);
55 * prober.feed(moredata);
56 * if (prober.confidence() > 0.6)
57 * QString out = QTextCodec::codecForName(prober.encoding())->toUnicode(data);
58 * \endcode
59 *
60 * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
61 * If you don't have so many characters to probe,
62 * decide whether to accept the encoding it guessed so far according to the Confidence by yourself.
63 *
64 * @short Guess encoding of char array
65 *
66 */
67class KDECORE_EXPORT KEncodingProber
68{
69public:
70
71 enum ProberState {
72 FoundIt, /**< Sure find the encoding */
73 NotMe, /**< Sure not included in current ProberType's all supported encodings */
74 Probing /**< Need more data to make a decision */
75 };
76
77 enum ProberType {
78 None,
79 Universal,
80 Arabic,
81 Baltic,
82 CentralEuropean,
83 ChineseSimplified,
84 ChineseTraditional,
85 Cyrillic,
86 Greek,
87 Hebrew,
88 Japanese,
89 Korean,
90 NorthernSaami,
91 Other,
92 SouthEasternEurope,
93 Thai,
94 Turkish,
95 Unicode,
96 WesternEuropean
97 };
98
99 /**
100 * Default ProberType is Universal(detect all possibe encodings)
101 */
102 KEncodingProber(ProberType proberType=Universal);
103
104 ~KEncodingProber();
105
106 /**
107 * reset the prober's internal state and data.
108 */
109 void reset();
110
111 /**
112 * The main class method
113 *
114 * feed data to the prober
115 *
116 * @returns the ProberState after probing the fed data.
117 */
118 ProberState feed(const QByteArray &data);
119 ProberState feed(const char* data, int len);
120
121 /**
122 * @returns the prober's current ProberState
123 *
124 */
125 ProberState state() const;
126
127 /**
128 * @returns the name of the best encoding it has guessed so far
129 * @warning The returned string is allocated with strdup, so some memory is leaked with every call.
130 * @deprecated Use encoding() instead, which returns a QByteArray.
131 */
132#ifndef KDE_NO_DEPRECATED
133 KDE_DEPRECATED const char* encodingName() const;
134#endif
135
136 /**
137 * @returns a QByteArray with the name of the best encoding it has guessed so far
138 * @since 4.2.2
139 */
140 QByteArray encoding() const;
141
142 /**
143 * @returns the confidence(sureness) of encoding it guessed so far (0.0 ~ 0.99), not very reliable for single byte encodings
144 */
145 float confidence() const;
146
147 ProberType proberType() const;
148
149 /**
150 * change current prober's ProberType and reset the prober
151 */
152 void setProberType(ProberType proberType);
153
154 /**
155 * @return the ProberType for lang (eg. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified
156 */
157 static ProberType proberTypeForName(const QString& lang);
158
159 /**
160 * map ProberType to language string
161 */
162 static QString nameForProberType(ProberType proberType);
163
164private:
165 KEncodingProberPrivate* const d;
166};
167
168#endif
169