1/****************************************************************************
2**
3** Copyright (C) 2018 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#ifndef QUTFCODEC_P_H
42#define QUTFCODEC_P_H
43
44//
45// W A R N I N G
46// -------------
47//
48// This file is not part of the Qt API. It exists purely as an
49// implementation detail. This header file may change from version to
50// version without notice, or even be removed.
51//
52// We mean it.
53//
54
55#include <QtCore/qstring.h>
56#include <QtCore/qlist.h>
57
58#if QT_CONFIG(textcodec)
59#include "QtCore/qtextcodec.h"
60#endif
61
62#include "private/qtextcodec_p.h"
63
64QT_BEGIN_NAMESPACE
65
66struct QUtf8BaseTraits
67{
68 static const bool isTrusted = false;
69 static const bool allowNonCharacters = true;
70 static const bool skipAsciiHandling = false;
71 static const int Error = -1;
72 static const int EndOfString = -2;
73
74 static bool isValidCharacter(uint u)
75 { return int(u) >= 0; }
76
77 static void appendByte(uchar *&ptr, uchar b)
78 { *ptr++ = b; }
79
80 static uchar peekByte(const uchar *ptr, int n = 0)
81 { return ptr[n]; }
82
83 static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
84 { return end - ptr; }
85
86 static void advanceByte(const uchar *&ptr, int n = 1)
87 { ptr += n; }
88
89 static void appendUtf16(ushort *&ptr, ushort uc)
90 { *ptr++ = uc; }
91
92 static void appendUcs4(ushort *&ptr, uint uc)
93 {
94 appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc));
95 appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc));
96 }
97
98 static ushort peekUtf16(const ushort *ptr, int n = 0)
99 { return ptr[n]; }
100
101 static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
102 { return end - ptr; }
103
104 static void advanceUtf16(const ushort *&ptr, int n = 1)
105 { ptr += n; }
106
107 // it's possible to output to UCS-4 too
108 static void appendUtf16(uint *&ptr, ushort uc)
109 { *ptr++ = uc; }
110
111 static void appendUcs4(uint *&ptr, uint uc)
112 { *ptr++ = uc; }
113};
114
115struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
116{
117 static const bool skipAsciiHandling = true;
118};
119
120namespace QUtf8Functions
121{
122 /// returns 0 on success; errors can only happen if \a u is a surrogate:
123 /// Error if \a u is a low surrogate;
124 /// if \a u is a high surrogate, Error if the next isn't a low one,
125 /// EndOfString if we run into the end of the string.
126 template <typename Traits, typename OutputPtr, typename InputPtr> inline
127 int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
128 {
129 if (!Traits::skipAsciiHandling && u < 0x80) {
130 // U+0000 to U+007F (US-ASCII) - one byte
131 Traits::appendByte(dst, uchar(u));
132 return 0;
133 } else if (u < 0x0800) {
134 // U+0080 to U+07FF - two bytes
135 // first of two bytes
136 Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
137 } else {
138 if (!QChar::isSurrogate(ucs4: u)) {
139 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
140 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u))
141 return Traits::Error;
142
143 // first of three bytes
144 Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
145 } else {
146 // U+10000 to U+10FFFF - four bytes
147 // need to get one extra codepoint
148 if (Traits::availableUtf16(src, end) == 0)
149 return Traits::EndOfString;
150
151 ushort low = Traits::peekUtf16(src);
152 if (!QChar::isHighSurrogate(ucs4: u))
153 return Traits::Error;
154 if (!QChar::isLowSurrogate(ucs4: low))
155 return Traits::Error;
156
157 Traits::advanceUtf16(src);
158 uint ucs4 = QChar::surrogateToUcs4(high: u, low);
159
160 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
161 return Traits::Error;
162
163 // first byte
164 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
165
166 // second of four bytes
167 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
168
169 // for the rest of the bytes
170 u = ushort(ucs4);
171 }
172
173 // second to last byte
174 Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
175 }
176
177 // last byte
178 Traits::appendByte(dst, 0x80 | (u & 0x3f));
179 return 0;
180 }
181
182 inline bool isContinuationByte(uchar b)
183 {
184 return (b & 0xc0) == 0x80;
185 }
186
187 /// returns the number of characters consumed (including \a b) in case of success;
188 /// returns negative in case of error: Traits::Error or Traits::EndOfString
189 template <typename Traits, typename OutputPtr, typename InputPtr> inline
190 int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
191 {
192 int charsNeeded;
193 uint min_uc;
194 uint uc;
195
196 if (!Traits::skipAsciiHandling && b < 0x80) {
197 // US-ASCII
198 Traits::appendUtf16(dst, b);
199 return 1;
200 }
201
202 if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
203 // an UTF-8 first character must be at least 0xC0
204 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
205 return Traits::Error;
206 } else if (b < 0xe0) {
207 charsNeeded = 2;
208 min_uc = 0x80;
209 uc = b & 0x1f;
210 } else if (b < 0xf0) {
211 charsNeeded = 3;
212 min_uc = 0x800;
213 uc = b & 0x0f;
214 } else if (b < 0xf5) {
215 charsNeeded = 4;
216 min_uc = 0x10000;
217 uc = b & 0x07;
218 } else {
219 // the last Unicode character is U+10FFFF
220 // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
221 // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
222 return Traits::Error;
223 }
224
225 int bytesAvailable = Traits::availableBytes(src, end);
226 if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
227 // it's possible that we have an error instead of just unfinished bytes
228 if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
229 return Traits::Error;
230 if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
231 return Traits::Error;
232 return Traits::EndOfString;
233 }
234
235 // first continuation character
236 b = Traits::peekByte(src, 0);
237 if (!isContinuationByte(b))
238 return Traits::Error;
239 uc <<= 6;
240 uc |= b & 0x3f;
241
242 if (charsNeeded > 2) {
243 // second continuation character
244 b = Traits::peekByte(src, 1);
245 if (!isContinuationByte(b))
246 return Traits::Error;
247 uc <<= 6;
248 uc |= b & 0x3f;
249
250 if (charsNeeded > 3) {
251 // third continuation character
252 b = Traits::peekByte(src, 2);
253 if (!isContinuationByte(b))
254 return Traits::Error;
255 uc <<= 6;
256 uc |= b & 0x3f;
257 }
258 }
259
260 // we've decoded something; safety-check it
261 if (!Traits::isTrusted) {
262 if (uc < min_uc)
263 return Traits::Error;
264 if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint)
265 return Traits::Error;
266 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc))
267 return Traits::Error;
268 }
269
270 // write the UTF-16 sequence
271 if (!QChar::requiresSurrogates(ucs4: uc)) {
272 // UTF-8 decoded and no surrogates are required
273 // detach if necessary
274 Traits::appendUtf16(dst, ushort(uc));
275 } else {
276 // UTF-8 decoded to something that requires a surrogate pair
277 Traits::appendUcs4(dst, uc);
278 }
279
280 Traits::advanceByte(src, charsNeeded - 1);
281 return charsNeeded;
282 }
283}
284
// Byte-order selector passed to the QUtf16/QUtf32 conversion functions and
// stored by the UTF-16/UTF-32 codec classes. DetectEndianness is the default
// argument of those functions.
enum DataEndianness
{
    DetectEndianness = 0,   // no byte order imposed by the caller
    BigEndianness = 1,
    LittleEndianness = 2
};
291
292struct QUtf8
293{
294 static QChar *convertToUnicode(QChar *, const char *, int) noexcept;
295 static QString convertToUnicode(const char *, int);
296 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
297 static QByteArray convertFromUnicode(const QChar *, int);
298 static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
299 struct ValidUtf8Result {
300 bool isValidUtf8;
301 bool isValidAscii;
302 };
303 static ValidUtf8Result isValidUtf8(const char *, qsizetype);
304 static int compareUtf8(const char *, qsizetype, const QChar *, int);
305 static int compareUtf8(const char *, qsizetype, QLatin1String s);
306};
307
308struct QUtf16
309{
310 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
311 static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
312};
313
314struct QUtf32
315{
316 static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
317 static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
318};
319
320#if QT_CONFIG(textcodec)
321
322class QUtf8Codec : public QTextCodec {
323public:
324 ~QUtf8Codec();
325
326 QByteArray name() const override;
327 int mibEnum() const override;
328
329 QString convertToUnicode(const char *, int, ConverterState *) const override;
330 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override;
331 void convertToUnicode(QString *target, const char *, int, ConverterState *) const;
332};
333
334class QUtf16Codec : public QTextCodec {
335protected:
336public:
337 QUtf16Codec() { e = DetectEndianness; }
338 ~QUtf16Codec();
339
340 QByteArray name() const override;
341 QList<QByteArray> aliases() const override;
342 int mibEnum() const override;
343
344 QString convertToUnicode(const char *, int, ConverterState *) const override;
345 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override;
346
347protected:
348 DataEndianness e;
349};
350
351class QUtf16BECodec : public QUtf16Codec {
352public:
353 QUtf16BECodec() : QUtf16Codec() { e = BigEndianness; }
354 QByteArray name() const override;
355 QList<QByteArray> aliases() const override;
356 int mibEnum() const override;
357};
358
359class QUtf16LECodec : public QUtf16Codec {
360public:
361 QUtf16LECodec() : QUtf16Codec() { e = LittleEndianness; }
362 QByteArray name() const override;
363 QList<QByteArray> aliases() const override;
364 int mibEnum() const override;
365};
366
367class QUtf32Codec : public QTextCodec {
368public:
369 QUtf32Codec() { e = DetectEndianness; }
370 ~QUtf32Codec();
371
372 QByteArray name() const override;
373 QList<QByteArray> aliases() const override;
374 int mibEnum() const override;
375
376 QString convertToUnicode(const char *, int, ConverterState *) const override;
377 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const override;
378
379protected:
380 DataEndianness e;
381};
382
383class QUtf32BECodec : public QUtf32Codec {
384public:
385 QUtf32BECodec() : QUtf32Codec() { e = BigEndianness; }
386 QByteArray name() const override;
387 QList<QByteArray> aliases() const override;
388 int mibEnum() const override;
389};
390
391class QUtf32LECodec : public QUtf32Codec {
392public:
393 QUtf32LECodec() : QUtf32Codec() { e = LittleEndianness; }
394 QByteArray name() const override;
395 QList<QByteArray> aliases() const override;
396 int mibEnum() const override;
397};
398
399
400#endif // textcodec
401
402QT_END_NAMESPACE
403
404#endif // QUTFCODEC_P_H
405

// Source code of qtbase/src/corelib/codecs/qutfcodec_p.h