1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#ifndef QSTRINGCONVERTER_P_H
6#define QSTRINGCONVERTER_P_H
7
8//
9// W A R N I N G
10// -------------
11//
12// This file is not part of the Qt API. It exists purely as an
13// implementation detail. This header file may change from version to
14// version without notice, or even be removed.
15//
16// We mean it.
17//
18
19#include <QtCore/qstring.h>
20#include <QtCore/qendian.h>
21#include <QtCore/qstringconverter.h>
22#include <QtCore/private/qglobal_p.h>
23
24QT_BEGIN_NAMESPACE
25
26#ifndef __cpp_char8_t
27enum qchar8_t : uchar {};
28#else
29using qchar8_t = char8_t;
30#endif
31
32struct QLatin1
33{
34 // Defined in qstring.cpp
35 static char16_t *convertToUnicode(char16_t *dst, QLatin1StringView in) noexcept;
36
37 static QChar *convertToUnicode(QChar *buffer, QLatin1StringView in) noexcept
38 {
39 char16_t *dst = reinterpret_cast<char16_t *>(buffer);
40 dst = convertToUnicode(dst, in);
41 return reinterpret_cast<QChar *>(dst);
42 }
43
44 static QChar *convertToUnicode(QChar *dst, QByteArrayView in,
45 [[maybe_unused]] QStringConverterBase::State *state) noexcept
46 {
47 Q_ASSERT(state);
48
49 return convertToUnicode(buffer: dst, in: QLatin1StringView(in.data(), in.size()));
50 }
51
52 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept;
53
54 // Defined in qstring.cpp
55 static char *convertFromUnicode(char *out, QStringView in) noexcept;
56};
57
58struct QUtf8BaseTraits
59{
60 static const bool isTrusted = false;
61 static const bool allowNonCharacters = true;
62 static const bool skipAsciiHandling = false;
63 static const int Error = -1;
64 static const int EndOfString = -2;
65
66 static void appendByte(uchar *&ptr, uchar b)
67 { *ptr++ = b; }
68
69 static void appendByte(qchar8_t *&ptr, qchar8_t b)
70 { *ptr++ = b; }
71
72 static uchar peekByte(const uchar *ptr, qsizetype n = 0)
73 { return ptr[n]; }
74
75 static uchar peekByte(const qchar8_t *ptr, qsizetype n = 0)
76 { return ptr[n]; }
77
78 static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
79 { return end - ptr; }
80
81 static qptrdiff availableBytes(const qchar8_t *ptr, const qchar8_t *end)
82 { return end - ptr; }
83
84 static void advanceByte(const uchar *&ptr, qsizetype n = 1)
85 { ptr += n; }
86
87 static void advanceByte(const qchar8_t *&ptr, qsizetype n = 1)
88 { ptr += n; }
89
90 static void appendUtf16(char16_t *&ptr, char16_t uc)
91 { *ptr++ = char16_t(uc); }
92
93 static void appendUcs4(char16_t *&ptr, char32_t uc)
94 {
95 appendUtf16(ptr, uc: QChar::highSurrogate(ucs4: uc));
96 appendUtf16(ptr, uc: QChar::lowSurrogate(ucs4: uc));
97 }
98
99 static char16_t peekUtf16(const char16_t *ptr, qsizetype n = 0) { return ptr[n]; }
100
101 static qptrdiff availableUtf16(const char16_t *ptr, const char16_t *end)
102 { return end - ptr; }
103
104 static void advanceUtf16(const char16_t *&ptr, qsizetype n = 1) { ptr += n; }
105
106 static void appendUtf16(char32_t *&ptr, char16_t uc)
107 { *ptr++ = char32_t(uc); }
108
109 static void appendUcs4(char32_t *&ptr, char32_t uc)
110 { *ptr++ = uc; }
111};
112
113struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
114{
115 static const bool skipAsciiHandling = true;
116};
117
118namespace QUtf8Functions
119{
120 /// returns 0 on success; errors can only happen if \a u is a surrogate:
121 /// Error if \a u is a low surrogate;
122 /// if \a u is a high surrogate, Error if the next isn't a low one,
123 /// EndOfString if we run into the end of the string.
124 template <typename Traits, typename OutputPtr, typename InputPtr> inline
125 int toUtf8(char16_t u, OutputPtr &dst, InputPtr &src, InputPtr end)
126 {
127 if (!Traits::skipAsciiHandling && u < 0x80) {
128 // U+0000 to U+007F (US-ASCII) - one byte
129 Traits::appendByte(dst, uchar(u));
130 return 0;
131 } else if (u < 0x0800) {
132 // U+0080 to U+07FF - two bytes
133 // first of two bytes
134 Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
135 } else {
136 if (!QChar::isSurrogate(ucs4: u)) {
137 // U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
138 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: u))
139 return Traits::Error;
140
141 // first of three bytes
142 Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
143 } else {
144 // U+10000 to U+10FFFF - four bytes
145 // need to get one extra codepoint
146 if (Traits::availableUtf16(src, end) == 0)
147 return Traits::EndOfString;
148
149 char16_t low = Traits::peekUtf16(src);
150 if (!QChar::isHighSurrogate(ucs4: u))
151 return Traits::Error;
152 if (!QChar::isLowSurrogate(ucs4: low))
153 return Traits::Error;
154
155 Traits::advanceUtf16(src);
156 char32_t ucs4 = QChar::surrogateToUcs4(high: u, low);
157
158 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
159 return Traits::Error;
160
161 // first byte
162 Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
163
164 // second of four bytes
165 Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
166
167 // for the rest of the bytes
168 u = char16_t(ucs4);
169 }
170
171 // second to last byte
172 Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
173 }
174
175 // last byte
176 Traits::appendByte(dst, 0x80 | (u & 0x3f));
177 return 0;
178 }
179
180 inline bool isContinuationByte(uchar b)
181 {
182 return (b & 0xc0) == 0x80;
183 }
184
185 /// returns the number of characters consumed (including \a b) in case of success;
186 /// returns negative in case of error: Traits::Error or Traits::EndOfString
187 template <typename Traits, typename OutputPtr, typename InputPtr> inline
188 qsizetype fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
189 {
190 qsizetype charsNeeded;
191 char32_t min_uc;
192 char32_t uc;
193
194 if (!Traits::skipAsciiHandling && b < 0x80) {
195 // US-ASCII
196 Traits::appendUtf16(dst, b);
197 return 1;
198 }
199
200 if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
201 // an UTF-8 first character must be at least 0xC0
202 // however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
203 return Traits::Error;
204 } else if (b < 0xe0) {
205 charsNeeded = 2;
206 min_uc = 0x80;
207 uc = b & 0x1f;
208 } else if (b < 0xf0) {
209 charsNeeded = 3;
210 min_uc = 0x800;
211 uc = b & 0x0f;
212 } else if (b < 0xf5) {
213 charsNeeded = 4;
214 min_uc = 0x10000;
215 uc = b & 0x07;
216 } else {
217 // the last Unicode character is U+10FFFF
218 // it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
219 // therefore, a byte higher than 0xF4 is not the UTF-8 first byte
220 return Traits::Error;
221 }
222
223 qptrdiff bytesAvailable = Traits::availableBytes(src, end);
224 if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
225 // it's possible that we have an error instead of just unfinished bytes
226 if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
227 return Traits::Error;
228 if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
229 return Traits::Error;
230 return Traits::EndOfString;
231 }
232
233 // first continuation character
234 b = Traits::peekByte(src, 0);
235 if (!isContinuationByte(b))
236 return Traits::Error;
237 uc <<= 6;
238 uc |= b & 0x3f;
239
240 if (charsNeeded > 2) {
241 // second continuation character
242 b = Traits::peekByte(src, 1);
243 if (!isContinuationByte(b))
244 return Traits::Error;
245 uc <<= 6;
246 uc |= b & 0x3f;
247
248 if (charsNeeded > 3) {
249 // third continuation character
250 b = Traits::peekByte(src, 2);
251 if (!isContinuationByte(b))
252 return Traits::Error;
253 uc <<= 6;
254 uc |= b & 0x3f;
255 }
256 }
257
258 // we've decoded something; safety-check it
259 if (!Traits::isTrusted) {
260 if (uc < min_uc)
261 return Traits::Error;
262 if (QChar::isSurrogate(ucs4: uc) || uc > QChar::LastValidCodePoint)
263 return Traits::Error;
264 if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4: uc))
265 return Traits::Error;
266 }
267
268 // write the UTF-16 sequence
269 if (!QChar::requiresSurrogates(ucs4: uc)) {
270 // UTF-8 decoded and no surrogates are required
271 // detach if necessary
272 Traits::appendUtf16(dst, char16_t(uc));
273 } else {
274 // UTF-8 decoded to something that requires a surrogate pair
275 Traits::appendUcs4(dst, uc);
276 }
277
278 Traits::advanceByte(src, charsNeeded - 1);
279 return charsNeeded;
280 }
281}
282
// Byte-order selector accepted by the QUtf16 and QUtf32 conversion functions.
enum DataEndianness
{
    DetectEndianness = 0,
    BigEndianness = 1,
    LittleEndianness = 2
};
289
290struct QUtf8
291{
292 static QChar *convertToUnicode(QChar *buffer, QByteArrayView in) noexcept
293 {
294 char16_t *dst = reinterpret_cast<char16_t *>(buffer);
295 dst = QUtf8::convertToUnicode(dst, in);
296 return reinterpret_cast<QChar *>(dst);
297 }
298
299 Q_CORE_EXPORT static char16_t* convertToUnicode(char16_t *dst, QByteArrayView in) noexcept;
300 static QString convertToUnicode(QByteArrayView in);
301 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state);
302
303 static QChar *convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state)
304 {
305 char16_t *buffer = reinterpret_cast<char16_t *>(out);
306 buffer = convertToUnicode(dst: buffer, in, state);
307 return reinterpret_cast<QChar *>(buffer);
308 }
309
310 static char16_t *convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state);
311
312 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in);
313 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView in, QStringConverterBase::State *state);
314 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state);
315 Q_CORE_EXPORT static char *convertFromLatin1(char *out, QLatin1StringView in);
316 struct ValidUtf8Result {
317 bool isValidUtf8;
318 bool isValidAscii;
319 };
320 static ValidUtf8Result isValidUtf8(QByteArrayView in);
321 static int compareUtf8(QByteArrayView utf8, QStringView utf16,
322 Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
323 static int compareUtf8(QByteArrayView utf8, QLatin1StringView s,
324 Qt::CaseSensitivity cs = Qt::CaseSensitive);
325 static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
326 Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
327};
328
329struct QUtf16
330{
331 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
332 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
333 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
334 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
335};
336
337struct QUtf32
338{
339 static QChar *convertToUnicode(QChar *out, QByteArrayView, QStringConverter::State *state, DataEndianness endian);
340 Q_CORE_EXPORT static QString convertToUnicode(QByteArrayView, QStringConverter::State *, DataEndianness = DetectEndianness);
341 Q_CORE_EXPORT static QByteArray convertFromUnicode(QStringView, QStringConverter::State *, DataEndianness = DetectEndianness);
342 static char *convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian);
343};
344
345struct Q_CORE_EXPORT QLocal8Bit
346{
347#if !defined(Q_OS_WIN) || defined(QT_BOOTSTRAPPED)
348 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
349 { return QUtf8::convertToUnicode(in, state); }
350 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
351 { return QUtf8::convertFromUnicode(in, state); }
352#else
353 static int checkUtf8();
354 static bool isUtf8()
355 {
356 Q_CONSTINIT
357 static QBasicAtomicInteger<qint8> result = { 0 };
358 int r = result.loadRelaxed();
359 if (r == 0) {
360 r = checkUtf8();
361 result.storeRelaxed(r);
362 }
363 return r > 0;
364 }
365 static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *);
366 static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state)
367 {
368 if (isUtf8())
369 return QUtf8::convertToUnicode(in, state);
370 return convertToUnicode_sys(in, state);
371 }
372 static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *);
373 static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state)
374 {
375 if (isUtf8())
376 return QUtf8::convertFromUnicode(in, state);
377 return convertFromUnicode_sys(in, state);
378 }
379#endif
380};
381
382QT_END_NAMESPACE
383
384#endif // QSTRINGCONVERTER_P_H
385

// Source: qtbase/src/corelib/text/qstringconverter_p.h