1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#include "qutfcodec_p.h"
42#include "qlist.h"
43#include "qendian.h"
44#include "qchar.h"
45
46#include "private/qsimd_p.h"
47#include "private/qstringiterator_p.h"
48
49QT_BEGIN_NAMESPACE
50
51enum { Endian = 0, Data = 1 };
52
53static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
54
#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \
    || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64))
// Returns the index of the most significant set bit of \a v, where bit 0 is
// the least significant bit (the convention of _bit_scan_reverse).
static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
{
    // qCountLeadingZeroBits() counts how many zero bits precede the highest
    // set bit. XOR-ing that count with (bit width - 1) flips it into the
    // index of that bit counted from the least significant end.
    const uint highestBitIndex = sizeof(unsigned) * 8 - 1;
    return qCountLeadingZeroBits(v) ^ highestBitIndex;
}
#endif
67
68#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
69static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
70{
71 // do sixteen characters at a time
72 for ( ; end - src >= 16; src += 16, dst += 16) {
73# ifdef __AVX2__
74 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
75 __m128i data1 = _mm256_castsi256_si128(data);
76 __m128i data2 = _mm256_extracti128_si256(data, 1);
77# else
78 __m128i data1 = _mm_loadu_si128(p: (const __m128i*)src);
79 __m128i data2 = _mm_loadu_si128(p: 1+(const __m128i*)src);
80# endif
81
82 // check if everything is ASCII
83 // the highest ASCII value is U+007F
84 // Do the packing directly:
85 // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
86 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
87 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
88 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
89 // "non-ASCII", but it's an acceptable compromise.
90 __m128i packed = _mm_packus_epi16(a: data1, b: data2);
91 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
92
93 // store, even if there are non-ASCII characters here
94 _mm_storeu_si128(p: (__m128i*)dst, b: packed);
95
96 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
97 ushort n = ~_mm_movemask_epi8(a: nonAscii);
98 if (n) {
99 // find the next probable ASCII character
100 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
101 // characters still coming
102 nextAscii = src + qBitScanReverse(v: n) + 1;
103
104 n = qCountTrailingZeroBits(v: n);
105 dst += n;
106 src += n;
107 return false;
108 }
109 }
110
111 if (end - src >= 8) {
112 // do eight characters at a time
113 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i *>(src));
114 __m128i packed = _mm_packus_epi16(a: data, b: data);
115 __m128i nonAscii = _mm_cmpgt_epi8(a: packed, b: _mm_setzero_si128());
116
117 // store even non-ASCII
118 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(dst), a: packed);
119
120 uchar n = ~_mm_movemask_epi8(a: nonAscii);
121 if (n) {
122 nextAscii = src + qBitScanReverse(v: n) + 1;
123 n = qCountTrailingZeroBits(v: n);
124 dst += n;
125 src += n;
126 return false;
127 }
128 }
129
130 return src == end;
131}
132
133static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
134{
135 // do sixteen characters at a time
136 for ( ; end - src >= 16; src += 16, dst += 16) {
137 __m128i data = _mm_loadu_si128(p: (const __m128i*)src);
138
139#ifdef __AVX2__
140 const int BitSpacing = 2;
141 // load and zero extend to an YMM register
142 const __m256i extended = _mm256_cvtepu8_epi16(data);
143
144 uint n = _mm256_movemask_epi8(extended);
145 if (!n) {
146 // store
147 _mm256_storeu_si256((__m256i*)dst, extended);
148 continue;
149 }
150#else
151 const int BitSpacing = 1;
152
153 // check if everything is ASCII
154 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
155 uint n = _mm_movemask_epi8(a: data);
156 if (!n) {
157 // unpack
158 _mm_storeu_si128(p: (__m128i*)dst, b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
159 _mm_storeu_si128(p: 1+(__m128i*)dst, b: _mm_unpackhi_epi8(a: data, b: _mm_setzero_si128()));
160 continue;
161 }
162#endif
163
164 // copy the front part that is still ASCII
165 while (!(n & 1)) {
166 *dst++ = *src++;
167 n >>= BitSpacing;
168 }
169
170 // find the next probable ASCII character
171 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
172 // characters still coming
173 n = qBitScanReverse(v: n);
174 nextAscii = src + (n / BitSpacing) + 1;
175 return false;
176
177 }
178
179 if (end - src >= 8) {
180 __m128i data = _mm_loadl_epi64(p: reinterpret_cast<const __m128i *>(src));
181 uint n = _mm_movemask_epi8(a: data) & 0xff;
182 if (!n) {
183 // unpack and store
184 _mm_storeu_si128(p: reinterpret_cast<__m128i *>(dst), b: _mm_unpacklo_epi8(a: data, b: _mm_setzero_si128()));
185 } else {
186 while (!(n & 1)) {
187 *dst++ = *src++;
188 n >>= 1;
189 }
190
191 n = qBitScanReverse(v: n);
192 nextAscii = src + n + 1;
193 return false;
194 }
195 }
196
197 return src == end;
198}
199
200static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
201{
202#ifdef __AVX2__
203 // do 32 characters at a time
204 // (this is similar to simdTestMask in qstring.cpp)
205 const __m256i mask = _mm256_set1_epi8(0x80);
206 for ( ; end - src >= 32; src += 32) {
207 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
208 if (_mm256_testz_si256(mask, data))
209 continue;
210
211 uint n = _mm256_movemask_epi8(data);
212 Q_ASSUME(n);
213
214 // find the next probable ASCII character
215 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
216 // characters still coming
217 nextAscii = src + qBitScanReverse(n) + 1;
218
219 // return the non-ASCII character
220 return src + qCountTrailingZeroBits(n);
221 }
222#endif
223
224 // do sixteen characters at a time
225 for ( ; end - src >= 16; src += 16) {
226 __m128i data = _mm_loadu_si128(p: reinterpret_cast<const __m128i*>(src));
227
228 // check if everything is ASCII
229 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
230 uint n = _mm_movemask_epi8(a: data);
231 if (!n)
232 continue;
233
234 // find the next probable ASCII character
235 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
236 // characters still coming
237 nextAscii = src + qBitScanReverse(v: n) + 1;
238
239 // return the non-ASCII character
240 return src + qCountTrailingZeroBits(v: n);
241 }
242
243 // do four characters at a time
244 for ( ; end - src >= 4; src += 4) {
245 quint32 data = qFromUnaligned<quint32>(src);
246 data &= 0x80808080U;
247 if (!data)
248 continue;
249
250 // We don't try to guess which of the three bytes is ASCII and which
251 // one isn't. The chance that at least two of them are non-ASCII is
252 // better than 75%.
253 nextAscii = src;
254 return src;
255 }
256 nextAscii = end;
257 return src;
258}
259#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
260static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
261{
262 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
263 uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
264 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
265
266 // do sixteen characters at a time
267 for ( ; end - src >= 16; src += 16, dst += 16) {
268 // load 2 lanes (or: "load interleaved")
269 uint16x8x2_t in = vld2q_u16(src);
270
271 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
272 // add those together into a scalar, and merge the scalars.
273 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
274 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
275
276 // merge the two lanes by shifting the values of the second by 8 and inserting them
277 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
278
279 // store, even if there are non-ASCII characters here
280 vst1q_u8(dst, vreinterpretq_u8_u16(out));
281
282 if (nonAscii) {
283 // find the next probable ASCII character
284 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
285 // characters still coming
286 nextAscii = src + qBitScanReverse(nonAscii) + 1;
287
288 nonAscii = qCountTrailingZeroBits(nonAscii);
289 dst += nonAscii;
290 src += nonAscii;
291 return false;
292 }
293 }
294 return src == end;
295}
296
297static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
298{
299 // do eight characters at a time
300 uint8x8_t msb_mask = vdup_n_u8(0x80);
301 uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
302 for ( ; end - src >= 8; src += 8, dst += 8) {
303 uint8x8_t c = vld1_u8(src);
304 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
305 if (!n) {
306 // store
307 vst1q_u16(dst, vmovl_u8(c));
308 continue;
309 }
310
311 // copy the front part that is still ASCII
312 while (!(n & 1)) {
313 *dst++ = *src++;
314 n >>= 1;
315 }
316
317 // find the next probable ASCII character
318 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
319 // characters still coming
320 n = qBitScanReverse(n);
321 nextAscii = src + n + 1;
322 return false;
323
324 }
325 return src == end;
326}
327
328static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
329{
330 // The SIMD code below is untested, so just force an early return until
331 // we've had the time to verify it works.
332 nextAscii = end;
333 return src;
334
335 // do eight characters at a time
336 uint8x8_t msb_mask = vdup_n_u8(0x80);
337 uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
338 for ( ; end - src >= 8; src += 8) {
339 uint8x8_t c = vld1_u8(src);
340 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
341 if (!n)
342 continue;
343
344 // find the next probable ASCII character
345 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
346 // characters still coming
347 nextAscii = src + qBitScanReverse(n) + 1;
348
349 // return the non-ASCII character
350 return src + qCountTrailingZeroBits(n);
351 }
352 nextAscii = end;
353 return src;
354}
355#else
356static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
357{
358 return false;
359}
360
361static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
362{
363 return false;
364}
365
366static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
367{
368 nextAscii = end;
369 return src;
370}
371#endif
372
373QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
374{
375 // create a QByteArray with the worst case scenario size
376 QByteArray result(len * 3, Qt::Uninitialized);
377 uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
378 const ushort *src = reinterpret_cast<const ushort *>(uc);
379 const ushort *const end = src + len;
380
381 while (src != end) {
382 const ushort *nextAscii = end;
383 if (simdEncodeAscii(dst, nextAscii, src, end))
384 break;
385
386 do {
387 ushort uc = *src++;
388 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst, src, end);
389 if (res < 0) {
390 // encoding error - append '?'
391 *dst++ = '?';
392 }
393 } while (src < nextAscii);
394 }
395
396 result.truncate(pos: dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
397 return result;
398}
399
400QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
401{
402 uchar replacement = '?';
403 int rlen = 3*len;
404 int surrogate_high = -1;
405 if (state) {
406 if (state->flags & QTextCodec::ConvertInvalidToNull)
407 replacement = 0;
408 if (!(state->flags & QTextCodec::IgnoreHeader))
409 rlen += 3;
410 if (state->remainingChars)
411 surrogate_high = state->state_data[0];
412 }
413
414
415 QByteArray rstr(rlen, Qt::Uninitialized);
416 uchar *cursor = reinterpret_cast<uchar *>(const_cast<char *>(rstr.constData()));
417 const ushort *src = reinterpret_cast<const ushort *>(uc);
418 const ushort *const end = src + len;
419
420 int invalid = 0;
421 if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
422 // append UTF-8 BOM
423 *cursor++ = utf8bom[0];
424 *cursor++ = utf8bom[1];
425 *cursor++ = utf8bom[2];
426 }
427
428 const ushort *nextAscii = src;
429 while (src != end) {
430 int res;
431 ushort uc;
432 if (surrogate_high != -1) {
433 uc = surrogate_high;
434 surrogate_high = -1;
435 res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
436 } else {
437 if (src >= nextAscii && simdEncodeAscii(dst&: cursor, nextAscii, src, end))
438 break;
439
440 uc = *src++;
441 res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u: uc, dst&: cursor, src, end);
442 }
443 if (Q_LIKELY(res >= 0))
444 continue;
445
446 if (res == QUtf8BaseTraits::Error) {
447 // encoding error
448 ++invalid;
449 *cursor++ = replacement;
450 } else if (res == QUtf8BaseTraits::EndOfString) {
451 surrogate_high = uc;
452 break;
453 }
454 }
455
456 rstr.resize(size: cursor - (const uchar*)rstr.constData());
457 if (state) {
458 state->invalidChars += invalid;
459 state->flags |= QTextCodec::IgnoreHeader;
460 state->remainingChars = 0;
461 if (surrogate_high >= 0) {
462 state->remainingChars = 1;
463 state->state_data[0] = surrogate_high;
464 }
465 }
466 return rstr;
467}
468
469QString QUtf8::convertToUnicode(const char *chars, int len)
470{
471 // UTF-8 to UTF-16 always needs the exact same number of words or less:
472 // UTF-8 UTF-16
473 // 1 byte 1 word
474 // 2 bytes 1 word
475 // 3 bytes 1 word
476 // 4 bytes 2 words (one surrogate pair)
477 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
478 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
479 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
480 //
481 // The table holds for invalid sequences too: we'll insert one replacement char
482 // per invalid byte.
483 QString result(len, Qt::Uninitialized);
484 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
485 const QChar *end = convertToUnicode(data, chars, len);
486 result.truncate(pos: end - data);
487 return result;
488}
489
490/*!
491 \since 5.7
492 \overload
493
494 Converts the UTF-8 sequence of \a len octets beginning at \a chars to
495 a sequence of QChar starting at \a buffer. The buffer is expected to be
496 large enough to hold the result. An upper bound for the size of the
497 buffer is \a len QChars.
498
499 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
500 written.
501
502 Returns a pointer to one past the last QChar written.
503
504 This function never throws.
505*/
506
507QChar *QUtf8::convertToUnicode(QChar *buffer, const char *chars, int len) noexcept
508{
509 ushort *dst = reinterpret_cast<ushort *>(buffer);
510 const uchar *src = reinterpret_cast<const uchar *>(chars);
511 const uchar *end = src + len;
512
513 // attempt to do a full decoding in SIMD
514 const uchar *nextAscii = end;
515 if (!simdDecodeAscii(dst, nextAscii, src, end)) {
516 // at least one non-ASCII entry
517 // check if we failed to decode the UTF-8 BOM; if so, skip it
518 if (Q_UNLIKELY(src == reinterpret_cast<const uchar *>(chars))
519 && end - src >= 3
520 && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
521 src += 3;
522 }
523
524 while (src < end) {
525 nextAscii = end;
526 if (simdDecodeAscii(dst, nextAscii, src, end))
527 break;
528
529 do {
530 uchar b = *src++;
531 int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
532 if (res < 0) {
533 // decoding error
534 *dst++ = QChar::ReplacementCharacter;
535 }
536 } while (src < nextAscii);
537 }
538 }
539
540 return reinterpret_cast<QChar *>(dst);
541}
542
543QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
544{
545 bool headerdone = false;
546 ushort replacement = QChar::ReplacementCharacter;
547 int invalid = 0;
548 int res;
549 uchar ch = 0;
550
551 // See above for buffer requirements for stateless decoding. However, that
552 // fails if the state is not empty. The following situations can add to the
553 // requirements:
554 // state contains chars starts with requirement
555 // 1 of 2 bytes valid continuation 0
556 // 2 of 3 bytes same 0
557 // 3 bytes of 4 same +1 (need to insert surrogate pair)
558 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
559 // 2 of 3 bytes same +1 (same)
560 // 3 of 4 bytes same +1 (same)
561 QString result(len + 1, Qt::Uninitialized);
562
563 ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
564 const uchar *src = reinterpret_cast<const uchar *>(chars);
565 const uchar *end = src + len;
566
567 if (state) {
568 if (state->flags & QTextCodec::IgnoreHeader)
569 headerdone = true;
570 if (state->flags & QTextCodec::ConvertInvalidToNull)
571 replacement = QChar::Null;
572 if (state->remainingChars) {
573 // handle incoming state first
574 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
575 int remainingCharsCount = state->remainingChars;
576 int newCharsToCopy = qMin<int>(a: sizeof(remainingCharsData) - remainingCharsCount, b: end - src);
577
578 memset(s: remainingCharsData, c: 0, n: sizeof(remainingCharsData));
579 memcpy(dest: remainingCharsData, src: &state->state_data[0], n: remainingCharsCount);
580 memcpy(dest: remainingCharsData + remainingCharsCount, src: src, n: newCharsToCopy);
581
582 const uchar *begin = &remainingCharsData[1];
583 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: remainingCharsData[0], dst, src&: begin,
584 end: static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
585 if (res == QUtf8BaseTraits::Error || (res == QUtf8BaseTraits::EndOfString && len == 0)) {
586 // special case for len == 0:
587 // if we were supplied an empty string, terminate the previous, unfinished sequence with error
588 ++invalid;
589 *dst++ = replacement;
590 } else if (res == QUtf8BaseTraits::EndOfString) {
591 // if we got EndOfString again, then there were too few bytes in src;
592 // copy to our state and return
593 state->remainingChars = remainingCharsCount + newCharsToCopy;
594 memcpy(dest: &state->state_data[0], src: remainingCharsData, n: state->remainingChars);
595 return QString();
596 } else if (!headerdone && res >= 0) {
597 // eat the UTF-8 BOM
598 headerdone = true;
599 if (dst[-1] == 0xfeff)
600 --dst;
601 }
602
603 // adjust src now that we have maybe consumed a few chars
604 if (res >= 0) {
605 Q_ASSERT(res > remainingCharsCount);
606 src += res - remainingCharsCount;
607 }
608 }
609 }
610
611 // main body, stateless decoding
612 res = 0;
613 const uchar *nextAscii = src;
614 const uchar *start = src;
615 while (res >= 0 && src < end) {
616 if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
617 break;
618
619 ch = *src++;
620 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b: ch, dst, src, end);
621 if (!headerdone && res >= 0) {
622 headerdone = true;
623 if (src == start + 3) { // 3 == sizeof(utf8-bom)
624 // eat the UTF-8 BOM (it can only appear at the beginning of the string).
625 if (dst[-1] == 0xfeff)
626 --dst;
627 }
628 }
629 if (res == QUtf8BaseTraits::Error) {
630 res = 0;
631 ++invalid;
632 *dst++ = replacement;
633 }
634 }
635
636 if (!state && res == QUtf8BaseTraits::EndOfString) {
637 // unterminated UTF sequence
638 *dst++ = QChar::ReplacementCharacter;
639 while (src++ < end)
640 *dst++ = QChar::ReplacementCharacter;
641 }
642
643 result.truncate(pos: dst - (const ushort *)result.unicode());
644 if (state) {
645 state->invalidChars += invalid;
646 if (headerdone)
647 state->flags |= QTextCodec::IgnoreHeader;
648 if (res == QUtf8BaseTraits::EndOfString) {
649 --src; // unread the byte in ch
650 state->remainingChars = end - src;
651 memcpy(dest: &state->state_data[0], src: src, n: end - src);
652 } else {
653 state->remainingChars = 0;
654 }
655 }
656 return result;
657}
658
659struct QUtf8NoOutputTraits : public QUtf8BaseTraitsNoAscii
660{
661 struct NoOutput {};
662 static void appendUtf16(const NoOutput &, ushort) {}
663 static void appendUcs4(const NoOutput &, uint) {}
664};
665
666QUtf8::ValidUtf8Result QUtf8::isValidUtf8(const char *chars, qsizetype len)
667{
668 const uchar *src = reinterpret_cast<const uchar *>(chars);
669 const uchar *end = src + len;
670 const uchar *nextAscii = src;
671 bool isValidAscii = true;
672
673 while (src < end) {
674 if (src >= nextAscii)
675 src = simdFindNonAscii(src, end, nextAscii);
676 if (src == end)
677 break;
678
679 do {
680 uchar b = *src++;
681 if ((b & 0x80) == 0)
682 continue;
683
684 isValidAscii = false;
685 QUtf8NoOutputTraits::NoOutput output;
686 int res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, dst&: output, src, end);
687 if (res < 0) {
688 // decoding error
689 return { .isValidUtf8: false, .isValidAscii: false };
690 }
691 } while (src < nextAscii);
692 }
693
694 return { .isValidUtf8: true, .isValidAscii: isValidAscii };
695}
696
697int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, const QChar *utf16, int u16len)
698{
699 uint uc1, uc2;
700 auto src1 = reinterpret_cast<const uchar *>(utf8);
701 auto end1 = src1 + u8len;
702 QStringIterator src2(utf16, utf16 + u16len);
703
704 while (src1 < end1 && src2.hasNext()) {
705 uchar b = *src1++;
706 uint *output = &uc1;
707 int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
708 if (res < 0) {
709 // decoding error
710 uc1 = QChar::ReplacementCharacter;
711 }
712
713 uc2 = src2.next();
714 if (uc1 != uc2)
715 return int(uc1) - int(uc2);
716 }
717
718 // the shorter string sorts first
719 return (end1 > src1) - int(src2.hasNext());
720}
721
722int QUtf8::compareUtf8(const char *utf8, qsizetype u8len, QLatin1String s)
723{
724 uint uc1;
725 auto src1 = reinterpret_cast<const uchar *>(utf8);
726 auto end1 = src1 + u8len;
727 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
728 auto end2 = src2 + s.size();
729
730 while (src1 < end1 && src2 < end2) {
731 uchar b = *src1++;
732 uint *output = &uc1;
733 int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst&: output, src&: src1, end: end1);
734 if (res < 0) {
735 // decoding error
736 uc1 = QChar::ReplacementCharacter;
737 }
738
739 uint uc2 = *src2++;
740 if (uc1 != uc2)
741 return int(uc1) - int(uc2);
742 }
743
744 // the shorter string sorts first
745 return (end1 > src1) - (end2 > src2);
746}
747
748QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
749{
750 DataEndianness endian = e;
751 int length = 2*len;
752 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
753 length += 2;
754 }
755 if (e == DetectEndianness) {
756 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
757 }
758
759 QByteArray d;
760 d.resize(size: length);
761 char *data = d.data();
762 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
763 QChar bom(QChar::ByteOrderMark);
764 if (endian == BigEndianness)
765 qToBigEndian(src: bom.unicode(), dest: data);
766 else
767 qToLittleEndian(src: bom.unicode(), dest: data);
768 data += 2;
769 }
770 if (endian == BigEndianness)
771 qToBigEndian<ushort>(source: uc, count: len, dest: data);
772 else
773 qToLittleEndian<ushort>(source: uc, count: len, dest: data);
774
775 if (state) {
776 state->remainingChars = 0;
777 state->flags |= QTextCodec::IgnoreHeader;
778 }
779 return d;
780}
781
782QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
783{
784 DataEndianness endian = e;
785 bool half = false;
786 uchar buf = 0;
787 bool headerdone = false;
788 if (state) {
789 headerdone = state->flags & QTextCodec::IgnoreHeader;
790 if (endian == DetectEndianness)
791 endian = (DataEndianness)state->state_data[Endian];
792 if (state->remainingChars) {
793 half = true;
794 buf = state->state_data[Data];
795 }
796 }
797 if (headerdone && endian == DetectEndianness)
798 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
799
800 QString result(len, Qt::Uninitialized); // worst case
801 QChar *qch = (QChar *)result.data();
802 while (len--) {
803 if (half) {
804 QChar ch;
805 if (endian == LittleEndianness) {
806 ch.setRow(*chars++);
807 ch.setCell(buf);
808 } else {
809 ch.setRow(buf);
810 ch.setCell(*chars++);
811 }
812 if (!headerdone) {
813 headerdone = true;
814 if (endian == DetectEndianness) {
815 if (ch == QChar::ByteOrderSwapped) {
816 endian = LittleEndianness;
817 } else if (ch == QChar::ByteOrderMark) {
818 endian = BigEndianness;
819 } else {
820 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
821 endian = BigEndianness;
822 } else {
823 endian = LittleEndianness;
824 ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
825 }
826 *qch++ = ch;
827 }
828 } else if (ch != QChar::ByteOrderMark) {
829 *qch++ = ch;
830 }
831 } else {
832 *qch++ = ch;
833 }
834 half = false;
835 } else {
836 buf = *chars++;
837 half = true;
838 }
839 }
840 result.truncate(pos: qch - result.unicode());
841
842 if (state) {
843 if (headerdone)
844 state->flags |= QTextCodec::IgnoreHeader;
845 state->state_data[Endian] = endian;
846 if (half) {
847 state->remainingChars = 1;
848 state->state_data[Data] = buf;
849 } else {
850 state->remainingChars = 0;
851 state->state_data[Data] = 0;
852 }
853 }
854 return result;
855}
856
857QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
858{
859 DataEndianness endian = e;
860 int length = 4*len;
861 if (!state || (!(state->flags & QTextCodec::IgnoreHeader))) {
862 length += 4;
863 }
864 if (e == DetectEndianness) {
865 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
866 }
867
868 QByteArray d(length, Qt::Uninitialized);
869 char *data = d.data();
870 if (!state || !(state->flags & QTextCodec::IgnoreHeader)) {
871 if (endian == BigEndianness) {
872 data[0] = 0;
873 data[1] = 0;
874 data[2] = (char)0xfe;
875 data[3] = (char)0xff;
876 } else {
877 data[0] = (char)0xff;
878 data[1] = (char)0xfe;
879 data[2] = 0;
880 data[3] = 0;
881 }
882 data += 4;
883 }
884
885 QStringIterator i(uc, uc + len);
886 if (endian == BigEndianness) {
887 while (i.hasNext()) {
888 uint cp = i.next();
889 qToBigEndian(src: cp, dest: data);
890 data += 4;
891 }
892 } else {
893 while (i.hasNext()) {
894 uint cp = i.next();
895 qToLittleEndian(src: cp, dest: data);
896 data += 4;
897 }
898 }
899
900 if (state) {
901 state->remainingChars = 0;
902 state->flags |= QTextCodec::IgnoreHeader;
903 }
904 return d;
905}
906
907QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
908{
909 DataEndianness endian = e;
910 uchar tuple[4];
911 int num = 0;
912 bool headerdone = false;
913 if (state) {
914 headerdone = state->flags & QTextCodec::IgnoreHeader;
915 if (endian == DetectEndianness) {
916 endian = (DataEndianness)state->state_data[Endian];
917 }
918 num = state->remainingChars;
919 memcpy(dest: tuple, src: &state->state_data[Data], n: 4);
920 }
921 if (headerdone && endian == DetectEndianness)
922 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
923
924 QString result;
925 result.resize(size: (num + len) >> 2 << 1); // worst case
926 QChar *qch = (QChar *)result.data();
927
928 const char *end = chars + len;
929 while (chars < end) {
930 tuple[num++] = *chars++;
931 if (num == 4) {
932 if (!headerdone) {
933 headerdone = true;
934 if (endian == DetectEndianness) {
935 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0 && endian != BigEndianness) {
936 endian = LittleEndianness;
937 num = 0;
938 continue;
939 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff && endian != LittleEndianness) {
940 endian = BigEndianness;
941 num = 0;
942 continue;
943 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
944 endian = BigEndianness;
945 } else {
946 endian = LittleEndianness;
947 }
948 } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(src: tuple) : qFromLittleEndian<quint32>(src: tuple)) == QChar::ByteOrderMark) {
949 num = 0;
950 continue;
951 }
952 }
953 uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(src: tuple) : qFromLittleEndian<quint32>(src: tuple);
954 if (QChar::requiresSurrogates(ucs4: code)) {
955 *qch++ = QChar(QChar::highSurrogate(ucs4: code));
956 *qch++ = QChar(QChar::lowSurrogate(ucs4: code));
957 } else {
958 *qch++ = QChar(code);
959 }
960 num = 0;
961 }
962 }
963 result.truncate(pos: qch - result.unicode());
964
965 if (state) {
966 if (headerdone)
967 state->flags |= QTextCodec::IgnoreHeader;
968 state->state_data[Endian] = endian;
969 state->remainingChars = num;
970 memcpy(dest: &state->state_data[Data], src: tuple, n: 4);
971 }
972 return result;
973}
974
975
976#if QT_CONFIG(textcodec)
977
978QUtf8Codec::~QUtf8Codec()
979{
980}
981
982QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
983{
984 return QUtf8::convertFromUnicode(uc, len, state);
985}
986
987void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
988{
989 *target += QUtf8::convertToUnicode(chars, len, state);
990}
991
992QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
993{
994 return QUtf8::convertToUnicode(chars, len, state);
995}
996
997QByteArray QUtf8Codec::name() const
998{
999 return "UTF-8";
1000}
1001
1002int QUtf8Codec::mibEnum() const
1003{
1004 return 106;
1005}
1006
1007QUtf16Codec::~QUtf16Codec()
1008{
1009}
1010
1011QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
1012{
1013 return QUtf16::convertFromUnicode(uc, len, state, e);
1014}
1015
1016QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
1017{
1018 return QUtf16::convertToUnicode(chars, len, state, e);
1019}
1020
1021int QUtf16Codec::mibEnum() const
1022{
1023 return 1015;
1024}
1025
1026QByteArray QUtf16Codec::name() const
1027{
1028 return "UTF-16";
1029}
1030
1031QList<QByteArray> QUtf16Codec::aliases() const
1032{
1033 return QList<QByteArray>();
1034}
1035
1036int QUtf16BECodec::mibEnum() const
1037{
1038 return 1013;
1039}
1040
1041QByteArray QUtf16BECodec::name() const
1042{
1043 return "UTF-16BE";
1044}
1045
1046QList<QByteArray> QUtf16BECodec::aliases() const
1047{
1048 QList<QByteArray> list;
1049 return list;
1050}
1051
1052int QUtf16LECodec::mibEnum() const
1053{
1054 return 1014;
1055}
1056
1057QByteArray QUtf16LECodec::name() const
1058{
1059 return "UTF-16LE";
1060}
1061
1062QList<QByteArray> QUtf16LECodec::aliases() const
1063{
1064 QList<QByteArray> list;
1065 return list;
1066}
1067
1068QUtf32Codec::~QUtf32Codec()
1069{
1070}
1071
1072QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
1073{
1074 return QUtf32::convertFromUnicode(uc, len, state, e);
1075}
1076
1077QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
1078{
1079 return QUtf32::convertToUnicode(chars, len, state, e);
1080}
1081
1082int QUtf32Codec::mibEnum() const
1083{
1084 return 1017;
1085}
1086
1087QByteArray QUtf32Codec::name() const
1088{
1089 return "UTF-32";
1090}
1091
1092QList<QByteArray> QUtf32Codec::aliases() const
1093{
1094 QList<QByteArray> list;
1095 return list;
1096}
1097
1098int QUtf32BECodec::mibEnum() const
1099{
1100 return 1018;
1101}
1102
1103QByteArray QUtf32BECodec::name() const
1104{
1105 return "UTF-32BE";
1106}
1107
1108QList<QByteArray> QUtf32BECodec::aliases() const
1109{
1110 QList<QByteArray> list;
1111 return list;
1112}
1113
1114int QUtf32LECodec::mibEnum() const
1115{
1116 return 1019;
1117}
1118
1119QByteArray QUtf32LECodec::name() const
1120{
1121 return "UTF-32LE";
1122}
1123
1124QList<QByteArray> QUtf32LECodec::aliases() const
1125{
1126 QList<QByteArray> list;
1127 return list;
1128}
1129
1130#endif // textcodec
1131
1132QT_END_NAMESPACE
1133

source code of qtbase/src/corelib/codecs/qutfcodec.cpp