1/****************************************************************************
2**
3** Copyright (C) 2018 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#include "qplatformdefs.h"
42
43#include "qtextcodec.h"
44#include "qtextcodec_p.h"
45
46#include "qbytearraymatcher.h"
47#include "qendian.h"
48#include "qfile.h"
49#include "qlist.h"
50#include <private/qlocking_p.h>
51#include "qstringlist.h"
52#include "qvarlengtharray.h"
53#if !defined(QT_BOOTSTRAPPED)
54#include <private/qcoreapplication_p.h>
55#endif
56#include "private/qcoreglobaldata_p.h"
57
58#include "qutfcodec_p.h"
59#include "qlatincodec_p.h"
60
61#if !defined(QT_BOOTSTRAPPED)
62#if QT_CONFIG(codecs)
63# include "qtsciicodec_p.h"
64# include "qisciicodec_p.h"
65#endif
66#if QT_CONFIG(icu)
67#include "qicucodec_p.h"
68#else
69#if QT_CONFIG(iconv)
70# include "qiconvcodec_p.h"
71#endif
72#ifdef Q_OS_WIN
73# include "qwindowscodec_p.h"
74#endif
75# include "qsimplecodec_p.h"
76#if QT_CONFIG(big_codecs)
77# ifndef Q_OS_INTEGRITY
78# include "qgb18030codec_p.h"
79# include "qeucjpcodec_p.h"
80# include "qjiscodec_p.h"
81# include "qsjiscodec_p.h"
82# include "qeuckrcodec_p.h"
83# include "qbig5codec_p.h"
84# endif // !Q_OS_INTEGRITY
85#endif // big_codecs
86
87#endif // icu
88#endif // QT_BOOTSTRAPPED
89
90#include <mutex>
91
92#include <stdlib.h>
93#include <ctype.h>
94#include <locale.h>
95#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
96# include <langinfo.h>
97#endif
98
99QT_BEGIN_NAMESPACE
100
101typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
102typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
103
104Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
105
106class TextCodecsMutexLocker
107{
108 using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>()));
109 // ### FIXME: this is used when textCodecsMutex already == nullptr
110 const Lock lock = qt_unique_lock(mutex: textCodecsMutex());
111public:
112 TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
113};
114
115#if !QT_CONFIG(icu)
// ASCII-only lower-casing: 'A'..'Z' map to 'a'..'z', every other value is
// returned unchanged (no locale involvement).
static char qtolower(char c)
{
    const bool isUpperAscii = (c >= 'A' && c <= 'Z');
    return isUpperAscii ? c + 0x20 : c;
}
// ASCII-only test for a digit or a letter. Setting bit 0x20 folds upper-case
// letters onto lower-case ones so a single range check covers both.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
120
121bool qTextCodecNameMatch(const char *n, const char *h)
122{
123 if (qstricmp(n, h) == 0)
124 return true;
125
126 // if the letters and numbers are the same, we have a match
127 while (*n != '\0') {
128 if (qisalnum(*n)) {
129 for (;;) {
130 if (*h == '\0')
131 return false;
132 if (qisalnum(*h))
133 break;
134 ++h;
135 }
136 if (qtolower(*n) != qtolower(*h))
137 return false;
138 ++h;
139 }
140 ++n;
141 }
142 while (*h && !qisalnum(*h))
143 ++h;
144 return (*h == '\0');
145}
146
147
148#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
149static QTextCodec *checkForCodec(const QByteArray &name) {
150 QTextCodec *c = QTextCodec::codecForName(name);
151 if (!c) {
152 const int index = name.indexOf('@');
153 if (index != -1) {
154 c = QTextCodec::codecForName(name.left(index));
155 }
156 }
157 return c;
158}
159#endif
160
161static void setup();
162
163// \threadsafe
164// this returns the codec the method sets up as locale codec to
165// avoid a race condition in codecForLocale() when
166// setCodecForLocale(0) is called at the same time.
167static QTextCodec *setupLocaleMapper()
168{
169 QCoreGlobalData *globalData = QCoreGlobalData::instance();
170
171 QTextCodec *locale = nullptr;
172
173 {
174 const TextCodecsMutexLocker locker;
175 if (globalData->allCodecs.isEmpty())
176 setup();
177 }
178
179#if !defined(QT_BOOTSTRAPPED)
180 QCoreApplicationPrivate::initLocale();
181#endif
182
183#if defined(QT_LOCALE_IS_UTF8)
184 locale = QTextCodec::codecForName("UTF-8");
185#elif defined(Q_OS_WIN)
186 locale = QTextCodec::codecForName("System");
187#else
188
189 // First try getting the codecs name from nl_langinfo and see
190 // if we have a builtin codec for it.
191 // Only fall back to using iconv if we can't find a builtin codec
192 // This is because the builtin utf8 codec is around 5 times faster
193 // then the using QIconvCodec
194
195#if defined (_XOPEN_UNIX)
196 char *charset = nl_langinfo(CODESET);
197 if (charset)
198 locale = QTextCodec::codecForName(charset);
199#endif
200#if QT_CONFIG(iconv)
201 if (!locale) {
202 // no builtin codec for the locale found, let's try using iconv
203 (void) new QIconvCodec();
204 locale = QTextCodec::codecForName("System");
205 }
206#endif
207
208 if (!locale) {
209 // Very poorly defined and followed standards causes lots of
210 // code to try to get all the cases... This logic is
211 // duplicated in QIconvCodec, so if you change it here, change
212 // it there too.
213
214 // Try to determine locale codeset from locale name assigned to
215 // LC_CTYPE category.
216
217 // First part is getting that locale name. First try setlocale() which
218 // definitely knows it, but since we cannot fully trust it, get ready
219 // to fall back to environment variables.
220 const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
221
222 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
223 // environment variables.
224 QByteArray lang = qgetenv("LC_ALL");
225 if (lang.isEmpty() || lang == "C") {
226 lang = qgetenv("LC_CTYPE");
227 }
228 if (lang.isEmpty() || lang == "C") {
229 lang = qgetenv("LANG");
230 }
231
232 // Now try these in order:
233 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
234 // 2. CODESET from lang if it contains a .CODESET part
235 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
236 // 4. locale (ditto)
237 // 5. check for "@euro"
238 // 6. guess locale from ctype unless ctype is "C"
239 // 7. guess locale from lang
240
241 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
242 int indexOfDot = ctype.indexOf('.');
243 if (indexOfDot != -1)
244 locale = checkForCodec( ctype.mid(indexOfDot + 1) );
245
246 // 2. CODESET from lang if it contains a .CODESET part
247 if (!locale) {
248 indexOfDot = lang.indexOf('.');
249 if (indexOfDot != -1)
250 locale = checkForCodec( lang.mid(indexOfDot + 1) );
251 }
252
253 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
254 if (!locale && !ctype.isEmpty() && ctype != "C")
255 locale = checkForCodec(ctype);
256
257 // 4. locale (ditto)
258 if (!locale && !lang.isEmpty())
259 locale = checkForCodec(lang);
260
261 // 5. "@euro"
262 if ((!locale && ctype.contains("@euro")) || lang.contains("@euro"))
263 locale = checkForCodec("ISO 8859-15");
264 }
265
266#endif
267 // If everything failed, we default to 8859-1
268 if (!locale)
269 locale = QTextCodec::codecForName("ISO 8859-1");
270 globalData->codecForLocale.storeRelease(locale);
271 return locale;
272}
273
274
275// textCodecsMutex need to be locked to enter this function
276static void setup()
277{
278 static bool initialized = false;
279 if (initialized)
280 return;
281 initialized = true;
282
283#if QT_CONFIG(codecs) && !defined(QT_BOOTSTRAPPED)
284 (void)new QTsciiCodec;
285 for (int i = 0; i < 9; ++i)
286 (void)new QIsciiCodec(i);
287 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
288 (void)new QSimpleTextCodec(i);
289
290# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
291 (void)new QGb18030Codec;
292 (void)new QGbkCodec;
293 (void)new QGb2312Codec;
294 (void)new QEucJpCodec;
295 (void)new QJisCodec;
296 (void)new QSjisCodec;
297 (void)new QEucKrCodec;
298 (void)new QCP949Codec;
299 (void)new QBig5Codec;
300 (void)new QBig5hkscsCodec;
301# endif // big_codecs && !Q_OS_INTEGRITY
302#if QT_CONFIG(iconv)
303 (void) new QIconvCodec;
304#endif
305#if defined(Q_OS_WIN32)
306 (void) new QWindowsLocalCodec;
307#endif // Q_OS_WIN32
308#endif // codecs && !QT_BOOTSTRAPPED
309
310 (void)new QUtf16Codec;
311 (void)new QUtf16BECodec;
312 (void)new QUtf16LECodec;
313 (void)new QUtf32Codec;
314 (void)new QUtf32BECodec;
315 (void)new QUtf32LECodec;
316 (void)new QLatin15Codec;
317 (void)new QLatin1Codec;
318 (void)new QUtf8Codec;
319}
320#else
// ICU build: nothing is registered up front. The empty function exists so
// that callers can invoke setup() regardless of the configuration.
static void setup()
{
}
322#endif // icu
323
324/*!
325 \enum QTextCodec::ConversionFlag
326
327 \value DefaultConversion No flag is set.
328 \value ConvertInvalidToNull If this flag is set, each invalid input
329 character is output as a null character.
330 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
331
332 \omitvalue FreeFunction
333*/
334
335/*!
336 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
337
338 Constructs a ConverterState object initialized with the given \a flags.
339*/
340
341/*!
342 Destroys the ConverterState object.
343*/
344QTextCodec::ConverterState::~ConverterState()
345{
346 if (flags & FreeFunction)
347 (QTextCodecUnalignedPointer::decode(src: state_data))(this);
348 else if (d)
349 free(ptr: d);
350}
351
352/*!
353 \class QTextCodec
354 \inmodule QtCore
355 \brief The QTextCodec class provides conversions between text encodings.
356 \reentrant
357 \ingroup i18n
358
359 Qt uses Unicode to store, draw and manipulate strings. In many
360 situations you may wish to deal with data that uses a different
361 encoding. For example, most Japanese documents are still stored
362 in Shift-JIS or ISO 2022-JP, while Russian users often have their
363 documents in KOI8-R or Windows-1251.
364
365 Qt provides a set of QTextCodec classes to help with converting
366 non-Unicode formats to and from Unicode. You can also create your
367 own codec classes.
368
369 The supported encodings are:
370
371 \list
372 \li \l{Big5 Text Codec}{Big5}
373 \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
374 \li CP949
375 \li \l{EUC-JP Text Codec}{EUC-JP}
376 \li \l{EUC-KR Text Codec}{EUC-KR}
377 \li \l{GBK Text Codec}{GB18030}
378 \li HP-ROMAN8
379 \li IBM 850
380 \li IBM 866
381 \li IBM 874
382 \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
383 \li ISO 8859-1 to 10
384 \li ISO 8859-13 to 16
385 \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
386 \li KOI8-R
387 \li KOI8-U
388 \li Macintosh
389 \li \l{Shift-JIS Text Codec}{Shift-JIS}
390 \li TIS-620
391 \li \l{TSCII Text Codec}{TSCII}
392 \li UTF-8
393 \li UTF-16
394 \li UTF-16BE
395 \li UTF-16LE
396 \li UTF-32
397 \li UTF-32BE
398 \li UTF-32LE
399 \li Windows-1250 to 1258
400 \endlist
401
402 If Qt is compiled with ICU support enabled, most codecs supported by
403 ICU will also be available to the application.
404
405 \l {QTextCodec}s can be used as follows to convert some locally encoded
406 string to Unicode. Suppose you have some string encoded in Russian
407 KOI8-R encoding, and want to convert it to Unicode. The simple way
408 to do it is like this:
409
410 \snippet code/src_corelib_codecs_qtextcodec.cpp 0
411
412 After this, \c string holds the text converted to Unicode.
413 Converting a string from Unicode to the local encoding is just as
414 easy:
415
416 \snippet code/src_corelib_codecs_qtextcodec.cpp 1
417
418 To read or write files in various encodings, use QTextStream and
419 its \l{QTextStream::setCodec()}{setCodec()} function. See the
420 \l{tools/codecs}{Codecs} example for an application of QTextCodec
421 to file I/O.
422
423 Some care must be taken when trying to convert the data in chunks,
424 for example, when receiving it over a network. In such cases it is
425 possible that a multi-byte character will be split over two
426 chunks. At best this might result in the loss of a character and
427 at worst cause the entire conversion to fail.
428
429 The approach to use in these situations is to create a QTextDecoder
430 object for the codec and use this QTextDecoder for the whole
431 decoding process, as shown below:
432
433 \snippet code/src_corelib_codecs_qtextcodec.cpp 2
434
435 The QTextDecoder object maintains state between chunks and therefore
436 works correctly even if a multi-byte character is split between
437 chunks.
438
439 \section1 Creating Your Own Codec Class
440
441 Support for new text encodings can be added to Qt by creating
442 QTextCodec subclasses.
443
444 The pure virtual functions describe the encoder to the system and
445 the coder is used as required in the different text file formats
446 supported by QTextStream, and under X11, for the locale-specific
447 character input and output.
448
449 To add support for another encoding to Qt, make a subclass of
450 QTextCodec and implement the functions listed in the table below.
451
452 \table
453 \header \li Function \li Description
454
455 \row \li name()
456 \li Returns the official name for the encoding. If the
457 encoding is listed in the
458 \l{IANA character-sets encoding file}, the name
459 should be the preferred MIME name for the encoding.
460
461 \row \li aliases()
462 \li Returns a list of alternative names for the encoding.
463 QTextCodec provides a default implementation that returns
464 an empty list. For example, "ISO-8859-1" has "latin1",
465 "CP819", "IBM819", and "iso-ir-100" as aliases.
466
467 \row \li \l{QTextCodec::mibEnum()}{mibEnum()}
         \li Returns the MIB enum for the encoding if it is listed in
         the \l{IANA character-sets encoding file}.
470
471 \row \li convertToUnicode()
472 \li Converts an 8-bit character string to Unicode.
473
474 \row \li convertFromUnicode()
475 \li Converts a Unicode string to an 8-bit character string.
476 \endtable
477
478 \sa QTextStream, QTextDecoder, QTextEncoder, {Text Codecs Example}
479*/
480
481/*!
482 Constructs a QTextCodec, and gives it the highest precedence. The
483 QTextCodec should always be constructed on the heap (i.e. with \c
484 new). Qt takes ownership and will delete it when the application
485 terminates.
486*/
487QTextCodec::QTextCodec()
488{
489 const TextCodecsMutexLocker locker;
490
491 QCoreGlobalData *globalInstance = QCoreGlobalData::instance();
492 if (globalInstance->allCodecs.isEmpty())
493 setup();
494
495 globalInstance->allCodecs.prepend(t: this);
496}
497
498
499/*!
500 \nonreentrant
501
502 Destroys the QTextCodec. Note that you should not delete codecs
503 yourself: once created they become Qt's responsibility.
504*/
505QTextCodec::~QTextCodec()
506{
507 QCoreGlobalData *globalData = QCoreGlobalData::instance();
508 if (!globalData)
509 return;
510
511 globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr);
512
513 const TextCodecsMutexLocker locker;
514
515 globalData->allCodecs.removeOne(t: this);
516
517 auto it = globalData->codecCache.begin();
518
519 while (it != globalData->codecCache.end()) {
520 if (it.value() == this)
521 it = globalData->codecCache.erase(it);
522 else
523 ++it;
524 }
525}
526
527/*!
528 \fn QTextCodec *QTextCodec::codecForName(const char *name)
529
530 Searches all installed QTextCodec objects and returns the one
531 which best matches \a name; the match is case-insensitive. Returns
532 0 if no codec matching the name \a name could be found.
533*/
534
535/*!
536 \threadsafe
537 Searches all installed QTextCodec objects and returns the one
538 which best matches \a name; the match is case-insensitive. Returns
539 0 if no codec matching the name \a name could be found.
540*/
541QTextCodec *QTextCodec::codecForName(const QByteArray &name)
542{
543 if (name.isEmpty())
544 return nullptr;
545
546 const TextCodecsMutexLocker locker;
547
548 QCoreGlobalData *globalData = QCoreGlobalData::instance();
549 if (!globalData)
550 return nullptr;
551 setup();
552
553#if !QT_CONFIG(icu)
554 QTextCodecCache *cache = &globalData->codecCache;
555 QTextCodec *codec;
556 codec = cache->value(name);
557 if (codec)
558 return codec;
559
560 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
561 QTextCodec *cursor = *it;
562 if (qTextCodecNameMatch(cursor->name(), name)) {
563 if (cache)
564 cache->insert(name, cursor);
565 return cursor;
566 }
567 QList<QByteArray> aliases = cursor->aliases();
568 for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
569 if (qTextCodecNameMatch(*ait, name)) {
570 cache->insert(name, cursor);
571 return cursor;
572 }
573 }
574 }
575
576 return nullptr;
577#else
578 return QIcuCodec::codecForNameUnlocked(name);
579#endif
580}
581
582
583/*!
584 \threadsafe
585 Returns the QTextCodec which matches the
586 \l{QTextCodec::mibEnum()}{MIBenum} \a mib.
587*/
588QTextCodec* QTextCodec::codecForMib(int mib)
589{
590 const TextCodecsMutexLocker locker;
591
592 QCoreGlobalData *globalData = QCoreGlobalData::instance();
593 if (!globalData)
594 return nullptr;
595 if (globalData->allCodecs.isEmpty())
596 setup();
597
598 QByteArray key = "MIB: " + QByteArray::number(mib);
599
600 QTextCodecCache *cache = &globalData->codecCache;
601 QTextCodec *codec;
602 if (cache) {
603 codec = cache->value(akey: key);
604 if (codec)
605 return codec;
606 }
607
608 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
609 QTextCodec *cursor = *it;
610 if (cursor->mibEnum() == mib) {
611 if (cache)
612 cache->insert(akey: key, avalue: cursor);
613 return cursor;
614 }
615 }
616
617#if QT_CONFIG(icu)
618 return QIcuCodec::codecForMibUnlocked(mib);
619#else
620 return nullptr;
621#endif
622}
623
624/*!
625 \threadsafe
626 Returns the list of all available codecs, by name. Call
627 QTextCodec::codecForName() to obtain the QTextCodec for the name.
628
629 The list may contain many mentions of the same codec
630 if the codec has aliases.
631
632 \sa availableMibs(), name(), aliases()
633*/
634QList<QByteArray> QTextCodec::availableCodecs()
635{
636 const TextCodecsMutexLocker locker;
637
638 QCoreGlobalData *globalData = QCoreGlobalData::instance();
639 if (globalData->allCodecs.isEmpty())
640 setup();
641
642 QList<QByteArray> codecs;
643
644 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
645 codecs += (*it)->name();
646 codecs += (*it)->aliases();
647 }
648
649#if QT_CONFIG(icu)
650 codecs += QIcuCodec::availableCodecs();
651#endif
652
653 return codecs;
654}
655
656/*!
657 \threadsafe
658 Returns the list of MIBs for all available codecs. Call
659 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
660
661 \sa availableCodecs(), mibEnum()
662*/
663QList<int> QTextCodec::availableMibs()
664{
665#if QT_CONFIG(icu)
666 return QIcuCodec::availableMibs();
667#else
668 const TextCodecsMutexLocker locker;
669
670 QCoreGlobalData *globalData = QCoreGlobalData::instance();
671 if (globalData->allCodecs.isEmpty())
672 setup();
673
674 QList<int> codecs;
675
676 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
677 codecs += (*it)->mibEnum();
678
679 return codecs;
680#endif
681}
682
683/*!
684 \nonreentrant
685
686 Set the codec to \a c; this will be returned by
687 codecForLocale(). If \a c is \nullptr, the codec is reset to
688 the default.
689
690 This might be needed for some applications that want to use their
691 own mechanism for setting the locale.
692
693 \sa codecForLocale()
694*/
695void QTextCodec::setCodecForLocale(QTextCodec *c)
696{
697 QCoreGlobalData::instance()->codecForLocale.storeRelease(newValue: c);
698}
699
700/*!
701 \threadsafe
702 Returns a pointer to the codec most suitable for this locale.
703
704 The codec will be retrieved from ICU where that backend is in use, otherwise
705 it may be obtained from an OS-specific API. In the latter case, the codec's
706 name may be "System".
707*/
708
709QTextCodec* QTextCodec::codecForLocale()
710{
711 QCoreGlobalData *globalData = QCoreGlobalData::instance();
712 if (!globalData)
713 return nullptr;
714
715 QTextCodec *codec = globalData->codecForLocale.loadAcquire();
716 if (!codec) {
717#if QT_CONFIG(icu)
718 const TextCodecsMutexLocker locker;
719 codec = QIcuCodec::defaultCodecUnlocked();
720#else
721 // setupLocaleMapper locks as necessary
722 codec = setupLocaleMapper();
723#endif
724 }
725
726 return codec;
727}
728
729
730/*!
731 \fn QByteArray QTextCodec::name() const
732
733 QTextCodec subclasses must reimplement this function. It returns
734 the name of the encoding supported by the subclass.
735
736 If the codec is registered as a character set in the
737 \l{IANA character-sets encoding file} this method should
738 return the preferred mime name for the codec if defined,
739 otherwise its name.
740*/
741
742/*!
743 \fn int QTextCodec::mibEnum() const
744
745 Subclasses of QTextCodec must reimplement this function. It
746 returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
747 for more information). It is important that each QTextCodec
748 subclass returns the correct unique value for this function.
749*/
750
751/*!
752 Subclasses can return a number of aliases for the codec in question.
753
754 Standard aliases for codecs can be found in the
755 \l{IANA character-sets encoding file}.
756*/
757QList<QByteArray> QTextCodec::aliases() const
758{
759 return QList<QByteArray>();
760}
761
762/*!
763 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
764 ConverterState *state) const
765
766 QTextCodec subclasses must reimplement this function.
767
768 Converts the first \a len characters of \a chars from the
769 encoding of the subclass to Unicode, and returns the result in a
770 QString.
771
772 \a state can be \nullptr, in which case the conversion is stateless and
773 default conversion rules should be used. If state is not 0, the
774 codec should save the state after the conversion in \a state, and
775 adjust the \c remainingChars and \c invalidChars members of the struct.
776*/
777
778/*!
779 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
780 ConverterState *state) const
781
782 QTextCodec subclasses must reimplement this function.
783
784 Converts the first \a number of characters from the \a input array
785 from Unicode to the encoding of the subclass, and returns the result
786 in a QByteArray.
787
788 \a state can be \nullptr in which case the conversion is stateless and
789 default conversion rules should be used. If state is not 0, the
790 codec should save the state after the conversion in \a state, and
791 adjust the \c remainingChars and \c invalidChars members of the struct.
792*/
793
794/*!
795 Creates a QTextDecoder with a specified \a flags to decode chunks
796 of \c{char *} data to create chunks of Unicode data.
797
798 The caller is responsible for deleting the returned object.
799
800 \since 4.7
801*/
802QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
803{
804 return new QTextDecoder(this, flags);
805}
806
807/*!
808 Creates a QTextEncoder with a specified \a flags to encode chunks
809 of Unicode data as \c{char *} data.
810
811 The caller is responsible for deleting the returned object.
812
813 \since 4.7
814*/
815QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
816{
817 return new QTextEncoder(this, flags);
818}
819
820/*!
821 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
822 ConverterState *state) const
823
824 Converts the first \a number of characters from the \a input array
825 from Unicode to the encoding of this codec, and returns the result
826 in a QByteArray.
827
828 The \a state of the convertor used is updated.
829*/
830
831#if QT_STRINGVIEW_LEVEL < 2
832/*!
833 Converts \a str from Unicode to the encoding of this codec, and
834 returns the result in a QByteArray.
835*/
836QByteArray QTextCodec::fromUnicode(const QString& str) const
837{
838 return convertFromUnicode(in: str.constData(), length: str.length(), state: nullptr);
839}
840#endif
841
842/*!
843 \overload
844 \since 5.10
845
846 Converts \a str from Unicode to the encoding of this codec, and
847 returns the result in a QByteArray.
848*/
849QByteArray QTextCodec::fromUnicode(QStringView str) const
850{
851 return convertFromUnicode(in: str.data(), length: str.length(), state: nullptr);
852}
853
854/*!
855 \fn QString QTextCodec::toUnicode(const char *input, int size,
856 ConverterState *state) const
857
858 Converts the first \a size characters from the \a input from the
859 encoding of this codec to Unicode, and returns the result in a
860 QString.
861
862 The \a state of the convertor used is updated.
863*/
864
865/*!
866 Converts \a a from the encoding of this codec to Unicode, and
867 returns the result in a QString.
868*/
869QString QTextCodec::toUnicode(const QByteArray& a) const
870{
871 return convertToUnicode(in: a.constData(), length: a.length(), state: nullptr);
872}
873
874/*!
875 Returns \c true if the Unicode character \a ch can be fully encoded
876 with this codec; otherwise returns \c false.
877*/
878bool QTextCodec::canEncode(QChar ch) const
879{
880 ConverterState state;
881 state.flags = ConvertInvalidToNull;
882 convertFromUnicode(in: &ch, length: 1, state: &state);
883 return (state.invalidChars == 0);
884}
885
886#if QT_STRINGVIEW_LEVEL < 2
887/*!
888 \overload
889
890 \a s contains the string being tested for encode-ability.
891*/
892bool QTextCodec::canEncode(const QString& s) const
893{
894 ConverterState state;
895 state.flags = ConvertInvalidToNull;
896 convertFromUnicode(in: s.constData(), length: s.length(), state: &state);
897 return (state.invalidChars == 0);
898}
899#endif
900
901/*!
902 \overload
903 \since 5.10
904
905 Returns \c true if the Unicode string \a s can be fully encoded
906 with this codec; otherwise returns \c false.
907*/
908bool QTextCodec::canEncode(QStringView s) const
909{
910 ConverterState state;
911 state.flags = ConvertInvalidToNull;
912 convertFromUnicode(in: s.data(), length: s.length(), state: &state);
913 return !state.invalidChars;
914}
915/*!
916 \overload
917
918 \a chars contains the source characters.
919*/
920QString QTextCodec::toUnicode(const char *chars) const
921{
922 int len = qstrlen(str: chars);
923 return convertToUnicode(in: chars, length: len, state: nullptr);
924}
925
926
927/*!
928 \class QTextEncoder
929 \inmodule QtCore
930 \brief The QTextEncoder class provides a state-based encoder.
931 \reentrant
932 \ingroup i18n
933
934 A text encoder converts text from Unicode into an encoded text format
935 using a specific codec.
936
937 The encoder converts Unicode into another format, remembering any
938 state that is required between calls.
939
940 \sa QTextCodec::makeEncoder(), QTextDecoder
941*/
942
943/*!
944 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
945
946 Constructs a text encoder for the given \a codec.
947*/
948
949/*!
950 Constructs a text encoder for the given \a codec and conversion \a flags.
951
952 \since 4.7
953*/
954QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
955 : c(codec), state()
956{
957 state.flags = flags;
958}
959
960/*!
961 Destroys the encoder.
962*/
963QTextEncoder::~QTextEncoder()
964{
965}
966
967/*!
968 \internal
969 \since 4.5
    Determines whether the encoder encountered a failure while encoding the input. If
    an error was encountered, the produced result is undefined, and gets converted according
    to the conversion flags.
973 */
974bool QTextEncoder::hasFailure() const
975{
976 return state.invalidChars != 0;
977}
978
979#if QT_STRINGVIEW_LEVEL < 2
980/*!
981 Converts the Unicode string \a str into an encoded QByteArray.
982*/
983QByteArray QTextEncoder::fromUnicode(const QString& str)
984{
985 QByteArray result = c->fromUnicode(in: str.constData(), length: str.length(), state: &state);
986 return result;
987}
988#endif
989
990/*!
991 \overload
992 \since 5.10
993 Converts the Unicode string \a str into an encoded QByteArray.
994*/
995QByteArray QTextEncoder::fromUnicode(QStringView str)
996{
997 return c->fromUnicode(in: str.data(), length: str.length(), state: &state);
998}
999
1000/*!
1001 \overload
1002
1003 Converts \a len characters (not bytes) from \a uc, and returns the
1004 result in a QByteArray.
1005*/
1006QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
1007{
1008 QByteArray result = c->fromUnicode(in: uc, length: len, state: &state);
1009 return result;
1010}
1011
1012/*!
1013 \class QTextDecoder
1014 \inmodule QtCore
1015 \brief The QTextDecoder class provides a state-based decoder.
1016 \reentrant
1017 \ingroup i18n
1018
1019 A text decoder converts text from an encoded text format into Unicode
1020 using a specific codec.
1021
1022 The decoder converts text in this format into Unicode, remembering any
1023 state that is required between calls.
1024
1025 \sa QTextCodec::makeDecoder(), QTextEncoder
1026*/
1027
1028/*!
1029 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1030
1031 Constructs a text decoder for the given \a codec.
1032*/
1033
1034/*!
1035 Constructs a text decoder for the given \a codec and conversion \a flags.
1036
1037 \since 4.7
1038*/
1039
1040QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1041 : c(codec), state()
1042{
1043 state.flags = flags;
1044}
1045
1046/*!
1047 Destroys the decoder.
1048*/
1049QTextDecoder::~QTextDecoder()
1050{
1051}
1052
1053/*!
1054 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1055
1056 Converts the first \a len bytes in \a chars to Unicode, returning
1057 the result.
1058
1059 If not all characters are used (e.g. if only part of a multi-byte
1060 encoding is at the end of the characters), the decoder remembers
1061 enough state to continue with the next call to this function.
1062*/
1063QString QTextDecoder::toUnicode(const char *chars, int len)
1064{
1065 return c->toUnicode(in: chars, length: len, state: &state);
1066}
1067
1068// in qstring.cpp:
1069void qt_from_latin1(ushort *dst, const char *str, size_t size) noexcept;
1070
1071/*! \overload
1072
1073 The converted string is returned in \a target.
1074 */
1075void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1076{
1077 Q_ASSERT(target);
1078 switch (c->mibEnum()) {
1079 case 106: // utf8
1080 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1081 break;
1082 case 4: // latin1
1083 target->resize(size: len);
1084 qt_from_latin1(dst: (ushort*)target->data(), str: chars, size: len);
1085 break;
1086 default:
1087 *target = c->toUnicode(in: chars, length: len, state: &state);
1088 }
1089}
1090
1091
1092/*!
1093 \overload
1094
1095 Converts the bytes in the byte array specified by \a ba to Unicode
1096 and returns the result.
1097*/
1098QString QTextDecoder::toUnicode(const QByteArray &ba)
1099{
1100 return c->toUnicode(in: ba.constData(), length: ba.length(), state: &state);
1101}
1102
1103/*!
1104 \since 4.4
1105
1106 Tries to detect the encoding of the provided snippet of HTML in
1107 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1108 and the content-type meta header and returns a QTextCodec instance
1109 that is capable of decoding the html to unicode. If the codec
1110 cannot be detected from the content provided, \a defaultCodec is
1111 returned.
1112
1113 \sa codecForUtfText()
1114*/
1115QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1116{
1117 // determine charset
1118 QTextCodec *c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr);
1119 if (!c) {
1120 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
1121 QByteArray header = ba.left(len: 1024).toLower();
1122 int pos = matcher.indexIn(haystack: header);
1123 if (pos != -1) {
1124 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
1125 pos = matcher.indexIn(haystack: header, from: pos);
1126 if (pos != -1) {
1127 pos += qstrlen(str: "charset=");
1128
1129 int pos2 = pos;
1130 // The attribute can be closed with either """, "'", ">" or "/",
1131 // none of which are valid charset characters.
1132 while (++pos2 < header.size()) {
1133 char ch = header.at(i: pos2);
1134 if (ch == '\"' || ch == '\'' || ch == '>') {
1135 QByteArray name = header.mid(index: pos, len: pos2 - pos);
1136 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1137 name = QByteArrayLiteral("UTF-8");
1138 c = QTextCodec::codecForName(name);
1139 return c ? c : defaultCodec;
1140 }
1141 }
1142 }
1143 }
1144 }
1145 if (!c)
1146 c = defaultCodec;
1147
1148 return c;
1149}
1150
1151/*!
1152 \overload
1153
1154 Tries to detect the encoding of the provided snippet of HTML in
1155 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1156 and the content-type meta header and returns a QTextCodec instance
1157 that is capable of decoding the html to unicode. If the codec cannot
1158 be detected, this overload returns a Latin-1 QTextCodec.
1159*/
1160QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1161{
1162 return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1"));
1163}
1164
1165/*!
1166 \since 4.6
1167
1168 Tries to detect the encoding of the provided snippet \a ba by
1169 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1170 that is capable of decoding the text to unicode. This function can
1171 detect one of the following codecs:
1172
1173 \list
1174 \li UTF-32 Little Endian
1175 \li UTF-32 Big Endian
1176 \li UTF-16 Little Endian
1177 \li UTF-16 Big Endian
1178 \li UTF-8
1179 \endlist
1180
1181 If the codec cannot be detected from the content provided, \a defaultCodec
1182 is returned.
1183
1184 \sa codecForHtml()
1185*/
1186QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1187{
1188 const int arraySize = ba.size();
1189 const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
1190 const uint bom = 0xfeff;
1191
1192 if (arraySize > 3) {
1193 uint uc = qFromUnaligned<uint>(src: buf);
1194 if (uc == qToBigEndian(source: bom))
1195 return QTextCodec::codecForMib(mib: 1018); // utf-32 be
1196 else if (uc == qToLittleEndian(source: bom))
1197 return QTextCodec::codecForMib(mib: 1019); // utf-32 le
1198 }
1199
1200 if (arraySize < 2)
1201 return defaultCodec;
1202
1203 ushort uc = qFromUnaligned<ushort>(src: buf);
1204 if (uc == qToBigEndian(source: ushort(bom)))
1205 return QTextCodec::codecForMib(mib: 1013); // utf16 be
1206 else if (uc == qToLittleEndian(source: ushort(bom)))
1207 return QTextCodec::codecForMib(mib: 1014); // utf16 le
1208
1209 if (arraySize < 3)
1210 return defaultCodec;
1211
1212 static const char utf8bom[] = "\xef\xbb\xbf";
1213 if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - 1) == 0)
1214 return QTextCodec::codecForMib(mib: 106); // utf-8
1215
1216 return defaultCodec;
1217}
1218
1219/*!
1220 \overload
1221
1222 Tries to detect the encoding of the provided snippet \a ba by
1223 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1224 that is capable of decoding the text to unicode. This function can
1225 detect one of the following codecs:
1226
1227 \list
1228 \li UTF-32 Little Endian
1229 \li UTF-32 Big Endian
1230 \li UTF-16 Little Endian
1231 \li UTF-16 Big Endian
1232 \li UTF-8
1233 \endlist
1234
1235 If the codec cannot be detected from the content provided, this overload
1236 returns a Latin-1 QTextCodec.
1237
1238 \sa codecForHtml()
1239*/
1240QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1241{
1242 return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/*Latin 1*/ mib: 4));
1243}
1244
1245/*!
1246 \fn QTextCodec * QTextCodec::codecForTr ()
1247 \obsolete
1248
1249 Returns the codec used by QObject::tr() on its argument. If this
1250 function returns \nullptr (the default), tr() assumes Latin-1.
1251*/
1252
1253/*!
1254 \internal
1255 \since 4.3
1256 Determines whether the decoder encountered a failure while decoding the
1257 input. If an error was encountered, the produced result is undefined, and
1258 gets converted as according to the conversion flags.
1259 */
1260bool QTextDecoder::hasFailure() const
1261{
1262 return state.invalidChars != 0;
1263}
1264
1265/*!
1266 \internal
1267 \since 5.12
1268
1269 Determines whether the decoder needs more bytes to continue decoding. That
1270 is, this signifies that the input string ended in the middle of a
1271 multi-byte sequence. Note that it's possible some codecs do not report this.
1272 */
1273bool QTextDecoder::needsMoreData() const
1274{
1275 return state.remainingChars;
1276}
1277
1278QT_END_NAMESPACE
1279

source code of qtbase/src/corelib/codecs/qtextcodec.cpp