1/****************************************************************************
2**
3** Copyright (C) 2018 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#include "qplatformdefs.h"
42
43#include "qtextcodec.h"
44#include "qtextcodec_p.h"
45
46#include "qbytearraymatcher.h"
47#include "qendian.h"
48#include "qfile.h"
49#include "qlist.h"
50#include "qstringlist.h"
51#include "qvarlengtharray.h"
52#if !defined(QT_BOOTSTRAPPED)
53#include <private/qcoreapplication_p.h>
54#endif
55#include "private/qcoreglobaldata_p.h"
56
57#include "qutfcodec_p.h"
58#include "qlatincodec_p.h"
59
60#if !defined(QT_BOOTSTRAPPED)
61#if QT_CONFIG(codecs)
62# include "qtsciicodec_p.h"
63# include "qisciicodec_p.h"
64#endif
65#if QT_CONFIG(icu)
66#include "qicucodec_p.h"
67#else
68#if QT_CONFIG(iconv)
69# include "qiconvcodec_p.h"
70#endif
71#ifdef Q_OS_WIN
72# include "qwindowscodec_p.h"
73#endif
74# include "qsimplecodec_p.h"
75#if QT_CONFIG(big_codecs)
76# ifndef Q_OS_INTEGRITY
77# include "qgb18030codec_p.h"
78# include "qeucjpcodec_p.h"
79# include "qjiscodec_p.h"
80# include "qsjiscodec_p.h"
81# include "qeuckrcodec_p.h"
82# include "qbig5codec_p.h"
83# endif // !Q_OS_INTEGRITY
84#endif // big_codecs
85
86#endif // icu
87#endif // QT_BOOTSTRAPPED
88
89#include "qmutex.h"
90
91#include <stdlib.h>
92#include <ctype.h>
93#include <locale.h>
94#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
95# include <langinfo.h>
96#endif
97
98QT_BEGIN_NAMESPACE
99
100typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
101typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
102
103Q_GLOBAL_STATIC_WITH_ARGS(QMutex, textCodecsMutex, (QMutex::Recursive));
104QMutex *qTextCodecsMutex() { return textCodecsMutex(); }
105
106#if !QT_CONFIG(icu)
// ASCII-only lower-casing: 'A'..'Z' map to 'a'..'z', every other byte is
// returned unchanged.
static char qtolower(char c)
{
    const bool isUpper = (c >= 'A') && (c <= 'Z');
    return isUpper ? char(c + 0x20) : c;
}
// ASCII-only test for a decimal digit or a Latin letter of either case.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;

    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z', so one range check
    // covers both cases.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
111
112bool qTextCodecNameMatch(const char *n, const char *h)
113{
114 if (qstricmp(n, h) == 0)
115 return true;
116
117 // if the letters and numbers are the same, we have a match
118 while (*n != '\0') {
119 if (qisalnum(*n)) {
120 for (;;) {
121 if (*h == '\0')
122 return false;
123 if (qisalnum(*h))
124 break;
125 ++h;
126 }
127 if (qtolower(*n) != qtolower(*h))
128 return false;
129 ++h;
130 }
131 ++n;
132 }
133 while (*h && !qisalnum(*h))
134 ++h;
135 return (*h == '\0');
136}
137
138
139#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
140static QTextCodec *checkForCodec(const QByteArray &name) {
141 QTextCodec *c = QTextCodec::codecForName(name);
142 if (!c) {
143 const int index = name.indexOf('@');
144 if (index != -1) {
145 c = QTextCodec::codecForName(name.left(index));
146 }
147 }
148 return c;
149}
150#endif
151
152static void setup();
153
154// \threadsafe
155// this returns the codec the method sets up as locale codec to
156// avoid a race condition in codecForLocale() when
157// setCodecForLocale(0) is called at the same time.
158static QTextCodec *setupLocaleMapper()
159{
160 QCoreGlobalData *globalData = QCoreGlobalData::instance();
161
162 QTextCodec *locale = nullptr;
163
164 {
165 QMutexLocker locker(textCodecsMutex());
166 if (globalData->allCodecs.isEmpty())
167 setup();
168 }
169
170#if !defined(QT_BOOTSTRAPPED)
171 QCoreApplicationPrivate::initLocale();
172#endif
173
174#if defined(QT_LOCALE_IS_UTF8)
175 locale = QTextCodec::codecForName("UTF-8");
176#elif defined(Q_OS_WIN)
177 locale = QTextCodec::codecForName("System");
178#else
179
180 // First try getting the codecs name from nl_langinfo and see
181 // if we have a builtin codec for it.
182 // Only fall back to using iconv if we can't find a builtin codec
183 // This is because the builtin utf8 codec is around 5 times faster
184 // then the using QIconvCodec
185
186#if defined (_XOPEN_UNIX)
187 char *charset = nl_langinfo(CODESET);
188 if (charset)
189 locale = QTextCodec::codecForName(charset);
190#endif
191#if QT_CONFIG(iconv)
192 if (!locale) {
193 // no builtin codec for the locale found, let's try using iconv
194 (void) new QIconvCodec();
195 locale = QTextCodec::codecForName("System");
196 }
197#endif
198
199 if (!locale) {
200 // Very poorly defined and followed standards causes lots of
201 // code to try to get all the cases... This logic is
202 // duplicated in QIconvCodec, so if you change it here, change
203 // it there too.
204
205 // Try to determine locale codeset from locale name assigned to
206 // LC_CTYPE category.
207
208 // First part is getting that locale name. First try setlocale() which
209 // definitely knows it, but since we cannot fully trust it, get ready
210 // to fall back to environment variables.
211 const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
212
213 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
214 // environment variables.
215 QByteArray lang = qgetenv("LC_ALL");
216 if (lang.isEmpty() || lang == "C") {
217 lang = qgetenv("LC_CTYPE");
218 }
219 if (lang.isEmpty() || lang == "C") {
220 lang = qgetenv("LANG");
221 }
222
223 // Now try these in order:
224 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
225 // 2. CODESET from lang if it contains a .CODESET part
226 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
227 // 4. locale (ditto)
228 // 5. check for "@euro"
229 // 6. guess locale from ctype unless ctype is "C"
230 // 7. guess locale from lang
231
232 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
233 int indexOfDot = ctype.indexOf('.');
234 if (indexOfDot != -1)
235 locale = checkForCodec( ctype.mid(indexOfDot + 1) );
236
237 // 2. CODESET from lang if it contains a .CODESET part
238 if (!locale) {
239 indexOfDot = lang.indexOf('.');
240 if (indexOfDot != -1)
241 locale = checkForCodec( lang.mid(indexOfDot + 1) );
242 }
243
244 // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
245 if (!locale && !ctype.isEmpty() && ctype != "C")
246 locale = checkForCodec(ctype);
247
248 // 4. locale (ditto)
249 if (!locale && !lang.isEmpty())
250 locale = checkForCodec(lang);
251
252 // 5. "@euro"
253 if ((!locale && ctype.contains("@euro")) || lang.contains("@euro"))
254 locale = checkForCodec("ISO 8859-15");
255 }
256
257#endif
258 // If everything failed, we default to 8859-1
259 if (!locale)
260 locale = QTextCodec::codecForName("ISO 8859-1");
261 globalData->codecForLocale.storeRelease(locale);
262 return locale;
263}
264
265
266// textCodecsMutex need to be locked to enter this function
267static void setup()
268{
269 static bool initialized = false;
270 if (initialized)
271 return;
272 initialized = true;
273
274#if QT_CONFIG(codecs) && !defined(QT_BOOTSTRAPPED)
275 (void)new QTsciiCodec;
276 for (int i = 0; i < 9; ++i)
277 (void)new QIsciiCodec(i);
278 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
279 (void)new QSimpleTextCodec(i);
280
281# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
282 (void)new QGb18030Codec;
283 (void)new QGbkCodec;
284 (void)new QGb2312Codec;
285 (void)new QEucJpCodec;
286 (void)new QJisCodec;
287 (void)new QSjisCodec;
288 (void)new QEucKrCodec;
289 (void)new QCP949Codec;
290 (void)new QBig5Codec;
291 (void)new QBig5hkscsCodec;
292# endif // big_codecs && !Q_OS_INTEGRITY
293#if QT_CONFIG(iconv)
294 (void) new QIconvCodec;
295#endif
296#if defined(Q_OS_WIN32)
297 (void) new QWindowsLocalCodec;
298#endif // Q_OS_WIN32
299#endif // codecs && !QT_BOOTSTRAPPED
300
301 (void)new QUtf16Codec;
302 (void)new QUtf16BECodec;
303 (void)new QUtf16LECodec;
304 (void)new QUtf32Codec;
305 (void)new QUtf32BECodec;
306 (void)new QUtf32LECodec;
307 (void)new QLatin15Codec;
308 (void)new QLatin1Codec;
309 (void)new QUtf8Codec;
310}
311#else
// ICU build: no built-in codec objects are created by this file.
static void setup()
{
}
313#endif // icu
314
315/*!
316 \enum QTextCodec::ConversionFlag
317
318 \value DefaultConversion No flag is set.
319 \value ConvertInvalidToNull If this flag is set, each invalid input
320 character is output as a null character.
321 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
322
323 \omitvalue FreeFunction
324*/
325
326/*!
327 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
328
329 Constructs a ConverterState object initialized with the given \a flags.
330*/
331
332/*!
333 Destroys the ConverterState object.
334*/
335QTextCodec::ConverterState::~ConverterState()
336{
337 if (flags & FreeFunction)
338 (QTextCodecUnalignedPointer::decode(state_data))(this);
339 else if (d)
340 free(d);
341}
342
343/*!
344 \class QTextCodec
345 \inmodule QtCore
346 \brief The QTextCodec class provides conversions between text encodings.
347 \reentrant
348 \ingroup i18n
349
350 Qt uses Unicode to store, draw and manipulate strings. In many
351 situations you may wish to deal with data that uses a different
352 encoding. For example, most Japanese documents are still stored
353 in Shift-JIS or ISO 2022-JP, while Russian users often have their
354 documents in KOI8-R or Windows-1251.
355
356 Qt provides a set of QTextCodec classes to help with converting
357 non-Unicode formats to and from Unicode. You can also create your
358 own codec classes.
359
360 The supported encodings are:
361
362 \list
363 \li \l{Big5 Text Codec}{Big5}
364 \li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
365 \li CP949
366 \li \l{EUC-JP Text Codec}{EUC-JP}
367 \li \l{EUC-KR Text Codec}{EUC-KR}
368 \li \l{GBK Text Codec}{GB18030}
369 \li HP-ROMAN8
370 \li IBM 850
371 \li IBM 866
372 \li IBM 874
373 \li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
374 \li ISO 8859-1 to 10
375 \li ISO 8859-13 to 16
376 \li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
377 \li KOI8-R
378 \li KOI8-U
379 \li Macintosh
380 \li \l{Shift-JIS Text Codec}{Shift-JIS}
381 \li TIS-620
382 \li \l{TSCII Text Codec}{TSCII}
383 \li UTF-8
384 \li UTF-16
385 \li UTF-16BE
386 \li UTF-16LE
387 \li UTF-32
388 \li UTF-32BE
389 \li UTF-32LE
390 \li Windows-1250 to 1258
391 \endlist
392
393 If Qt is compiled with ICU support enabled, most codecs supported by
394 ICU will also be available to the application.
395
396 \l {QTextCodec}s can be used as follows to convert some locally encoded
397 string to Unicode. Suppose you have some string encoded in Russian
398 KOI8-R encoding, and want to convert it to Unicode. The simple way
399 to do it is like this:
400
401 \snippet code/src_corelib_codecs_qtextcodec.cpp 0
402
403 After this, \c string holds the text converted to Unicode.
404 Converting a string from Unicode to the local encoding is just as
405 easy:
406
407 \snippet code/src_corelib_codecs_qtextcodec.cpp 1
408
409 To read or write files in various encodings, use QTextStream and
410 its \l{QTextStream::setCodec()}{setCodec()} function. See the
411 \l{tools/codecs}{Codecs} example for an application of QTextCodec
412 to file I/O.
413
414 Some care must be taken when trying to convert the data in chunks,
415 for example, when receiving it over a network. In such cases it is
416 possible that a multi-byte character will be split over two
417 chunks. At best this might result in the loss of a character and
418 at worst cause the entire conversion to fail.
419
420 The approach to use in these situations is to create a QTextDecoder
421 object for the codec and use this QTextDecoder for the whole
422 decoding process, as shown below:
423
424 \snippet code/src_corelib_codecs_qtextcodec.cpp 2
425
426 The QTextDecoder object maintains state between chunks and therefore
427 works correctly even if a multi-byte character is split between
428 chunks.
429
430 \section1 Creating Your Own Codec Class
431
432 Support for new text encodings can be added to Qt by creating
433 QTextCodec subclasses.
434
435 The pure virtual functions describe the encoder to the system and
436 the coder is used as required in the different text file formats
437 supported by QTextStream, and under X11, for the locale-specific
438 character input and output.
439
440 To add support for another encoding to Qt, make a subclass of
441 QTextCodec and implement the functions listed in the table below.
442
443 \table
444 \header \li Function \li Description
445
446 \row \li name()
447 \li Returns the official name for the encoding. If the
448 encoding is listed in the
449 \l{IANA character-sets encoding file}, the name
450 should be the preferred MIME name for the encoding.
451
452 \row \li aliases()
453 \li Returns a list of alternative names for the encoding.
454 QTextCodec provides a default implementation that returns
455 an empty list. For example, "ISO-8859-1" has "latin1",
456 "CP819", "IBM819", and "iso-ir-100" as aliases.
457
458 \row \li \l{QTextCodec::mibEnum()}{mibEnum()}
459 \li Return the MIB enum for the encoding if it is listed in
460 the \l{IANA character-sets encoding file}.
461
462 \row \li convertToUnicode()
463 \li Converts an 8-bit character string to Unicode.
464
465 \row \li convertFromUnicode()
466 \li Converts a Unicode string to an 8-bit character string.
467 \endtable
468
469 \sa QTextStream, QTextDecoder, QTextEncoder, {Text Codecs Example}
470*/
471
472/*!
473 Constructs a QTextCodec, and gives it the highest precedence. The
474 QTextCodec should always be constructed on the heap (i.e. with \c
475 new). Qt takes ownership and will delete it when the application
476 terminates.
477*/
478QTextCodec::QTextCodec()
479{
480 QMutexLocker locker(textCodecsMutex());
481
482 QCoreGlobalData *globalInstance = QCoreGlobalData::instance();
483 if (globalInstance->allCodecs.isEmpty())
484 setup();
485
486 globalInstance->allCodecs.prepend(this);
487}
488
489
490/*!
491 \nonreentrant
492
493 Destroys the QTextCodec. Note that you should not delete codecs
494 yourself: once created they become Qt's responsibility.
495*/
496QTextCodec::~QTextCodec()
497{
498 QCoreGlobalData *globalData = QCoreGlobalData::instance();
499 if (!globalData)
500 return;
501
502 globalData->codecForLocale.testAndSetRelaxed(this, nullptr);
503
504 QMutexLocker locker(textCodecsMutex());
505
506 globalData->allCodecs.removeOne(this);
507
508 auto it = globalData->codecCache.begin();
509
510 while (it != globalData->codecCache.end()) {
511 if (it.value() == this)
512 it = globalData->codecCache.erase(it);
513 else
514 ++it;
515 }
516}
517
518/*!
519 \fn QTextCodec *QTextCodec::codecForName(const char *name)
520
521 Searches all installed QTextCodec objects and returns the one
522 which best matches \a name; the match is case-insensitive. Returns
523 0 if no codec matching the name \a name could be found.
524*/
525
526/*!
527 \threadsafe
528 Searches all installed QTextCodec objects and returns the one
529 which best matches \a name; the match is case-insensitive. Returns
530 0 if no codec matching the name \a name could be found.
531*/
532QTextCodec *QTextCodec::codecForName(const QByteArray &name)
533{
534 if (name.isEmpty())
535 return nullptr;
536
537 QMutexLocker locker(textCodecsMutex());
538
539 QCoreGlobalData *globalData = QCoreGlobalData::instance();
540 if (!globalData)
541 return nullptr;
542 setup();
543
544#if !QT_CONFIG(icu)
545 QTextCodecCache *cache = &globalData->codecCache;
546 QTextCodec *codec;
547 codec = cache->value(name);
548 if (codec)
549 return codec;
550
551 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
552 QTextCodec *cursor = *it;
553 if (qTextCodecNameMatch(cursor->name(), name)) {
554 if (cache)
555 cache->insert(name, cursor);
556 return cursor;
557 }
558 QList<QByteArray> aliases = cursor->aliases();
559 for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
560 if (qTextCodecNameMatch(*ait, name)) {
561 cache->insert(name, cursor);
562 return cursor;
563 }
564 }
565 }
566
567 return nullptr;
568#else
569 return QIcuCodec::codecForNameUnlocked(name);
570#endif
571}
572
573
574/*!
575 \threadsafe
576 Returns the QTextCodec which matches the
577 \l{QTextCodec::mibEnum()}{MIBenum} \a mib.
578*/
579QTextCodec* QTextCodec::codecForMib(int mib)
580{
581 QMutexLocker locker(textCodecsMutex());
582
583 QCoreGlobalData *globalData = QCoreGlobalData::instance();
584 if (!globalData)
585 return nullptr;
586 if (globalData->allCodecs.isEmpty())
587 setup();
588
589 QByteArray key = "MIB: " + QByteArray::number(mib);
590
591 QTextCodecCache *cache = &globalData->codecCache;
592 QTextCodec *codec;
593 if (cache) {
594 codec = cache->value(key);
595 if (codec)
596 return codec;
597 }
598
599 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
600 QTextCodec *cursor = *it;
601 if (cursor->mibEnum() == mib) {
602 if (cache)
603 cache->insert(key, cursor);
604 return cursor;
605 }
606 }
607
608#if QT_CONFIG(icu)
609 return QIcuCodec::codecForMibUnlocked(mib);
610#else
611 return nullptr;
612#endif
613}
614
615/*!
616 \threadsafe
617 Returns the list of all available codecs, by name. Call
618 QTextCodec::codecForName() to obtain the QTextCodec for the name.
619
620 The list may contain many mentions of the same codec
621 if the codec has aliases.
622
623 \sa availableMibs(), name(), aliases()
624*/
625QList<QByteArray> QTextCodec::availableCodecs()
626{
627 QMutexLocker locker(textCodecsMutex());
628
629 QCoreGlobalData *globalData = QCoreGlobalData::instance();
630 if (globalData->allCodecs.isEmpty())
631 setup();
632
633 QList<QByteArray> codecs;
634
635 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
636 codecs += (*it)->name();
637 codecs += (*it)->aliases();
638 }
639
640#if QT_CONFIG(icu)
641 codecs += QIcuCodec::availableCodecs();
642#endif
643
644 return codecs;
645}
646
647/*!
648 \threadsafe
649 Returns the list of MIBs for all available codecs. Call
650 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
651
652 \sa availableCodecs(), mibEnum()
653*/
654QList<int> QTextCodec::availableMibs()
655{
656#if QT_CONFIG(icu)
657 return QIcuCodec::availableMibs();
658#else
659 QMutexLocker locker(textCodecsMutex());
660
661 QCoreGlobalData *globalData = QCoreGlobalData::instance();
662 if (globalData->allCodecs.isEmpty())
663 setup();
664
665 QList<int> codecs;
666
667 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
668 codecs += (*it)->mibEnum();
669
670 return codecs;
671#endif
672}
673
674/*!
675 \nonreentrant
676
677 Set the codec to \a c; this will be returned by
678 codecForLocale(). If \a c is \nullptr, the codec is reset to
679 the default.
680
681 This might be needed for some applications that want to use their
682 own mechanism for setting the locale.
683
684 \sa codecForLocale()
685*/
686void QTextCodec::setCodecForLocale(QTextCodec *c)
687{
688 QCoreGlobalData::instance()->codecForLocale.storeRelease(c);
689}
690
691/*!
692 \threadsafe
693 Returns a pointer to the codec most suitable for this locale.
694
695 The codec will be retrieved from ICU where that backend is in use, otherwise
696 it may be obtained from an OS-specific API. In the latter case, the codec's
697 name may be "System".
698*/
699
700QTextCodec* QTextCodec::codecForLocale()
701{
702 QCoreGlobalData *globalData = QCoreGlobalData::instance();
703 if (!globalData)
704 return nullptr;
705
706 QTextCodec *codec = globalData->codecForLocale.loadAcquire();
707 if (!codec) {
708#if QT_CONFIG(icu)
709 textCodecsMutex()->lock();
710 codec = QIcuCodec::defaultCodecUnlocked();
711 textCodecsMutex()->unlock();
712#else
713 // setupLocaleMapper locks as necessary
714 codec = setupLocaleMapper();
715#endif
716 }
717
718 return codec;
719}
720
721
722/*!
723 \fn QByteArray QTextCodec::name() const
724
725 QTextCodec subclasses must reimplement this function. It returns
726 the name of the encoding supported by the subclass.
727
728 If the codec is registered as a character set in the
729 \l{IANA character-sets encoding file} this method should
730 return the preferred mime name for the codec if defined,
731 otherwise its name.
732*/
733
734/*!
735 \fn int QTextCodec::mibEnum() const
736
737 Subclasses of QTextCodec must reimplement this function. It
738 returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
739 for more information). It is important that each QTextCodec
740 subclass returns the correct unique value for this function.
741*/
742
743/*!
744 Subclasses can return a number of aliases for the codec in question.
745
746 Standard aliases for codecs can be found in the
747 \l{IANA character-sets encoding file}.
748*/
749QList<QByteArray> QTextCodec::aliases() const
750{
751 return QList<QByteArray>();
752}
753
754/*!
755 \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
756 ConverterState *state) const
757
758 QTextCodec subclasses must reimplement this function.
759
760 Converts the first \a len characters of \a chars from the
761 encoding of the subclass to Unicode, and returns the result in a
762 QString.
763
764 \a state can be \nullptr, in which case the conversion is stateless and
765 default conversion rules should be used. If state is not 0, the
766 codec should save the state after the conversion in \a state, and
767 adjust the \c remainingChars and \c invalidChars members of the struct.
768*/
769
770/*!
771 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
772 ConverterState *state) const
773
774 QTextCodec subclasses must reimplement this function.
775
776 Converts the first \a number of characters from the \a input array
777 from Unicode to the encoding of the subclass, and returns the result
778 in a QByteArray.
779
780 \a state can be \nullptr in which case the conversion is stateless and
781 default conversion rules should be used. If state is not 0, the
782 codec should save the state after the conversion in \a state, and
783 adjust the \c remainingChars and \c invalidChars members of the struct.
784*/
785
786/*!
787 Creates a QTextDecoder with a specified \a flags to decode chunks
788 of \c{char *} data to create chunks of Unicode data.
789
790 The caller is responsible for deleting the returned object.
791
792 \since 4.7
793*/
794QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
795{
796 return new QTextDecoder(this, flags);
797}
798
799/*!
800 Creates a QTextEncoder with a specified \a flags to encode chunks
801 of Unicode data as \c{char *} data.
802
803 The caller is responsible for deleting the returned object.
804
805 \since 4.7
806*/
807QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
808{
809 return new QTextEncoder(this, flags);
810}
811
/*!
    \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
                                           ConverterState *state) const

    Converts the first \a number of characters from the \a input array
    from Unicode to the encoding of this codec, and returns the result
    in a QByteArray.

    The \a state of the converter used is updated.
*/
822
823#if QT_STRINGVIEW_LEVEL < 2
824/*!
825 Converts \a str from Unicode to the encoding of this codec, and
826 returns the result in a QByteArray.
827*/
828QByteArray QTextCodec::fromUnicode(const QString& str) const
829{
830 return convertFromUnicode(str.constData(), str.length(), nullptr);
831}
832#endif
833
834/*!
835 \overload
836 \since 5.10
837
838 Converts \a str from Unicode to the encoding of this codec, and
839 returns the result in a QByteArray.
840*/
841QByteArray QTextCodec::fromUnicode(QStringView str) const
842{
843 return convertFromUnicode(str.data(), str.length(), nullptr);
844}
845
/*!
    \fn QString QTextCodec::toUnicode(const char *input, int size,
                                      ConverterState *state) const

    Converts the first \a size characters from the \a input from the
    encoding of this codec to Unicode, and returns the result in a
    QString.

    The \a state of the converter used is updated.
*/
856
857/*!
858 Converts \a a from the encoding of this codec to Unicode, and
859 returns the result in a QString.
860*/
861QString QTextCodec::toUnicode(const QByteArray& a) const
862{
863 return convertToUnicode(a.constData(), a.length(), nullptr);
864}
865
866/*!
867 Returns \c true if the Unicode character \a ch can be fully encoded
868 with this codec; otherwise returns \c false.
869*/
870bool QTextCodec::canEncode(QChar ch) const
871{
872 ConverterState state;
873 state.flags = ConvertInvalidToNull;
874 convertFromUnicode(&ch, 1, &state);
875 return (state.invalidChars == 0);
876}
877
878#if QT_STRINGVIEW_LEVEL < 2
879/*!
880 \overload
881
882 \a s contains the string being tested for encode-ability.
883*/
884bool QTextCodec::canEncode(const QString& s) const
885{
886 ConverterState state;
887 state.flags = ConvertInvalidToNull;
888 convertFromUnicode(s.constData(), s.length(), &state);
889 return (state.invalidChars == 0);
890}
891#endif
892
893/*!
894 \overload
895 \since 5.10
896
897 Returns \c true if the Unicode string \a s can be fully encoded
898 with this codec; otherwise returns \c false.
899*/
900bool QTextCodec::canEncode(QStringView s) const
901{
902 ConverterState state;
903 state.flags = ConvertInvalidToNull;
904 convertFromUnicode(s.data(), s.length(), &state);
905 return !state.invalidChars;
906}
907/*!
908 \overload
909
910 \a chars contains the source characters.
911*/
912QString QTextCodec::toUnicode(const char *chars) const
913{
914 int len = qstrlen(chars);
915 return convertToUnicode(chars, len, nullptr);
916}
917
918
919/*!
920 \class QTextEncoder
921 \inmodule QtCore
922 \brief The QTextEncoder class provides a state-based encoder.
923 \reentrant
924 \ingroup i18n
925
926 A text encoder converts text from Unicode into an encoded text format
927 using a specific codec.
928
929 The encoder converts Unicode into another format, remembering any
930 state that is required between calls.
931
932 \sa QTextCodec::makeEncoder(), QTextDecoder
933*/
934
935/*!
936 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
937
938 Constructs a text encoder for the given \a codec.
939*/
940
941/*!
942 Constructs a text encoder for the given \a codec and conversion \a flags.
943
944 \since 4.7
945*/
946QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
947 : c(codec), state()
948{
949 state.flags = flags;
950}
951
952/*!
953 Destroys the encoder.
954*/
955QTextEncoder::~QTextEncoder()
956{
957}
958
959/*!
960 \internal
961 \since 4.5
962 Determines whether the eecoder encountered a failure while decoding the input. If
963 an error was encountered, the produced result is undefined, and gets converted as according
964 to the conversion flags.
965 */
966bool QTextEncoder::hasFailure() const
967{
968 return state.invalidChars != 0;
969}
970
971#if QT_STRINGVIEW_LEVEL < 2
972/*!
973 Converts the Unicode string \a str into an encoded QByteArray.
974*/
975QByteArray QTextEncoder::fromUnicode(const QString& str)
976{
977 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
978 return result;
979}
980#endif
981
982/*!
983 \overload
984 \since 5.10
985 Converts the Unicode string \a str into an encoded QByteArray.
986*/
987QByteArray QTextEncoder::fromUnicode(QStringView str)
988{
989 return c->fromUnicode(str.data(), str.length(), &state);
990}
991
992/*!
993 \overload
994
995 Converts \a len characters (not bytes) from \a uc, and returns the
996 result in a QByteArray.
997*/
998QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
999{
1000 QByteArray result = c->fromUnicode(uc, len, &state);
1001 return result;
1002}
1003
1004/*!
1005 \class QTextDecoder
1006 \inmodule QtCore
1007 \brief The QTextDecoder class provides a state-based decoder.
1008 \reentrant
1009 \ingroup i18n
1010
1011 A text decoder converts text from an encoded text format into Unicode
1012 using a specific codec.
1013
1014 The decoder converts text in this format into Unicode, remembering any
1015 state that is required between calls.
1016
1017 \sa QTextCodec::makeDecoder(), QTextEncoder
1018*/
1019
1020/*!
1021 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
1022
1023 Constructs a text decoder for the given \a codec.
1024*/
1025
1026/*!
1027 Constructs a text decoder for the given \a codec and conversion \a flags.
1028
1029 \since 4.7
1030*/
1031
1032QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1033 : c(codec), state()
1034{
1035 state.flags = flags;
1036}
1037
1038/*!
1039 Destroys the decoder.
1040*/
1041QTextDecoder::~QTextDecoder()
1042{
1043}
1044
1045/*!
1046 \fn QString QTextDecoder::toUnicode(const char *chars, int len)
1047
1048 Converts the first \a len bytes in \a chars to Unicode, returning
1049 the result.
1050
1051 If not all characters are used (e.g. if only part of a multi-byte
1052 encoding is at the end of the characters), the decoder remembers
1053 enough state to continue with the next call to this function.
1054*/
1055QString QTextDecoder::toUnicode(const char *chars, int len)
1056{
1057 return c->toUnicode(chars, len, &state);
1058}
1059
1060// in qstring.cpp:
1061void qt_from_latin1(ushort *dst, const char *str, size_t size) noexcept;
1062
1063/*! \overload
1064
1065 The converted string is returned in \a target.
1066 */
1067void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
1068{
1069 Q_ASSERT(target);
1070 switch (c->mibEnum()) {
1071 case 106: // utf8
1072 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1073 break;
1074 case 4: // latin1
1075 target->resize(len);
1076 qt_from_latin1((ushort*)target->data(), chars, len);
1077 break;
1078 default:
1079 *target = c->toUnicode(chars, len, &state);
1080 }
1081}
1082
1083
1084/*!
1085 \overload
1086
1087 Converts the bytes in the byte array specified by \a ba to Unicode
1088 and returns the result.
1089*/
1090QString QTextDecoder::toUnicode(const QByteArray &ba)
1091{
1092 return c->toUnicode(ba.constData(), ba.length(), &state);
1093}
1094
1095/*!
1096 \since 4.4
1097
1098 Tries to detect the encoding of the provided snippet of HTML in
1099 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1100 and the content-type meta header and returns a QTextCodec instance
1101 that is capable of decoding the html to unicode. If the codec
1102 cannot be detected from the content provided, \a defaultCodec is
1103 returned.
1104
1105 \sa codecForUtfText()
1106*/
1107QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
1108{
1109 // determine charset
1110 QTextCodec *c = QTextCodec::codecForUtfText(ba, nullptr);
1111 if (!c) {
1112 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher("meta ");
1113 QByteArray header = ba.left(1024).toLower();
1114 int pos = matcher.indexIn(header);
1115 if (pos != -1) {
1116 static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher("charset=");
1117 pos = matcher.indexIn(header, pos);
1118 if (pos != -1) {
1119 pos += qstrlen("charset=");
1120
1121 int pos2 = pos;
1122 // The attribute can be closed with either """, "'", ">" or "/",
1123 // none of which are valid charset characters.
1124 while (++pos2 < header.size()) {
1125 char ch = header.at(pos2);
1126 if (ch == '\"' || ch == '\'' || ch == '>') {
1127 QByteArray name = header.mid(pos, pos2 - pos);
1128 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1129 name = QByteArrayLiteral("UTF-8");
1130 c = QTextCodec::codecForName(name);
1131 return c ? c : defaultCodec;
1132 }
1133 }
1134 }
1135 }
1136 }
1137 if (!c)
1138 c = defaultCodec;
1139
1140 return c;
1141}
1142
1143/*!
1144 \overload
1145
1146 Tries to detect the encoding of the provided snippet of HTML in
1147 the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1148 and the content-type meta header and returns a QTextCodec instance
1149 that is capable of decoding the html to unicode. If the codec cannot
1150 be detected, this overload returns a Latin-1 QTextCodec.
1151*/
1152QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
1153{
1154 return codecForHtml(ba, QTextCodec::codecForName("ISO-8859-1"));
1155}
1156
1157/*!
1158 \since 4.6
1159
1160 Tries to detect the encoding of the provided snippet \a ba by
1161 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1162 that is capable of decoding the text to unicode. This function can
1163 detect one of the following codecs:
1164
1165 \list
1166 \li UTF-32 Little Endian
1167 \li UTF-32 Big Endian
1168 \li UTF-16 Little Endian
1169 \li UTF-16 Big Endian
1170 \li UTF-8
1171 \endlist
1172
1173 If the codec cannot be detected from the content provided, \a defaultCodec
1174 is returned.
1175
1176 \sa codecForHtml()
1177*/
1178QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
1179{
1180 const int arraySize = ba.size();
1181 const uchar *buf = reinterpret_cast<const uchar *>(ba.constData());
1182 const uint bom = 0xfeff;
1183
1184 if (arraySize > 3) {
1185 uint uc = qFromUnaligned<uint>(buf);
1186 if (uc == qToBigEndian(bom))
1187 return QTextCodec::codecForMib(1018); // utf-32 be
1188 else if (uc == qToLittleEndian(bom))
1189 return QTextCodec::codecForMib(1019); // utf-32 le
1190 }
1191
1192 if (arraySize < 2)
1193 return defaultCodec;
1194
1195 ushort uc = qFromUnaligned<ushort>(buf);
1196 if (uc == qToBigEndian(ushort(bom)))
1197 return QTextCodec::codecForMib(1013); // utf16 be
1198 else if (uc == qToLittleEndian(ushort(bom)))
1199 return QTextCodec::codecForMib(1014); // utf16 le
1200
1201 if (arraySize < 3)
1202 return defaultCodec;
1203
1204 static const char utf8bom[] = "\xef\xbb\xbf";
1205 if (memcmp(buf, utf8bom, sizeof(utf8bom) - 1) == 0)
1206 return QTextCodec::codecForMib(106); // utf-8
1207
1208 return defaultCodec;
1209}
1210
1211/*!
1212 \overload
1213
1214 Tries to detect the encoding of the provided snippet \a ba by
1215 using the BOM (Byte Order Mark) and returns a QTextCodec instance
1216 that is capable of decoding the text to unicode. This function can
1217 detect one of the following codecs:
1218
1219 \list
1220 \li UTF-32 Little Endian
1221 \li UTF-32 Big Endian
1222 \li UTF-16 Little Endian
1223 \li UTF-16 Big Endian
1224 \li UTF-8
1225 \endlist
1226
1227 If the codec cannot be detected from the content provided, this overload
1228 returns a Latin-1 QTextCodec.
1229
1230 \sa codecForHtml()
1231*/
1232QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
1233{
1234 return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
1235}
1236
1237/*!
1238 \fn QTextCodec * QTextCodec::codecForTr ()
1239 \obsolete
1240
1241 Returns the codec used by QObject::tr() on its argument. If this
1242 function returns \nullptr (the default), tr() assumes Latin-1.
1243*/
1244
1245/*!
1246 \internal
1247 \since 4.3
1248 Determines whether the decoder encountered a failure while decoding the
1249 input. If an error was encountered, the produced result is undefined, and
1250 gets converted as according to the conversion flags.
1251 */
1252bool QTextDecoder::hasFailure() const
1253{
1254 return state.invalidChars != 0;
1255}
1256
1257/*!
1258 \internal
1259 \since 5.12
1260
1261 Determines whether the decoder needs more bytes to continue decoding. That
1262 is, this signifies that the input string ended in the middle of a
1263 multi-byte sequence. Note that it's possible some codecs do not report this.
1264 */
1265bool QTextDecoder::needsMoreData() const
1266{
1267 return state.remainingChars;
1268}
1269
1270QT_END_NAMESPACE
1271