qtextcodec.cpp source code [qtbase/src/corelib/codecs/qtextcodec.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2018 The Qt Company Ltd.
4	** Copyright (C) 2018 Intel Corporation.
5	** Contact: https://www.qt.io/licensing/
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial License Usage
11	** Licensees holding valid commercial Qt licenses may use this file in
12	** accordance with the commercial license agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and The Qt Company. For licensing terms
15	** and conditions see https://www.qt.io/terms-conditions. For further
16	** information use the contact form at https://www.qt.io/contact-us.
17	**
18	** GNU Lesser General Public License Usage
19	** Alternatively, this file may be used under the terms of the GNU Lesser
20	** General Public License version 3 as published by the Free Software
21	** Foundation and appearing in the file LICENSE.LGPL3 included in the
22	** packaging of this file. Please review the following information to
23	** ensure the GNU Lesser General Public License version 3 requirements
24	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25	**
26	** GNU General Public License Usage
27	** Alternatively, this file may be used under the terms of the GNU
28	** General Public License version 2.0 or (at your option) the GNU General
29	** Public license version 3 or any later version approved by the KDE Free
30	** Qt Foundation. The licenses are as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32	** included in the packaging of this file. Please review the following
33	** information to ensure the GNU General Public License requirements will
34	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35	** https://www.gnu.org/licenses/gpl-3.0.html.
36	**
37	** $QT_END_LICENSE$
38	**
39	****************************************************************************/
40
41	#include "qplatformdefs.h"
42
43	#include "qtextcodec.h"
44	#include "qtextcodec_p.h"
45
46	#include "qbytearraymatcher.h"
47	#include "qendian.h"
48	#include "qfile.h"
49	#include "qlist.h"
50	#include <private/qlocking_p.h>
51	#include "qstringlist.h"
52	#include "qvarlengtharray.h"
53	#if !defined(QT_BOOTSTRAPPED)
54	#include <private/qcoreapplication_p.h>
55	#endif
56	#include "private/qcoreglobaldata_p.h"
57
58	#include "qutfcodec_p.h"
59	#include "qlatincodec_p.h"
60
61	#if !defined(QT_BOOTSTRAPPED)
62	#if QT_CONFIG(codecs)
63	# include "qtsciicodec_p.h"
64	# include "qisciicodec_p.h"
65	#endif
66	#if QT_CONFIG(icu)
67	#include "qicucodec_p.h"
68	#else
69	#if QT_CONFIG(iconv)
70	# include "qiconvcodec_p.h"
71	#endif
72	#ifdef Q_OS_WIN
73	# include "qwindowscodec_p.h"
74	#endif
75	# include "qsimplecodec_p.h"
76	#if QT_CONFIG(big_codecs)
77	# ifndef Q_OS_INTEGRITY
78	# include "qgb18030codec_p.h"
79	# include "qeucjpcodec_p.h"
80	# include "qjiscodec_p.h"
81	# include "qsjiscodec_p.h"
82	# include "qeuckrcodec_p.h"
83	# include "qbig5codec_p.h"
84	# endif // !Q_OS_INTEGRITY
85	#endif // big_codecs
86
87	#endif // icu
88	#endif // QT_BOOTSTRAPPED
89
90	#include <mutex>
91
92	#include <stdlib.h>
93	#include <ctype.h>
94	#include <locale.h>
95	#if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_ANDROID)
96	# include <langinfo.h>
97	#endif
98
99	QT_BEGIN_NAMESPACE
100
101	typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
102	typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
103
104	Q_GLOBAL_STATIC(QRecursiveMutex, textCodecsMutex);
105
106	class TextCodecsMutexLocker
107	{
108	using Lock = decltype(qt_unique_lock(mutex&: std::declval<QRecursiveMutex&>()));
109	// ### FIXME: this is used when textCodecsMutex already == nullptr
110	const Lock lock = qt_unique_lock(mutex: textCodecsMutex ());
111	public:
112	TextCodecsMutexLocker() {} // required d/t an ICC 19 bug
113	};
114
115	#if !QT_CONFIG(icu)
// Locale-independent ASCII lower-casing: maps 'A'..'Z' to 'a'..'z' and
// returns every other character unchanged.
static char qtolower(char c)
{
    if (c >= 'A' && c <= 'Z')
        return static_cast<char>(c + 0x20);
    return c;
}
// Locale-independent test for an ASCII digit or ASCII letter.
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z', so one range check
    // covers both cases.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
120
121	bool qTextCodecNameMatch(const char n, const* char *h)
122	{
123	if (qstricmp(n, h) == `0`)
124	return true;
125
126	// if the letters and numbers are the same, we have a match
127	while (*n != `'\0'`) {
128	if (qisalnum(*n)) {
129	for (;;) {
130	if (*h == `'\0'`)
131	return false;
132	if (qisalnum(*h))
133	break;
134	++h;
135	}
136	if (qtolower(n) != qtolower(h))
137	return false;
138	++h;
139	}
140	++n;
141	}
142	while (h && !qisalnum(h))
143	++h;
144	return (*h == `'\0'`);
145	}
146
147
148	#if !defined(Q_OS_WIN32) && !defined(QT_LOCALE_IS_UTF8)
149	static QTextCodec checkForCodec(const* QByteArray &name) {
150	QTextCodec *c = QTextCodec::codecForName(name);
151	if (!c) {
152	const int index = name.indexOf(`'@'`);
153	if (index != -`1`) {
154	c = QTextCodec::codecForName(name.left(index));
155	}
156	}
157	return c;
158	}
159	#endif
160
161	static void setup();
162
163	// \threadsafe
164	// this returns the codec the method sets up as locale codec to
165	// avoid a race condition in codecForLocale() when
166	// setCodecForLocale(0) is called at the same time.
167	static QTextCodec *setupLocaleMapper()
168	{
169	QCoreGlobalData *globalData = QCoreGlobalData::instance();
170
171	QTextCodec locale = nullptr*;
172
173	{
174	const TextCodecsMutexLocker locker;
175	if (globalData->allCodecs.isEmpty())
176	setup();
177	}
178
179	#if !defined(QT_BOOTSTRAPPED)
180	QCoreApplicationPrivate::initLocale();
181	#endif
182
183	#if defined(QT_LOCALE_IS_UTF8)
184	locale = QTextCodec::codecForName("UTF-8");
185	#elif defined(Q_OS_WIN)
186	locale = QTextCodec::codecForName("System");
187	#else
188
189	// First try getting the codecs name from nl_langinfo and see
190	// if we have a builtin codec for it.
191	// Only fall back to using iconv if we can't find a builtin codec
192	// This is because the builtin utf8 codec is around 5 times faster
193	// then the using QIconvCodec
194
195	#if defined (_XOPEN_UNIX)
196	char *charset = nl_langinfo(CODESET);
197	if (charset)
198	locale = QTextCodec::codecForName(charset);
199	#endif
200	#if QT_CONFIG(iconv)
201	if (!locale) {
202	// no builtin codec for the locale found, let's try using iconv
203	(void) new QIconvCodec();
204	locale = QTextCodec::codecForName("System");
205	}
206	#endif
207
208	if (!locale) {
209	// Very poorly defined and followed standards causes lots of
210	// code to try to get all the cases... This logic is
211	// duplicated in QIconvCodec, so if you change it here, change
212	// it there too.
213
214	// Try to determine locale codeset from locale name assigned to
215	// LC_CTYPE category.
216
217	// First part is getting that locale name. First try setlocale() which
218	// definitely knows it, but since we cannot fully trust it, get ready
219	// to fall back to environment variables.
220	const QByteArray ctype = setlocale(LC_CTYPE, nullptr);
221
222	// Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
223	// environment variables.
224	QByteArray lang = qgetenv("LC_ALL");
225	if (lang.isEmpty() \|\| lang == "C") {
226	lang = qgetenv("LC_CTYPE");
227	}
228	if (lang.isEmpty() \|\| lang == "C") {
229	lang = qgetenv("LANG");
230	}
231
232	// Now try these in order:
233	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
234	// 2. CODESET from lang if it contains a .CODESET part
235	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
236	// 4. locale (ditto)
237	// 5. check for "@euro"
238	// 6. guess locale from ctype unless ctype is "C"
239	// 7. guess locale from lang
240
241	// 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
242	int indexOfDot = ctype.indexOf(`'.'`);
243	if (indexOfDot != -`1`)
244	locale = checkForCodec( ctype.mid(indexOfDot + `1`) );
245
246	// 2. CODESET from lang if it contains a .CODESET part
247	if (!locale) {
248	indexOfDot = lang.indexOf(`'.'`);
249	if (indexOfDot != -`1`)
250	locale = checkForCodec( lang.mid(indexOfDot + `1`) );
251	}
252
253	// 3. ctype (maybe the locale is named "ISO-8859-1" or something)
254	if (!locale && !ctype.isEmpty() && ctype != "C")
255	locale = checkForCodec(ctype);
256
257	// 4. locale (ditto)
258	if (!locale && !lang.isEmpty())
259	locale = checkForCodec(lang);
260
261	// 5. "@euro"
262	if ((!locale && ctype.contains("@euro")) \|\| lang.contains("@euro"))
263	locale = checkForCodec("ISO 8859-15");
264	}
265
266	#endif
267	// If everything failed, we default to 8859-1
268	if (!locale)
269	locale = QTextCodec::codecForName("ISO 8859-1");
270	globalData->codecForLocale.storeRelease(locale);
271	return locale;
272	}
273
274
275	// textCodecsMutex need to be locked to enter this function
276	static void setup()
277	{
278	static bool initialized = false;
279	if (initialized)
280	return;
281	initialized = true;
282
283	#if QT_CONFIG(codecs) && !defined(QT_BOOTSTRAPPED)
284	(void)new QTsciiCodec;
285	for (int i = `0`; i < `9`; ++i)
286	(void)new QIsciiCodec(i);
287	for (int i = `0`; i < QSimpleTextCodec::numSimpleCodecs; ++i)
288	(void)new QSimpleTextCodec(i);
289
290	# if QT_CONFIG(big_codecs) && !defined(Q_OS_INTEGRITY)
291	(void)new QGb18030Codec;
292	(void)new QGbkCodec;
293	(void)new QGb2312Codec;
294	(void)new QEucJpCodec;
295	(void)new QJisCodec;
296	(void)new QSjisCodec;
297	(void)new QEucKrCodec;
298	(void)new QCP949Codec;
299	(void)new QBig5Codec;
300	(void)new QBig5hkscsCodec;
301	# endif // big_codecs && !Q_OS_INTEGRITY
302	#if QT_CONFIG(iconv)
303	(void) new QIconvCodec;
304	#endif
305	#if defined(Q_OS_WIN32)
306	(void) new QWindowsLocalCodec;
307	#endif // Q_OS_WIN32
308	#endif // codecs && !QT_BOOTSTRAPPED
309
310	(void)new QUtf16Codec;
311	(void)new QUtf16BECodec;
312	(void)new QUtf16LECodec;
313	(void)new QUtf32Codec;
314	(void)new QUtf32BECodec;
315	(void)new QUtf32LECodec;
316	(void)new QLatin15Codec;
317	(void)new QLatin1Codec;
318	(void)new QUtf8Codec;
319	}
320	#else
// ICU build: this variant registers no built-in codecs.
static void setup()
{
}
322	#endif // icu
323
/*!
325	\enum QTextCodec::ConversionFlag
326
327	\value DefaultConversion No flag is set.
328	\value ConvertInvalidToNull If this flag is set, each invalid input
329	character is output as a null character.
330	\value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any.
331
332	\omitvalue FreeFunction
333	*/
334
/*!
336	\fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
337
338	Constructs a ConverterState object initialized with the given \a flags.
339	*/
340
341	/!*
342	Destroys the ConverterState object.
343	*/
344	QTextCodec::ConverterState::~ConverterState()
345	{
346	if (flags & FreeFunction)
347	(QTextCodecUnalignedPointer::decode(src: state_data))(this);
348	else if (d)
349	free(ptr: d);
350	}
351
/*!
353	\class QTextCodec
354	\inmodule QtCore
355	\brief The QTextCodec class provides conversions between text encodings.
356	\reentrant
357	\ingroup i18n
358
359	Qt uses Unicode to store, draw and manipulate strings. In many
360	situations you may wish to deal with data that uses a different
361	encoding. For example, most Japanese documents are still stored
362	in Shift-JIS or ISO 2022-JP, while Russian users often have their
363	documents in KOI8-R or Windows-1251.
364
365	Qt provides a set of QTextCodec classes to help with converting
366	non-Unicode formats to and from Unicode. You can also create your
367	own codec classes.
368
369	The supported encodings are:
370
371	\list
372	\li \l{Big5 Text Codec}{Big5}
373	\li \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
374	\li CP949
375	\li \l{EUC-JP Text Codec}{EUC-JP}
376	\li \l{EUC-KR Text Codec}{EUC-KR}
377	\li \l{GBK Text Codec}{GB18030}
378	\li HP-ROMAN8
379	\li IBM 850
380	\li IBM 866
381	\li IBM 874
382	\li \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
383	\li ISO 8859-1 to 10
384	\li ISO 8859-13 to 16
385	\li Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
386	\li KOI8-R
387	\li KOI8-U
388	\li Macintosh
389	\li \l{Shift-JIS Text Codec}{Shift-JIS}
390	\li TIS-620
391	\li \l{TSCII Text Codec}{TSCII}
392	\li UTF-8
393	\li UTF-16
394	\li UTF-16BE
395	\li UTF-16LE
396	\li UTF-32
397	\li UTF-32BE
398	\li UTF-32LE
399	\li Windows-1250 to 1258
400	\endlist
401
402	If Qt is compiled with ICU support enabled, most codecs supported by
403	ICU will also be available to the application.
404
405	\l {QTextCodec}s can be used as follows to convert some locally encoded
406	string to Unicode. Suppose you have some string encoded in Russian
407	KOI8-R encoding, and want to convert it to Unicode. The simple way
408	to do it is like this:
409
410	\snippet code/src_corelib_codecs_qtextcodec.cpp 0
411
412	After this, \c string holds the text converted to Unicode.
413	Converting a string from Unicode to the local encoding is just as
414	easy:
415
416	\snippet code/src_corelib_codecs_qtextcodec.cpp 1
417
418	To read or write files in various encodings, use QTextStream and
419	its \l{QTextStream::setCodec()}{setCodec()} function. See the
420	\l{tools/codecs}{Codecs} example for an application of QTextCodec
421	to file I/O.
422
423	Some care must be taken when trying to convert the data in chunks,
424	for example, when receiving it over a network. In such cases it is
425	possible that a multi-byte character will be split over two
426	chunks. At best this might result in the loss of a character and
427	at worst cause the entire conversion to fail.
428
429	The approach to use in these situations is to create a QTextDecoder
430	object for the codec and use this QTextDecoder for the whole
431	decoding process, as shown below:
432
433	\snippet code/src_corelib_codecs_qtextcodec.cpp 2
434
435	The QTextDecoder object maintains state between chunks and therefore
436	works correctly even if a multi-byte character is split between
437	chunks.
438
439	\section1 Creating Your Own Codec Class
440
441	Support for new text encodings can be added to Qt by creating
442	QTextCodec subclasses.
443
444	The pure virtual functions describe the encoder to the system and
445	the coder is used as required in the different text file formats
446	supported by QTextStream, and under X11, for the locale-specific
447	character input and output.
448
449	To add support for another encoding to Qt, make a subclass of
450	QTextCodec and implement the functions listed in the table below.
451
452	\table
453	\header \li Function \li Description
454
455	\row \li name()
456	\li Returns the official name for the encoding. If the
457	encoding is listed in the
458	\l{IANA character-sets encoding file}, the name
459	should be the preferred MIME name for the encoding.
460
461	\row \li aliases()
462	\li Returns a list of alternative names for the encoding.
463	QTextCodec provides a default implementation that returns
464	an empty list. For example, "ISO-8859-1" has "latin1",
465	"CP819", "IBM819", and "iso-ir-100" as aliases.
466
467	\row \li \l{QTextCodec::mibEnum()}{mibEnum()}
468	\li Return the MIB enum for the encoding if it is listed in
469	the \l{IANA character-sets encoding file}.
470
471	\row \li convertToUnicode()
472	\li Converts an 8-bit character string to Unicode.
473
474	\row \li convertFromUnicode()
475	\li Converts a Unicode string to an 8-bit character string.
476	\endtable
477
478	\sa QTextStream, QTextDecoder, QTextEncoder, {Text Codecs Example}
479	*/
480
481	/!*
482	Constructs a QTextCodec, and gives it the highest precedence. The
483	QTextCodec should always be constructed on the heap (i.e. with \c
484	new). Qt takes ownership and will delete it when the application
485	terminates.
486	*/
487	QTextCodec::QTextCodec()
488	{
489	const TextCodecsMutexLocker locker;
490
491	QCoreGlobalData *globalInstance = QCoreGlobalData::instance();
492	if (globalInstance->allCodecs.isEmpty())
493	setup();
494
495	globalInstance->allCodecs.prepend(t: this);
496	}
497
498
499	/!*
500	\nonreentrant
501
502	Destroys the QTextCodec. Note that you should not delete codecs
503	yourself: once created they become Qt's responsibility.
504	*/
505	QTextCodec::~QTextCodec()
506	{
507	QCoreGlobalData *globalData = QCoreGlobalData::instance();
508	if (!globalData)
509	return;
510
511	globalData->codecForLocale.testAndSetRelaxed(expectedValue: this, newValue: nullptr);
512
513	const TextCodecsMutexLocker locker;
514
515	globalData->allCodecs.removeOne(t: this);
516
517	auto it = globalData->codecCache.begin();
518
519	while (it != globalData->codecCache.end()) {
520	if (it.value() == this)
521	it = globalData->codecCache.erase(it);
522	else
523	++it;
524	}
525	}
526
/*!
    \fn QTextCodec *QTextCodec::codecForName(const char *name)

    Searches all installed QTextCodec objects and returns the one
    which best matches \a name; the match is case-insensitive. Returns
    \nullptr if no codec matching the name \a name could be found.
*/
534
535	/!*
536	\threadsafe
537	Searches all installed QTextCodec objects and returns the one
538	which best matches \a name; the match is case-insensitive. Returns
539	0 if no codec matching the name \a name could be found.
540	*/
541	QTextCodec QTextCodec::codecForName(const* QByteArray &name)
542	{
543	if (name.isEmpty())
544	return nullptr;
545
546	const TextCodecsMutexLocker locker;
547
548	QCoreGlobalData *globalData = QCoreGlobalData::instance();
549	if (!globalData)
550	return nullptr;
551	setup();
552
553	#if !QT_CONFIG(icu)
554	QTextCodecCache *cache = &globalData->codecCache;
555	QTextCodec *codec;
556	codec = cache->value(name);
557	if (codec)
558	return codec;
559
560	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
561	QTextCodec cursor = it;
562	if (qTextCodecNameMatch(cursor->name(), name)) {
563	if (cache)
564	cache->insert(name, cursor);
565	return cursor;
566	}
567	QList<QByteArray> aliases = cursor->aliases();
568	for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
569	if (qTextCodecNameMatch(*ait, name)) {
570	cache->insert(name, cursor);
571	return cursor;
572	}
573	}
574	}
575
576	return nullptr;
577	#else
578	return QIcuCodec::codecForNameUnlocked(name);
579	#endif
580	}
581
582
583	/!*
584	\threadsafe
585	Returns the QTextCodec which matches the
586	\l{QTextCodec::mibEnum()}{MIBenum} \a mib.
587	*/
588	QTextCodec* QTextCodec::codecForMib(int mib)
589	{
590	const TextCodecsMutexLocker locker;
591
592	QCoreGlobalData *globalData = QCoreGlobalData::instance();
593	if (!globalData)
594	return nullptr;
595	if (globalData->allCodecs.isEmpty())
596	setup();
597
598	QByteArray key = "MIB: " + QByteArray::number(mib);
599
600	QTextCodecCache *cache = &globalData->codecCache;
601	QTextCodec *codec;
602	if (cache) {
603	codec = cache->value(akey: key);
604	if (codec)
605	return codec;
606	}
607
608	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
609	QTextCodec cursor = it;
610	if (cursor->mibEnum() == mib) {
611	if (cache)
612	cache->insert(akey: key, avalue: cursor);
613	return cursor;
614	}
615	}
616
617	#if QT_CONFIG(icu)
618	return QIcuCodec::codecForMibUnlocked(mib);
619	#else
620	return nullptr;
621	#endif
622	}
623
624	/!*
625	\threadsafe
626	Returns the list of all available codecs, by name. Call
627	QTextCodec::codecForName() to obtain the QTextCodec for the name.
628
629	The list may contain many mentions of the same codec
630	if the codec has aliases.
631
632	\sa availableMibs(), name(), aliases()
633	*/
634	QList<QByteArray> QTextCodec::availableCodecs()
635	{
636	const TextCodecsMutexLocker locker;
637
638	QCoreGlobalData *globalData = QCoreGlobalData::instance();
639	if (globalData->allCodecs.isEmpty())
640	setup();
641
642	QList<QByteArray> codecs;
643
644	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
645	codecs += (*it)->name();
646	codecs += (*it)->aliases();
647	}
648
649	#if QT_CONFIG(icu)
650	codecs += QIcuCodec::availableCodecs();
651	#endif
652
653	return codecs;
654	}
655
656	/!*
657	\threadsafe
658	Returns the list of MIBs for all available codecs. Call
659	QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
660
661	\sa availableCodecs(), mibEnum()
662	*/
663	QList<int> QTextCodec::availableMibs()
664	{
665	#if QT_CONFIG(icu)
666	return QIcuCodec::availableMibs();
667	#else
668	const TextCodecsMutexLocker locker;
669
670	QCoreGlobalData *globalData = QCoreGlobalData::instance();
671	if (globalData->allCodecs.isEmpty())
672	setup();
673
674	QList<int> codecs;
675
676	for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it)
677	codecs += (*it)->mibEnum();
678
679	return codecs;
680	#endif
681	}
682
683	/!*
684	\nonreentrant
685
686	Set the codec to \a c; this will be returned by
687	codecForLocale(). If \a c is \nullptr, the codec is reset to
688	the default.
689
690	This might be needed for some applications that want to use their
691	own mechanism for setting the locale.
692
693	\sa codecForLocale()
694	*/
695	void QTextCodec::setCodecForLocale(QTextCodec *c)
696	{
697	QCoreGlobalData::instance()->codecForLocale.storeRelease(newValue: c);
698	}
699
700	/!*
701	\threadsafe
702	Returns a pointer to the codec most suitable for this locale.
703
704	The codec will be retrieved from ICU where that backend is in use, otherwise
705	it may be obtained from an OS-specific API. In the latter case, the codec's
706	name may be "System".
707	*/
708
709	QTextCodec* QTextCodec::codecForLocale()
710	{
711	QCoreGlobalData *globalData = QCoreGlobalData::instance();
712	if (!globalData)
713	return nullptr;
714
715	QTextCodec *codec = globalData->codecForLocale.loadAcquire();
716	if (!codec) {
717	#if QT_CONFIG(icu)
718	const TextCodecsMutexLocker locker;
719	codec = QIcuCodec::defaultCodecUnlocked();
720	#else
721	// setupLocaleMapper locks as necessary
722	codec = setupLocaleMapper();
723	#endif
724	}
725
726	return codec;
727	}
728
729
/*!
731	\fn QByteArray QTextCodec::name() const
732
733	QTextCodec subclasses must reimplement this function. It returns
734	the name of the encoding supported by the subclass.
735
736	If the codec is registered as a character set in the
737	\l{IANA character-sets encoding file} this method should
738	return the preferred mime name for the codec if defined,
739	otherwise its name.
740	*/
741
/*!
743	\fn int QTextCodec::mibEnum() const
744
745	Subclasses of QTextCodec must reimplement this function. It
746	returns the \l{QTextCodec::mibEnum()}{MIBenum} (see \l{IANA character-sets encoding file}
747	for more information). It is important that each QTextCodec
748	subclass returns the correct unique value for this function.
749	*/
750
751	/!*
752	Subclasses can return a number of aliases for the codec in question.
753
754	Standard aliases for codecs can be found in the
755	\l{IANA character-sets encoding file}.
756	*/
757	QList<QByteArray> QTextCodec::aliases() const
758	{
759	return QList<QByteArray>();
760	}
761
/*!
    \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
                                             ConverterState *state) const

    QTextCodec subclasses must reimplement this function.

    Converts the first \a len characters of \a chars from the
    encoding of the subclass to Unicode, and returns the result in a
    QString.

    \a state can be \nullptr, in which case the conversion is stateless and
    default conversion rules should be used. If \a state is not \nullptr, the
    codec should save the state after the conversion in \a state, and
    adjust the \c remainingChars and \c invalidChars members of the struct.
*/
777
/*!
    \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
                                                  ConverterState *state) const

    QTextCodec subclasses must reimplement this function.

    Converts the first \a number of characters from the \a input array
    from Unicode to the encoding of the subclass, and returns the result
    in a QByteArray.

    \a state can be \nullptr, in which case the conversion is stateless and
    default conversion rules should be used. If \a state is not \nullptr, the
    codec should save the state after the conversion in \a state, and
    adjust the \c remainingChars and \c invalidChars members of the struct.
*/
793
794	/!*
795	Creates a QTextDecoder with a specified \a flags to decode chunks
796	of \c{char } data to create chunks of Unicode data.*
797
798	The caller is responsible for deleting the returned object.
799
800	\since 4.7
801	*/
802	QTextDecoder* QTextCodec::makeDecoder(QTextCodec::ConversionFlags flags) const
803	{
804	return new QTextDecoder (this, flags);
805	}
806
807	/!*
808	Creates a QTextEncoder with a specified \a flags to encode chunks
809	of Unicode data as \c{char } data.*
810
811	The caller is responsible for deleting the returned object.
812
813	\since 4.7
814	*/
815	QTextEncoder* QTextCodec::makeEncoder(QTextCodec::ConversionFlags flags) const
816	{
817	return new QTextEncoder (this, flags);
818	}
819
/*!
    \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
                                           ConverterState *state) const

    Converts the first \a number of characters from the \a input array
    from Unicode to the encoding of this codec, and returns the result
    in a QByteArray.

    The \a state of the converter used is updated.
*/
830
831	#if QT_STRINGVIEW_LEVEL < 2
832	/!*
833	Converts \a str from Unicode to the encoding of this codec, and
834	returns the result in a QByteArray.
835	*/
836	QByteArray QTextCodec::fromUnicode(const QString& str) const
837	{
838	return convertFromUnicode(in: str.constData(), length: str.length(), state: nullptr);
839	}
840	#endif
841
842	/!*
843	\overload
844	\since 5.10
845
846	Converts \a str from Unicode to the encoding of this codec, and
847	returns the result in a QByteArray.
848	*/
849	QByteArray QTextCodec::fromUnicode(QStringView str) const
850	{
851	return convertFromUnicode(in: str.data(), length: str.length(), state: nullptr);
852	}
853
/*!
    \fn QString QTextCodec::toUnicode(const char *input, int size,
                                      ConverterState *state) const

    Converts the first \a size characters from the \a input from the
    encoding of this codec to Unicode, and returns the result in a
    QString.

    The \a state of the converter used is updated.
*/
864
865	/!*
866	Converts \a a from the encoding of this codec to Unicode, and
867	returns the result in a QString.
868	*/
869	QString QTextCodec::toUnicode(const QByteArray& a) const
870	{
871	return convertToUnicode(in: a.constData(), length: a.length(), state: nullptr);
872	}
873
874	/!*
875	Returns \c true if the Unicode character \a ch can be fully encoded
876	with this codec; otherwise returns \c false.
877	*/
878	bool QTextCodec::canEncode(QChar ch) const
879	{
880	ConverterState state;
881	state.flags = ConvertInvalidToNull;
882	convertFromUnicode(in: &ch, length: `1`, state: &state);
883	return (state.invalidChars == `0`);
884	}
885
886	#if QT_STRINGVIEW_LEVEL < 2
887	/!*
888	\overload
889
890	\a s contains the string being tested for encode-ability.
891	*/
892	bool QTextCodec::canEncode(const QString& s) const
893	{
894	ConverterState state;
895	state.flags = ConvertInvalidToNull;
896	convertFromUnicode(in: s.constData(), length: s.length(), state: &state);
897	return (state.invalidChars == `0`);
898	}
899	#endif
900
901	/!*
902	\overload
903	\since 5.10
904
905	Returns \c true if the Unicode string \a s can be fully encoded
906	with this codec; otherwise returns \c false.
907	*/
908	bool QTextCodec::canEncode(QStringView s) const
909	{
910	ConverterState state;
911	state.flags = ConvertInvalidToNull;
912	convertFromUnicode(in: s.data(), length: s.length(), state: &state);
913	return !state.invalidChars;
914	}
915	/!*
916	\overload
917
918	\a chars contains the source characters.
919	*/
920	QString QTextCodec::toUnicode(const char chars) const*
921	{
922	int len = qstrlen(str: chars);
923	return convertToUnicode(in: chars, length: len, state: nullptr);
924	}
925
926
/*!
928	\class QTextEncoder
929	\inmodule QtCore
930	\brief The QTextEncoder class provides a state-based encoder.
931	\reentrant
932	\ingroup i18n
933
934	A text encoder converts text from Unicode into an encoded text format
935	using a specific codec.
936
937	The encoder converts Unicode into another format, remembering any
938	state that is required between calls.
939
940	\sa QTextCodec::makeEncoder(), QTextDecoder
941	*/
942
/*!
    \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)

    Constructs a text encoder for the given \a codec.
*/
948
949	/!*
950	Constructs a text encoder for the given \a codec and conversion \a flags.
951
952	\since 4.7
953	*/
954	QTextEncoder::QTextEncoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
955	: c(codec), state ()
956	{
957	state.flags = flags;
958	}
959
960	/!*
961	Destroys the encoder.
962	*/
963	QTextEncoder::~QTextEncoder()
964	{
965	}
966
967	/!*
968	\internal
969	\since 4.5
970	Determines whether the eecoder encountered a failure while decoding the input. If
971	an error was encountered, the produced result is undefined, and gets converted as according
972	to the conversion flags.
973	*/
974	bool QTextEncoder::hasFailure() const
975	{
976	return state.invalidChars != `0`;
977	}
978
979	#if QT_STRINGVIEW_LEVEL < 2
980	/!*
981	Converts the Unicode string \a str into an encoded QByteArray.
982	*/
983	QByteArray QTextEncoder::fromUnicode(const QString& str)
984	{
985	QByteArray result = c->fromUnicode(in: str.constData(), length: str.length(), state: &state);
986	return result;
987	}
988	#endif
989
990	/!*
991	\overload
992	\since 5.10
993	Converts the Unicode string \a str into an encoded QByteArray.
994	*/
995	QByteArray QTextEncoder::fromUnicode(QStringView str)
996	{
997	return c->fromUnicode(in: str.data(), length: str.length(), state: &state);
998	}
999
1000	/!*
1001	\overload
1002
1003	Converts \a len characters (not bytes) from \a uc, and returns the
1004	result in a QByteArray.
1005	*/
1006	QByteArray QTextEncoder::fromUnicode(const QChar uc, int* len)
1007	{
1008	QByteArray result = c->fromUnicode(in: uc, length: len, state: &state);
1009	return result;
1010	}
1011
/*!
1013	\class QTextDecoder
1014	\inmodule QtCore
1015	\brief The QTextDecoder class provides a state-based decoder.
1016	\reentrant
1017	\ingroup i18n
1018
1019	A text decoder converts text from an encoded text format into Unicode
1020	using a specific codec.
1021
1022	The decoder converts text in this format into Unicode, remembering any
1023	state that is required between calls.
1024
1025	\sa QTextCodec::makeDecoder(), QTextEncoder
1026	*/
1027
/*!
    \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)

    Constructs a text decoder for the given \a codec.
*/
1033
1034	/!*
1035	Constructs a text decoder for the given \a codec and conversion \a flags.
1036
1037	\since 4.7
1038	*/
1039
1040	QTextDecoder::QTextDecoder(const QTextCodec *codec, QTextCodec::ConversionFlags flags)
1041	: c(codec), state ()
1042	{
1043	state.flags = flags;
1044	}
1045
1046	/!*
1047	Destroys the decoder.
1048	*/
1049	QTextDecoder::~QTextDecoder()
1050	{
1051	}
1052
1053	/!*
1054	\fn QString QTextDecoder::toUnicode(const char chars, int len)*
1055
1056	Converts the first \a len bytes in \a chars to Unicode, returning
1057	the result.
1058
1059	If not all characters are used (e.g. if only part of a multi-byte
1060	encoding is at the end of the characters), the decoder remembers
1061	enough state to continue with the next call to this function.
1062	*/
1063	QString QTextDecoder::toUnicode(const char chars, int* len)
1064	{
1065	return c->toUnicode(in: chars, length: len, state: &state);
1066	}
1067
1068	// in qstring.cpp:
1069	void qt_from_latin1(ushort dst, const* char str, size_t size) noexcept*;
1070
1071	/! \overload*
1072
1073	The converted string is returned in \a target.
1074	*/
1075	void QTextDecoder::toUnicode(QString target, const* char chars, int* len)
1076	{
1077	Q_ASSERT(target);
1078	switch (c->mibEnum()) {
1079	case `106`: // utf8
1080	static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
1081	break;
1082	case `4`: // latin1
1083	target->resize(size: len);
1084	qt_from_latin1(dst: (ushort*)target->data(), str: chars, size: len);
1085	break;
1086	default:
1087	*target = c->toUnicode(in: chars, length: len, state: &state);
1088	}
1089	}
1090
1091
1092	/!*
1093	\overload
1094
1095	Converts the bytes in the byte array specified by \a ba to Unicode
1096	and returns the result.
1097	*/
1098	QString QTextDecoder::toUnicode(const QByteArray &ba)
1099	{
1100	return c->toUnicode(in: ba.constData(), length: ba.length(), state: &state);
1101	}
1102
1103	/!*
1104	\since 4.4
1105
1106	Tries to detect the encoding of the provided snippet of HTML in
1107	the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1108	and the content-type meta header and returns a QTextCodec instance
1109	that is capable of decoding the html to unicode. If the codec
1110	cannot be detected from the content provided, \a defaultCodec is
1111	returned.
1112
1113	\sa codecForUtfText()
1114	*/
1115	QTextCodec QTextCodec::codecForHtml(const* QByteArray &ba, QTextCodec *defaultCodec)
1116	{
1117	// determine charset
1118	QTextCodec c = QTextCodec::codecForUtfText(ba, defaultCodec: nullptr*);
1119	if (!c) {
1120	static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "meta ");
1121	QByteArray header = ba.left(len: `1024`).toLower();
1122	int pos = matcher.indexIn(haystack: header);
1123	if (pos != -`1`) {
1124	static Q_RELAXED_CONSTEXPR auto matcher = qMakeStaticByteArrayMatcher(pattern: "charset=");
1125	pos = matcher.indexIn(haystack: header, from: pos);
1126	if (pos != -`1`) {
1127	pos += qstrlen(str: "charset=");
1128
1129	int pos2 = pos;
1130	// The attribute can be closed with either """, "'", ">" or "/",
1131	// none of which are valid charset characters.
1132	while (++pos2 < header.size()) {
1133	char ch = header.at(i: pos2);
1134	if (ch == `'\"'` \|\| ch == `'\''` \|\| ch == `'>'`) {
1135	QByteArray name = header.mid(index: pos, len: pos2 - pos);
1136	if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
1137	name = QByteArrayLiteral("UTF-8");
1138	c = QTextCodec::codecForName(name);
1139	return c ? c : defaultCodec;
1140	}
1141	}
1142	}
1143	}
1144	}
1145	if (!c)
1146	c = defaultCodec;
1147
1148	return c;
1149	}
1150
1151	/!*
1152	\overload
1153
1154	Tries to detect the encoding of the provided snippet of HTML in
1155	the given byte array, \a ba, by checking the BOM (Byte Order Mark)
1156	and the content-type meta header and returns a QTextCodec instance
1157	that is capable of decoding the html to unicode. If the codec cannot
1158	be detected, this overload returns a Latin-1 QTextCodec.
1159	*/
1160	QTextCodec QTextCodec::codecForHtml(const* QByteArray &ba)
1161	{
1162	return codecForHtml(ba, defaultCodec: QTextCodec::codecForName(name: "ISO-8859-1"));
1163	}
1164
1165	/!*
1166	\since 4.6
1167
1168	Tries to detect the encoding of the provided snippet \a ba by
1169	using the BOM (Byte Order Mark) and returns a QTextCodec instance
1170	that is capable of decoding the text to unicode. This function can
1171	detect one of the following codecs:
1172
1173	\list
1174	\li UTF-32 Little Endian
1175	\li UTF-32 Big Endian
1176	\li UTF-16 Little Endian
1177	\li UTF-16 Big Endian
1178	\li UTF-8
1179	\endlist
1180
1181	If the codec cannot be detected from the content provided, \a defaultCodec
1182	is returned.
1183
1184	\sa codecForHtml()
1185	*/
1186	QTextCodec QTextCodec::codecForUtfText(const* QByteArray &ba, QTextCodec *defaultCodec)
1187	{
1188	const int arraySize = ba.size();
1189	const uchar buf = reinterpret_cast<const* uchar *>(ba.constData());
1190	const uint bom = `0xfeff`;
1191
1192	if (arraySize > `3`) {
1193	uint uc = qFromUnaligned<uint>(src: buf);
1194	if (uc == qToBigEndian(source: bom))
1195	return QTextCodec::codecForMib(mib: `1018`); // utf-32 be
1196	else if (uc == qToLittleEndian(source: bom))
1197	return QTextCodec::codecForMib(mib: `1019`); // utf-32 le
1198	}
1199
1200	if (arraySize < `2`)
1201	return defaultCodec;
1202
1203	ushort uc = qFromUnaligned<ushort>(src: buf);
1204	if (uc == qToBigEndian(source: ushort(bom)))
1205	return QTextCodec::codecForMib(mib: `1013`); // utf16 be
1206	else if (uc == qToLittleEndian(source: ushort(bom)))
1207	return QTextCodec::codecForMib(mib: `1014`); // utf16 le
1208
1209	if (arraySize < `3`)
1210	return defaultCodec;
1211
1212	static const char utf8bom[] = "\xef\xbb\xbf";
1213	if (memcmp(s1: buf, s2: utf8bom, n: sizeof(utf8bom) - `1`) == `0`)
1214	return QTextCodec::codecForMib(mib: `106`); // utf-8
1215
1216	return defaultCodec;
1217	}
1218
1219	/!*
1220	\overload
1221
1222	Tries to detect the encoding of the provided snippet \a ba by
1223	using the BOM (Byte Order Mark) and returns a QTextCodec instance
1224	that is capable of decoding the text to unicode. This function can
1225	detect one of the following codecs:
1226
1227	\list
1228	\li UTF-32 Little Endian
1229	\li UTF-32 Big Endian
1230	\li UTF-16 Little Endian
1231	\li UTF-16 Big Endian
1232	\li UTF-8
1233	\endlist
1234
1235	If the codec cannot be detected from the content provided, this overload
1236	returns a Latin-1 QTextCodec.
1237
1238	\sa codecForHtml()
1239	*/
1240	QTextCodec QTextCodec::codecForUtfText(const* QByteArray &ba)
1241	{
1242	return codecForUtfText(ba, defaultCodec: QTextCodec::codecForMib(/Latin 1/ mib: `4`));
1243	}
1244
/*!
    \fn QTextCodec *QTextCodec::codecForTr()
    \obsolete

    Returns the codec used by QObject::tr() on its argument. If this
    function returns \nullptr (the default), tr() assumes Latin-1.
*/
1252
1253	/!*
1254	\internal
1255	\since 4.3
1256	Determines whether the decoder encountered a failure while decoding the
1257	input. If an error was encountered, the produced result is undefined, and
1258	gets converted as according to the conversion flags.
1259	*/
1260	bool QTextDecoder::hasFailure() const
1261	{
1262	return state.invalidChars != `0`;
1263	}
1264
1265	/!*
1266	\internal
1267	\since 5.12
1268
1269	Determines whether the decoder needs more bytes to continue decoding. That
1270	is, this signifies that the input string ended in the middle of a
1271	multi-byte sequence. Note that it's possible some codecs do not report this.
1272	*/
1273	bool QTextDecoder::needsMoreData() const
1274	{
1275	return state.remainingChars;
1276	}
1277
1278	QT_END_NAMESPACE
1279

source code of qtbase/src/corelib/codecs/qtextcodec.cpp