1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include "qicucodec_p.h"
41
42#include "qtextcodec_p.h"
43#include "qutfcodec_p.h"
44#include "qlatincodec_p.h"
45#include "qtsciicodec_p.h"
46#include "qisciicodec_p.h"
47#include "qsimplecodec_p.h"
48#include "private/qcoreglobaldata_p.h"
49#include "qdebug.h"
50
51#include "unicode/ucnv.h"
52
53QT_BEGIN_NAMESPACE
54
55typedef QList<QTextCodec*>::ConstIterator TextCodecListConstIt;
56typedef QList<QByteArray>::ConstIterator ByteArrayListConstIt;
57
58static void qIcuCodecStateFree(QTextCodec::ConverterState *state)
59{
60 ucnv_close(static_cast<UConverter *>(state->d));
61}
62
63bool qTextCodecNameMatch(const char *n, const char *h)
64{
65 return ucnv_compareNames(n, h) == 0;
66}
67
68/* The list below is generated from http://www.iana.org/assignments/character-sets/
69 using the snippet of code below:
70
71#include <QtCore>
72#include <unicode/ucnv.h>
73
74int main(int argc, char **argv)
75{
76 QCoreApplication app(argc, argv);
77
78 QFile file("character-sets.txt");
79 file.open(QFile::ReadOnly);
80 QByteArray name;
81 int mib = -1;
82 QByteArray nameList;
83 int pos = 0;
84 while (!file.atEnd()) {
85 QByteArray s = file.readLine().trimmed();
86 if (s.isEmpty()) {
87 if (mib != -1) {
88 UErrorCode error = U_ZERO_ERROR;
89 const char *standard_name = ucnv_getStandardName(name, "MIME", &error);
90 if (U_FAILURE(error) || !standard_name) {
91 error = U_ZERO_ERROR;
92 standard_name = ucnv_getStandardName(name, "IANA", &error);
93 }
94 UConverter *conv = ucnv_open(standard_name, &error);
95 if (!U_FAILURE(error) && conv && standard_name) {
96 ucnv_close(conv);
97 printf(" { %d, %d },\n", mib, pos);
98 nameList += "\"";
99 nameList += standard_name;
100 nameList += "\\0\"\n";
101 pos += strlen(standard_name) + 1;
102 }
103 }
104 name = QByteArray();
105 mib = -1;
106 }
107 if (s.startsWith("Name: ")) {
108 name = s.mid(5).trimmed();
109 if (name.indexOf(' ') > 0)
110 name = name.left(name.indexOf(' '));
111 }
112 if (s.startsWith("MIBenum:"))
113 mib = s.mid(8).trimmed().toInt();
114 if (s.startsWith("Alias:") && s.contains("MIME")) {
115 name = s.mid(6).trimmed();
116 name = name.left(name.indexOf(' ')).trimmed();
117 }
118 }
119 qDebug() << nameList;
120}
121*/
122
// One row of the MIBenum lookup table: pairs a MIBenum value with the position
// of the matching codec name inside mibToNameTable.
struct MibToName {
    short mib;   // MIBenum value
    short index; // offset of the NUL-terminated name within mibToNameTable
};
127
// Generated table (see the snippet in the comment above) mapping MIBenum values
// to offsets into mibToNameTable. Each index must equal the byte offset of the
// entry's NUL-terminated name in that string table, so the two tables have to
// be regenerated together. The rows are not strictly sorted by MIBenum; all
// lookups in this file scan the table linearly.
static const MibToName mibToName[] = {
    { 3, 0 },
    { 4, 9 },
    { 5, 20 },
    { 6, 31 },
    { 7, 42 },
    { 8, 53 },
    { 9, 64 },
    { 10, 75 },
    { 11, 86 },
    { 12, 97 },
    { 13, 108 },
    { 16, 120 },
    { 17, 134 },
    { 18, 144 },
    { 30, 151 },
    { 36, 160 },
    { 37, 167 },
    { 38, 179 },
    { 39, 186 },
    { 40, 198 },
    { 57, 212 },
    { 81, 223 },
    { 82, 234 },
    { 84, 245 },
    { 85, 256 },
    { 104, 267 },
    { 105, 279 },
    { 106, 295 },
    { 109, 301 },
    { 110, 313 },
    { 111, 325 },
    { 113, 337 },
    { 114, 341 },
    { 1000, 349 },
    { 1001, 356 },
    { 1011, 363 },
    { 1012, 368 },
    { 1013, 374 },
    { 1014, 383 },
    { 1015, 392 },
    { 1016, 399 },
    { 1017, 406 },
    { 1018, 413 },
    { 1019, 422 },
    { 1020, 431 },
    { 2004, 438 },
    { 2005, 448 },
    { 2009, 472 },
    { 2013, 479 },
    { 2016, 486 },
    { 2024, 495 },
    { 2025, 505 },
    { 2026, 512 },
    { 2027, 517 },
    { 2028, 527 },
    { 2030, 534 },
    { 2033, 541 },
    { 2034, 548 },
    { 2035, 555 },
    { 2037, 562 },
    { 2038, 569 },
    { 2039, 576 },
    { 2040, 583 },
    { 2041, 590 },
    { 2043, 597 },
    { 2011, 604 },
    { 2044, 611 },
    { 2045, 618 },
    { 2010, 624 },
    { 2046, 631 },
    { 2047, 638 },
    { 2048, 645 },
    { 2049, 652 },
    { 2050, 659 },
    { 2051, 666 },
    { 2052, 673 },
    { 2053, 680 },
    { 2054, 687 },
    { 2055, 694 },
    { 2056, 701 },
    { 2062, 708 },
    { 2063, 715 },
    { 2084, 723 },
    { 2085, 730 },
    { 2086, 741 },
    { 2087, 748 },
    { 2088, 755 },
    { 2089, 762 },
    { 2091, 771 },
    { 2092, 780 },
    { 2093, 789 },
    { 2094, 798 },
    { 2095, 807 },
    { 2096, 816 },
    { 2097, 825 },
    { 2098, 834 },
    { 2099, 843 },
    { 2100, 852 },
    { 2101, 861 },
    { 2102, 872 },
    { 2250, 880 },
    { 2251, 893 },
    { 2252, 906 },
    { 2253, 919 },
    { 2254, 932 },
    { 2255, 945 },
    { 2256, 958 },
    { 2257, 971 },
    { 2258, 984 },
    { 2259, 997 },
};
// Number of rows in mibToName.
int mibToNameSize = sizeof(mibToName)/sizeof(MibToName);
241
// Codec names referenced by mibToName, stored back to back as NUL-terminated
// strings; mibToName[i].index is the byte offset of a name in this array.
// Some names occur more than once because several MIBenum values resolve to
// the same name. Do not edit by hand without regenerating the offsets.
static const char mibToNameTable[] =
    "US-ASCII\0"
    "ISO-8859-1\0"
    "ISO-8859-2\0"
    "ISO-8859-3\0"
    "ISO-8859-4\0"
    "ISO-8859-5\0"
    "ISO-8859-6\0"
    "ISO-8859-7\0"
    "ISO-8859-8\0"
    "ISO-8859-9\0"
    "ISO-8859-10\0"
    "ISO-2022-JP-1\0"
    "Shift_JIS\0"
    "EUC-JP\0"
    "US-ASCII\0"
    "EUC-KR\0"
    "ISO-2022-KR\0"
    "EUC-KR\0"
    "ISO-2022-JP\0"
    "ISO-2022-JP-2\0"
    "GB_2312-80\0"
    "ISO-8859-6\0"
    "ISO-8859-6\0"
    "ISO-8859-8\0"
    "ISO-8859-8\0"
    "ISO-2022-CN\0"
    "ISO-2022-CN-EXT\0"
    "UTF-8\0"
    "ISO-8859-13\0"
    "ISO-8859-14\0"
    "ISO-8859-15\0"
    "GBK\0"
    "GB18030\0"
    "UTF-16\0"
    "UTF-32\0"
    "SCSU\0"
    "UTF-7\0"
    "UTF-16BE\0"
    "UTF-16LE\0"
    "UTF-16\0"
    "CESU-8\0"
    "UTF-32\0"
    "UTF-32BE\0"
    "UTF-32LE\0"
    "BOCU-1\0"
    "hp-roman8\0"
    "Adobe-Standard-Encoding\0"
    "IBM850\0"
    "IBM862\0"
    "IBM-Thai\0"
    "Shift_JIS\0"
    "GB2312\0"
    "Big5\0"
    "macintosh\0"
    "IBM037\0"
    "IBM273\0"
    "IBM277\0"
    "IBM278\0"
    "IBM280\0"
    "IBM284\0"
    "IBM285\0"
    "IBM290\0"
    "IBM297\0"
    "IBM420\0"
    "IBM424\0"
    "IBM437\0"
    "IBM500\0"
    "cp851\0"
    "IBM852\0"
    "IBM855\0"
    "IBM857\0"
    "IBM860\0"
    "IBM861\0"
    "IBM863\0"
    "IBM864\0"
    "IBM865\0"
    "IBM868\0"
    "IBM869\0"
    "IBM870\0"
    "IBM871\0"
    "IBM918\0"
    "IBM1026\0"
    "KOI8-R\0"
    "HZ-GB-2312\0"
    "IBM866\0"
    "IBM775\0"
    "KOI8-U\0"
    "IBM00858\0"
    "IBM01140\0"
    "IBM01141\0"
    "IBM01142\0"
    "IBM01143\0"
    "IBM01144\0"
    "IBM01145\0"
    "IBM01146\0"
    "IBM01147\0"
    "IBM01148\0"
    "IBM01149\0"
    "Big5-HKSCS\0"
    "IBM1047\0"
    "windows-1250\0"
    "windows-1251\0"
    "windows-1252\0"
    "windows-1253\0"
    "windows-1254\0"
    "windows-1255\0"
    "windows-1256\0"
    "windows-1257\0"
    "windows-1258\0"
    "TIS-620\0";
353
354static QTextCodec *loadQtCodec(const char *name)
355{
356 if (!strcmp(name, "UTF-8"))
357 return new QUtf8Codec;
358 if (!strcmp(name, "UTF-16"))
359 return new QUtf16Codec;
360 if (!strcmp(name, "ISO-8859-1"))
361 return new QLatin1Codec;
362 if (!strcmp(name, "UTF-16BE"))
363 return new QUtf16BECodec;
364 if (!strcmp(name, "UTF-16LE"))
365 return new QUtf16LECodec;
366 if (!strcmp(name, "UTF-32"))
367 return new QUtf32Codec;
368 if (!strcmp(name, "UTF-32BE"))
369 return new QUtf32BECodec;
370 if (!strcmp(name, "UTF-32LE"))
371 return new QUtf32LECodec;
372 if (!strcmp(name, "ISO-8859-16") || !strcmp(name, "latin10") || !strcmp(name, "iso-ir-226"))
373 return new QSimpleTextCodec(13 /* == 8859-16*/);
374#if QT_CONFIG(codecs)
375 if (!strcmp(name, "TSCII"))
376 return new QTsciiCodec;
377 if (!qstrnicmp(name, "iscii", 5))
378 return QIsciiCodec::create(name);
379#endif
380
381 return 0;
382}
383
384/// \threadsafe
385QList<QByteArray> QIcuCodec::availableCodecs()
386{
387 QList<QByteArray> codecs;
388 int n = ucnv_countAvailable();
389 for (int i = 0; i < n; ++i) {
390 const char *name = ucnv_getAvailableName(i);
391
392 UErrorCode error = U_ZERO_ERROR;
393 const char *standardName = ucnv_getStandardName(name, "MIME", &error);
394 if (U_FAILURE(error) || !standardName) {
395 error = U_ZERO_ERROR;
396 standardName = ucnv_getStandardName(name, "IANA", &error);
397 }
398 if (U_FAILURE(error))
399 continue;
400
401 error = U_ZERO_ERROR;
402 int ac = ucnv_countAliases(standardName, &error);
403 if (U_FAILURE(error))
404 continue;
405 for (int j = 0; j < ac; ++j) {
406 error = U_ZERO_ERROR;
407 const char *alias = ucnv_getAlias(standardName, j, &error);
408 if (!U_SUCCESS(error))
409 continue;
410 codecs += alias;
411 }
412 }
413
414 // handled by Qt and not in ICU:
415 codecs += "TSCII";
416
417 return codecs;
418}
419
420/// \threadsafe
421QList<int> QIcuCodec::availableMibs()
422{
423 QList<int> mibs;
424 mibs.reserve(mibToNameSize + 1);
425 for (int i = 0; i < mibToNameSize; ++i)
426 mibs += mibToName[i].mib;
427
428 // handled by Qt and not in ICU:
429 mibs += 2107; // TSCII
430
431 return mibs;
432}
433
434QTextCodec *QIcuCodec::defaultCodecUnlocked()
435{
436 QCoreGlobalData *globalData = QCoreGlobalData::instance();
437 if (!globalData)
438 return 0;
439 QTextCodec *c = globalData->codecForLocale.loadAcquire();
440 if (c)
441 return c;
442
443#if defined(QT_LOCALE_IS_UTF8)
444 const char *name = "UTF-8";
445#else
446 const char *name = ucnv_getDefaultName();
447#endif
448 c = codecForNameUnlocked(name);
449 globalData->codecForLocale.storeRelease(c);
450 return c;
451}
452
453
454QTextCodec *QIcuCodec::codecForNameUnlocked(const char *name)
455{
456 // backwards compatibility with Qt 4.x
457 if (!qstrcmp(name, "CP949"))
458 name = "windows-949";
459 else if (!qstrcmp(name, "Apple Roman"))
460 name = "macintosh";
461 // these are broken data in ICU 4.4, and can't be resolved even though they are aliases to tis-620
462 if (!qstrcmp(name, "windows-874-2000")
463 || !qstrcmp(name, "windows-874")
464 || !qstrcmp(name, "MS874")
465 || !qstrcmp(name, "x-windows-874")
466 || !qstrcmp(name, "ISO 8859-11"))
467 name = "TIS-620";
468
469 UErrorCode error = U_ZERO_ERROR;
470 // MIME gives better default names
471 const char *standardName = ucnv_getStandardName(name, "MIME", &error);
472 if (U_FAILURE(error) || !standardName) {
473 error = U_ZERO_ERROR;
474 standardName = ucnv_getStandardName(name, "IANA", &error);
475 }
476 bool qt_only = false;
477 if (U_FAILURE(error) || !standardName) {
478 standardName = name;
479 qt_only = true;
480 } else {
481 // correct some issues where the ICU data set contains duplicated entries.
482 // Where this happens it's because one data set is a subset of another. We
483 // always use the larger data set.
484
485 if (qstrcmp(standardName, "GB2312") == 0 || qstrcmp(standardName, "GB_2312-80") == 0)
486 standardName = "GBK";
487 else if (qstrcmp(standardName, "KSC_5601") == 0 || qstrcmp(standardName, "EUC-KR") == 0 || qstrcmp(standardName, "cp1363") == 0)
488 standardName = "windows-949";
489 }
490
491 QCoreGlobalData *globalData = QCoreGlobalData::instance();
492 QTextCodecCache *cache = &globalData->codecCache;
493
494 QTextCodec *codec;
495 if (cache) {
496 codec = cache->value(standardName);
497 if (codec)
498 return codec;
499 }
500
501 for (TextCodecListConstIt it = globalData->allCodecs.constBegin(), cend = globalData->allCodecs.constEnd(); it != cend; ++it) {
502 QTextCodec *cursor = *it;
503 if (qTextCodecNameMatch(cursor->name(), standardName)) {
504 if (cache)
505 cache->insert(standardName, cursor);
506 return cursor;
507 }
508 QList<QByteArray> aliases = cursor->aliases();
509 for (ByteArrayListConstIt ait = aliases.constBegin(), acend = aliases.constEnd(); ait != acend; ++ait) {
510 if (qTextCodecNameMatch(*ait, standardName)) {
511 if (cache)
512 cache->insert(standardName, cursor);
513 return cursor;
514 }
515 }
516 }
517
518 QTextCodec *c = loadQtCodec(standardName);
519 if (c)
520 return c;
521
522 if (qt_only)
523 return 0;
524
525 // check whether there is really a converter for the name available.
526 UConverter *conv = ucnv_open(standardName, &error);
527 if (!conv) {
528 qDebug("codecForName: ucnv_open failed %s %s", standardName, u_errorName(error));
529 return 0;
530 }
531 //qDebug() << "QIcuCodec: Standard name for " << name << "is" << standardName;
532 ucnv_close(conv);
533
534
535 c = new QIcuCodec(standardName);
536 if (cache)
537 cache->insert(standardName, c);
538 return c;
539}
540
541
542QTextCodec *QIcuCodec::codecForMibUnlocked(int mib)
543{
544 for (int i = 0; i < mibToNameSize; ++i) {
545 if (mibToName[i].mib == mib)
546 return codecForNameUnlocked(mibToNameTable + mibToName[i].index);
547 }
548
549 if (mib == 2107)
550 return codecForNameUnlocked("TSCII");
551
552 return 0;
553}
554
555
// Creates a codec for the ICU converter called \a name. No converter is opened
// here; getConverter() opens one by name when a conversion is requested.
// NOTE(review): m_name is initialised directly from the pointer; presumably the
// string must outlive the codec — confirm against the member's declaration.
QIcuCodec::QIcuCodec(const char *name)
    : m_name(name)
{
}
560
// Nothing to release: converters are either closed by the conversion functions
// after a stateless call or kept in the caller's ConverterState.
QIcuCodec::~QIcuCodec()
{
}
564
565UConverter *QIcuCodec::getConverter(QTextCodec::ConverterState *state) const
566{
567 UConverter *conv = 0;
568 if (state) {
569 if (!state->d) {
570 // first time
571 state->flags |= QTextCodec::FreeFunction;
572 QTextCodecUnalignedPointer::encode(state->state_data, qIcuCodecStateFree);
573 UErrorCode error = U_ZERO_ERROR;
574 state->d = ucnv_open(m_name, &error);
575 ucnv_setSubstChars(static_cast<UConverter *>(state->d),
576 state->flags & QTextCodec::ConvertInvalidToNull ? "\0" : "?", 1, &error);
577 if (U_FAILURE(error))
578 qDebug("getConverter(state) ucnv_open failed %s %s", m_name, u_errorName(error));
579 }
580 conv = static_cast<UConverter *>(state->d);
581 }
582 if (!conv) {
583 // stateless conversion
584 UErrorCode error = U_ZERO_ERROR;
585 conv = ucnv_open(m_name, &error);
586 ucnv_setSubstChars(conv, "?", 1, &error);
587 if (U_FAILURE(error))
588 qDebug("getConverter(no state) ucnv_open failed %s %s", m_name, u_errorName(error));
589 }
590 return conv;
591}
592
593QString QIcuCodec::convertToUnicode(const char *chars, int length, QTextCodec::ConverterState *state) const
594{
595 UConverter *conv = getConverter(state);
596
597 QString string(length + 2, Qt::Uninitialized);
598
599 const char *end = chars + length;
600 int convertedChars = 0;
601 while (1) {
602 UChar *uc = (UChar *)string.data();
603 UChar *ucEnd = uc + string.length();
604 uc += convertedChars;
605 UErrorCode error = U_ZERO_ERROR;
606 ucnv_toUnicode(conv,
607 &uc, ucEnd,
608 &chars, end,
609 0, false, &error);
610 if (!U_SUCCESS(error) && error != U_BUFFER_OVERFLOW_ERROR) {
611 qDebug("convertToUnicode failed: %s", u_errorName(error));
612 break;
613 }
614
615 convertedChars = uc - (UChar *)string.data();
616 if (chars >= end)
617 break;
618 string.resize(string.length()*2);
619 }
620 string.resize(convertedChars);
621
622 if (!state)
623 ucnv_close(conv);
624 return string;
625}
626
627
628QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const
629{
630 UConverter *conv = getConverter(state);
631
632 int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv));
633 QByteArray string(requiredLength, Qt::Uninitialized);
634
635 const UChar *uc = (const UChar *)unicode;
636 const UChar *end = uc + length;
637 int convertedChars = 0;
638 while (1) {
639 char *ch = (char *)string.data();
640 char *chEnd = ch + string.length();
641 ch += convertedChars;
642 UErrorCode error = U_ZERO_ERROR;
643 ucnv_fromUnicode(conv,
644 &ch, chEnd,
645 &uc, end,
646 0, false, &error);
647 if (!U_SUCCESS(error))
648 qDebug("convertFromUnicode failed: %s", u_errorName(error));
649 convertedChars = ch - string.data();
650 if (uc >= end)
651 break;
652 string.resize(string.length()*2);
653 }
654 string.resize(convertedChars);
655
656 if (!state)
657 ucnv_close(conv);
658
659 return string;
660}
661
662
663QByteArray QIcuCodec::name() const
664{
665 return m_name;
666}
667
668
669QList<QByteArray> QIcuCodec::aliases() const
670{
671 UErrorCode error = U_ZERO_ERROR;
672
673 int n = ucnv_countAliases(m_name, &error);
674
675 QList<QByteArray> aliases;
676 for (int i = 0; i < n; ++i) {
677 const char *a = ucnv_getAlias(m_name, i, &error);
678 // skip the canonical name
679 if (!a || !qstrcmp(a, m_name))
680 continue;
681 aliases += a;
682 }
683
684 return aliases;
685}
686
687
688int QIcuCodec::mibEnum() const
689{
690 for (int i = 0; i < mibToNameSize; ++i) {
691 if (qTextCodecNameMatch(m_name, (mibToNameTable + mibToName[i].index)))
692 return mibToName[i].mib;
693 }
694
695 return 0;
696}
697
698QT_END_NAMESPACE
699