1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies). |
4 | ** Contact: http://www.qt-project.org/legal |
5 | ** |
6 | ** This file is part of the QtCore module of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:LGPL$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and Digia. For licensing terms and |
14 | ** conditions see http://qt.digia.com/licensing. For further information |
15 | ** use the contact form at http://qt.digia.com/contact-us. |
16 | ** |
17 | ** GNU Lesser General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
19 | ** General Public License version 2.1 as published by the Free Software |
20 | ** Foundation and appearing in the file LICENSE.LGPL included in the |
21 | ** packaging of this file. Please review the following information to |
22 | ** ensure the GNU Lesser General Public License version 2.1 requirements |
23 | ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
24 | ** |
25 | ** In addition, as a special exception, Digia gives you certain additional |
26 | ** rights. These rights are described in the Digia Qt LGPL Exception |
27 | ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
28 | ** |
29 | ** GNU General Public License Usage |
30 | ** Alternatively, this file may be used under the terms of the GNU |
31 | ** General Public License version 3.0 as published by the Free Software |
32 | ** Foundation and appearing in the file LICENSE.GPL included in the |
33 | ** packaging of this file. Please review the following information to |
34 | ** ensure the GNU General Public License version 3.0 requirements will be |
35 | ** met: http://www.gnu.org/copyleft/gpl.html. |
36 | ** |
37 | ** |
38 | ** $QT_END_LICENSE$ |
39 | ** |
40 | ****************************************************************************/ |
41 | |
42 | #include "qiconvcodec_p.h" |
43 | #include "qtextcodec_p.h" |
44 | #include <qlibrary.h> |
45 | #include <qdebug.h> |
46 | #include <qthreadstorage.h> |
47 | |
48 | #include <errno.h> |
49 | #include <locale.h> |
50 | #include <stdio.h> |
51 | #include <dlfcn.h> |
52 | |
53 | // unistd.h is needed for the _XOPEN_UNIX macro |
54 | #include <unistd.h> |
55 | #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF) |
56 | # include <langinfo.h> |
57 | #endif |
58 | |
// Platform selection of two macros used further down in this file:
//  - UTF16:  the iconv encoding name used for the Unicode side of every
//            conversion; it is what gets passed to createIconv_t().
//  - NO_BOM: when defined, setByteOrder() compiles to a no-op instead of
//            feeding a byte order mark through the conversion descriptor.
#if defined(Q_OS_HPUX)
# define NO_BOM
# define UTF16 "ucs2"
#elif defined(Q_OS_AIX)
# define NO_BOM
# define UTF16 "UCS-2"
#elif defined(Q_OS_FREEBSD) || defined(Q_OS_MAC)
// FreeBSD and Mac: name the host byte order explicitly, no BOM priming
# define NO_BOM
# if Q_BYTE_ORDER == Q_BIG_ENDIAN
# define UTF16 "UTF-16BE"
# else
# define UTF16 "UTF-16LE"
# endif
#else
// everything else: generic name, byte order is communicated by setByteOrder()
# define UTF16 "UTF-16"
#endif
75 | |
// Mac only: the iconv entry points are not called directly. They are looked
// up at run time by the QIconvCodec constructor and reached through the
// function pointers declared here.
#if defined(Q_OS_MAC)
#ifndef GNU_LIBICONV
#define GNU_LIBICONV
#endif
// Signatures of iconv_open(), iconv() and iconv_close() as used in this file;
// note the const-qualified input pointer of the GNU_LIBICONV flavour.
typedef iconv_t (*Ptr_iconv_open) (const char*, const char*);
typedef size_t (*Ptr_iconv) (iconv_t, const char **, size_t *, char **, size_t *);
typedef int (*Ptr_iconv_close) (iconv_t);

// Null until the first QIconvCodec is constructed and resolves them.
static Ptr_iconv_open ptr_iconv_open = 0;
static Ptr_iconv ptr_iconv = 0;
static Ptr_iconv_close ptr_iconv_close = 0;
#endif
88 | |
89 | QT_BEGIN_NAMESPACE |
90 | |
// Declared here, defined outside this file. The conversion functions below
// only use their per-thread cached iconv state while this flag is true;
// otherwise they work on a temporary state that is deleted before returning.
extern bool qt_locale_initialized;
92 | |
93 | QIconvCodec::QIconvCodec() |
94 | : utf16Codec(0) |
95 | { |
96 | utf16Codec = QTextCodec::codecForMib(1015); |
97 | Q_ASSERT_X(utf16Codec != 0, |
98 | "QIconvCodec::convertToUnicode" , |
99 | "internal error, UTF-16 codec not found" ); |
100 | if (!utf16Codec) { |
101 | fprintf(stderr, "QIconvCodec::convertToUnicode: internal error, UTF-16 codec not found\n" ); |
102 | utf16Codec = reinterpret_cast<QTextCodec *>(~0); |
103 | } |
104 | #if defined(Q_OS_MAC) |
105 | if (ptr_iconv_open == 0) { |
106 | QLibrary libiconv(QLatin1String("/usr/lib/libiconv" )); |
107 | libiconv.setLoadHints(QLibrary::ExportExternalSymbolsHint); |
108 | |
109 | ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("libiconv_open" )); |
110 | if (!ptr_iconv_open) |
111 | ptr_iconv_open = reinterpret_cast<Ptr_iconv_open>(libiconv.resolve("iconv_open" )); |
112 | ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("libiconv" )); |
113 | if (!ptr_iconv) |
114 | ptr_iconv = reinterpret_cast<Ptr_iconv>(libiconv.resolve("iconv" )); |
115 | ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("libiconv_close" )); |
116 | if (!ptr_iconv_close) |
117 | ptr_iconv_close = reinterpret_cast<Ptr_iconv_close>(libiconv.resolve("iconv_close" )); |
118 | |
119 | Q_ASSERT_X(ptr_iconv_open && ptr_iconv && ptr_iconv_close, |
120 | "QIconvCodec::QIconvCodec()" , |
121 | "internal error, could not resolve the iconv functions" ); |
122 | |
123 | # undef iconv_open |
124 | # define iconv_open ptr_iconv_open |
125 | # undef iconv |
126 | # define iconv ptr_iconv |
127 | # undef iconv_close |
128 | # define iconv_close ptr_iconv_close |
129 | } |
130 | #endif |
131 | } |
132 | |
133 | QIconvCodec::~QIconvCodec() |
134 | { |
135 | } |
136 | |
137 | QIconvCodec::IconvState::IconvState(iconv_t x) |
138 | : buffer(array), bufferLen(sizeof array), cd(x) |
139 | { |
140 | } |
141 | |
142 | QIconvCodec::IconvState::~IconvState() |
143 | { |
144 | if (cd != reinterpret_cast<iconv_t>(-1)) |
145 | iconv_close(cd); |
146 | if (buffer != array) |
147 | delete[] buffer; |
148 | } |
149 | |
150 | void QIconvCodec::IconvState::saveChars(const char *c, int count) |
151 | { |
152 | if (count > bufferLen) { |
153 | if (buffer != array) |
154 | delete[] buffer; |
155 | buffer = new char[bufferLen = count]; |
156 | } |
157 | |
158 | memcpy(buffer, c, count); |
159 | } |
160 | |
161 | static void qIconvCodecStateFree(QTextCodec::ConverterState *state) |
162 | { |
163 | delete reinterpret_cast<QIconvCodec::IconvState *>(state->d); |
164 | } |
165 | |
// Per-thread IconvState used by convertToUnicode() when the caller supplies
// no ConverterState (stateless conversion).
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, toUnicodeState)
167 | |
168 | QString QIconvCodec::convertToUnicode(const char* chars, int len, ConverterState *convState) const |
169 | { |
170 | if (utf16Codec == reinterpret_cast<QTextCodec *>(~0)) |
171 | return QString::fromLatin1(chars, len); |
172 | |
173 | int invalidCount = 0; |
174 | int remainingCount = 0; |
175 | char *remainingBuffer = 0; |
176 | IconvState *temporaryState = 0; |
177 | IconvState **pstate; |
178 | |
179 | if (convState) { |
180 | // stateful conversion |
181 | pstate = reinterpret_cast<IconvState **>(&convState->d); |
182 | if (convState->d) { |
183 | // restore state |
184 | remainingCount = convState->remainingChars; |
185 | remainingBuffer = (*pstate)->buffer; |
186 | } else { |
187 | // first time |
188 | convState->flags |= FreeFunction; |
189 | QTextCodecUnalignedPointer::encode(convState->state_data, qIconvCodecStateFree); |
190 | } |
191 | } else { |
192 | QThreadStorage<QIconvCodec::IconvState *> *ts = toUnicodeState(); |
193 | if (!qt_locale_initialized || !ts) { |
194 | // we're running after the Q_GLOBAL_STATIC has been deleted |
195 | // or before the QCoreApplication initialization |
196 | // bad programmer, no cookie for you |
197 | pstate = &temporaryState; |
198 | } else { |
199 | // stateless conversion -- use thread-local data |
200 | pstate = &toUnicodeState()->localData(); |
201 | } |
202 | } |
203 | |
204 | if (!*pstate) { |
205 | // first time, create the state |
206 | iconv_t cd = QIconvCodec::createIconv_t(UTF16, 0); |
207 | if (cd == reinterpret_cast<iconv_t>(-1)) { |
208 | static int reported = 0; |
209 | if (!reported++) { |
210 | fprintf(stderr, |
211 | "QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv_open failed\n" ); |
212 | } |
213 | return QString::fromLatin1(chars, len); |
214 | } |
215 | |
216 | *pstate = new IconvState(cd); |
217 | } |
218 | |
219 | IconvState *state = *pstate; |
220 | size_t inBytesLeft = len; |
221 | // best case assumption, each byte is converted into one UTF-16 character, plus 2 bytes for the BOM |
222 | #ifdef GNU_LIBICONV |
223 | // GNU doesn't disagree with POSIX :/ |
224 | const char *inBytes = chars; |
225 | #else |
226 | char *inBytes = const_cast<char *>(chars); |
227 | #endif |
228 | |
229 | QByteArray in; |
230 | if (remainingCount) { |
231 | // we have to prepend the remaining bytes from the previous conversion |
232 | inBytesLeft += remainingCount; |
233 | in.resize(inBytesLeft); |
234 | inBytes = in.data(); |
235 | |
236 | memcpy(in.data(), remainingBuffer, remainingCount); |
237 | memcpy(in.data() + remainingCount, chars, len); |
238 | |
239 | remainingCount = 0; |
240 | } |
241 | |
242 | size_t outBytesLeft = len * 2 + 2; |
243 | QByteArray ba(outBytesLeft, Qt::Uninitialized); |
244 | char *outBytes = ba.data(); |
245 | do { |
246 | size_t ret = iconv(state->cd, &inBytes, &inBytesLeft, &outBytes, &outBytesLeft); |
247 | if (ret == (size_t) -1) { |
248 | if (errno == E2BIG) { |
249 | int offset = ba.size() - outBytesLeft; |
250 | ba.resize(ba.size() * 2); |
251 | outBytes = ba.data() + offset; |
252 | outBytesLeft = ba.size() - offset; |
253 | |
254 | continue; |
255 | } |
256 | |
257 | if (errno == EILSEQ) { |
258 | // conversion stopped because of an invalid character in the sequence |
259 | ++invalidCount; |
260 | } else if (errno == EINVAL && convState) { |
261 | // conversion stopped because the remaining inBytesLeft make up |
262 | // an incomplete multi-byte sequence; save them for later |
263 | state->saveChars(inBytes, inBytesLeft); |
264 | remainingCount = inBytesLeft; |
265 | break; |
266 | } |
267 | |
268 | if (errno == EILSEQ || errno == EINVAL) { |
269 | // skip the next character |
270 | ++inBytes; |
271 | --inBytesLeft; |
272 | continue; |
273 | } |
274 | |
275 | // some other error |
276 | // note, cannot use qWarning() since we are implementing the codecForLocale :) |
277 | perror("QIconvCodec::convertToUnicode: using Latin-1 for conversion, iconv failed" ); |
278 | |
279 | if (!convState) { |
280 | // reset state |
281 | iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft); |
282 | } |
283 | |
284 | delete temporaryState; |
285 | return QString::fromLatin1(chars, len); |
286 | } |
287 | } while (inBytesLeft != 0); |
288 | |
289 | QString s; |
290 | |
291 | if (convState) { |
292 | s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft, &state->internalState); |
293 | |
294 | convState->invalidChars = invalidCount; |
295 | convState->remainingChars = remainingCount; |
296 | } else { |
297 | s = utf16Codec->toUnicode(ba.constData(), ba.size() - outBytesLeft); |
298 | |
299 | // reset state |
300 | iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft); |
301 | } |
302 | |
303 | delete temporaryState; |
304 | return s; |
305 | } |
306 | |
// Per-thread IconvState used by convertFromUnicode(); that function falls
// back to a temporary state when this storage cannot be used.
Q_GLOBAL_STATIC(QThreadStorage<QIconvCodec::IconvState *>, fromUnicodeState)
308 | |
309 | static bool setByteOrder(iconv_t cd) |
310 | { |
311 | #if !defined(NO_BOM) |
312 | // give iconv() a BOM |
313 | char buf[4]; |
314 | ushort bom[] = { QChar::ByteOrderMark }; |
315 | |
316 | char *outBytes = buf; |
317 | char *inBytes = reinterpret_cast<char *>(bom); |
318 | size_t outBytesLeft = sizeof buf; |
319 | size_t inBytesLeft = sizeof bom; |
320 | |
321 | #if defined(GNU_LIBICONV) |
322 | const char **inBytesPtr = const_cast<const char **>(&inBytes); |
323 | #else |
324 | char **inBytesPtr = &inBytes; |
325 | #endif |
326 | |
327 | if (iconv(cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) { |
328 | return false; |
329 | } |
330 | #endif // NO_BOM |
331 | |
332 | return true; |
333 | } |
334 | |
335 | QByteArray QIconvCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *convState) const |
336 | { |
337 | char *inBytes; |
338 | char *outBytes; |
339 | size_t inBytesLeft; |
340 | |
341 | #if defined(GNU_LIBICONV) |
342 | const char **inBytesPtr = const_cast<const char **>(&inBytes); |
343 | #else |
344 | char **inBytesPtr = &inBytes; |
345 | #endif |
346 | |
347 | IconvState *temporaryState = 0; |
348 | QThreadStorage<QIconvCodec::IconvState *> *ts = fromUnicodeState(); |
349 | IconvState *&state = (qt_locale_initialized && ts) ? ts->localData() : temporaryState; |
350 | if (!state) { |
351 | iconv_t cd = QIconvCodec::createIconv_t(0, UTF16); |
352 | if (cd != reinterpret_cast<iconv_t>(-1)) { |
353 | if (!setByteOrder(cd)) { |
354 | perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed for BOM" ); |
355 | |
356 | iconv_close(cd); |
357 | cd = reinterpret_cast<iconv_t>(-1); |
358 | |
359 | return QString(uc, len).toLatin1(); |
360 | } |
361 | } |
362 | state = new IconvState(cd); |
363 | } |
364 | if (state->cd == reinterpret_cast<iconv_t>(-1)) { |
365 | static int reported = 0; |
366 | if (!reported++) { |
367 | fprintf(stderr, |
368 | "QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv_open failed\n" ); |
369 | } |
370 | delete temporaryState; |
371 | return QString(uc, len).toLatin1(); |
372 | } |
373 | |
374 | size_t outBytesLeft = len; |
375 | QByteArray ba(outBytesLeft, Qt::Uninitialized); |
376 | outBytes = ba.data(); |
377 | |
378 | // now feed iconv() the real data |
379 | inBytes = const_cast<char *>(reinterpret_cast<const char *>(uc)); |
380 | inBytesLeft = len * sizeof(QChar); |
381 | |
382 | QByteArray in; |
383 | if (convState && convState->remainingChars) { |
384 | // we have one surrogate char to be prepended |
385 | in.resize(sizeof(QChar) + len); |
386 | inBytes = in.data(); |
387 | |
388 | QChar remaining = convState->state_data[0]; |
389 | memcpy(in.data(), &remaining, sizeof(QChar)); |
390 | memcpy(in.data() + sizeof(QChar), uc, inBytesLeft); |
391 | |
392 | inBytesLeft += sizeof(QChar); |
393 | convState->remainingChars = 0; |
394 | } |
395 | |
396 | int invalidCount = 0; |
397 | while (inBytesLeft != 0) { |
398 | if (iconv(state->cd, inBytesPtr, &inBytesLeft, &outBytes, &outBytesLeft) == (size_t) -1) { |
399 | if (errno == EINVAL && convState) { |
400 | // buffer ends in a surrogate |
401 | Q_ASSERT(inBytesLeft == 2); |
402 | convState->remainingChars = 1; |
403 | convState->state_data[0] = uc[len - 1].unicode(); |
404 | break; |
405 | } |
406 | |
407 | switch (errno) { |
408 | case EILSEQ: |
409 | ++invalidCount; |
410 | // fall through |
411 | case EINVAL: |
412 | { |
413 | inBytes += sizeof(QChar); |
414 | inBytesLeft -= sizeof(QChar); |
415 | break; |
416 | } |
417 | case E2BIG: |
418 | { |
419 | int offset = ba.size() - outBytesLeft; |
420 | ba.resize(ba.size() * 2); |
421 | outBytes = ba.data() + offset; |
422 | outBytesLeft = ba.size() - offset; |
423 | break; |
424 | } |
425 | default: |
426 | { |
427 | // note, cannot use qWarning() since we are implementing the codecForLocale :) |
428 | perror("QIconvCodec::convertFromUnicode: using Latin-1 for conversion, iconv failed" ); |
429 | |
430 | // reset to initial state |
431 | iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft); |
432 | |
433 | delete temporaryState; |
434 | return QString(uc, len).toLatin1(); |
435 | } |
436 | } |
437 | } |
438 | } |
439 | |
440 | // reset to initial state |
441 | iconv(state->cd, 0, &inBytesLeft, 0, &outBytesLeft); |
442 | setByteOrder(state->cd); |
443 | |
444 | ba.resize(ba.size() - outBytesLeft); |
445 | |
446 | if (convState) |
447 | convState->invalidChars = invalidCount; |
448 | |
449 | delete temporaryState; |
450 | return ba; |
451 | } |
452 | |
453 | QByteArray QIconvCodec::name() const |
454 | { |
455 | return "System" ; |
456 | } |
457 | |
458 | int QIconvCodec::mibEnum() const |
459 | { |
460 | return 0; |
461 | } |
462 | |
463 | iconv_t QIconvCodec::createIconv_t(const char *to, const char *from) |
464 | { |
465 | Q_ASSERT((to == 0 && from != 0) || (to != 0 && from == 0)); |
466 | |
467 | iconv_t cd = (iconv_t) -1; |
468 | #if defined(__GLIBC__) || defined(GNU_LIBICONV) || defined(Q_OS_QNX) |
469 | #if defined(Q_OS_QNX) |
470 | // on QNX the default locale is UTF-8, and an empty string will cause iconv_open to fail |
471 | static const char empty_codeset[] = "UTF-8" ; |
472 | #else |
473 | // both GLIBC and libgnuiconv will use the locale's encoding if from or to is an empty string |
474 | static const char empty_codeset[] = "" ; |
475 | #endif |
476 | const char *codeset = empty_codeset; |
477 | cd = iconv_open(to ? to : codeset, from ? from : codeset); |
478 | #else |
479 | char *codeset = 0; |
480 | #endif |
481 | |
482 | #if defined(_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF) |
483 | if (cd == (iconv_t) -1) { |
484 | codeset = nl_langinfo(CODESET); |
485 | if (codeset) |
486 | cd = iconv_open(to ? to : codeset, from ? from : codeset); |
487 | } |
488 | #endif |
489 | |
490 | if (cd == (iconv_t) -1) { |
491 | // Very poorly defined and followed standards causes lots of |
492 | // code to try to get all the cases... This logic is |
493 | // duplicated in QTextCodec, so if you change it here, change |
494 | // it there too. |
495 | |
496 | // Try to determine locale codeset from locale name assigned to |
497 | // LC_CTYPE category. |
498 | |
499 | // First part is getting that locale name. First try setlocale() which |
500 | // definitely knows it, but since we cannot fully trust it, get ready |
501 | // to fall back to environment variables. |
502 | char * ctype = qstrdup(setlocale(LC_CTYPE, 0)); |
503 | |
504 | // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG |
505 | // environment variables. |
506 | char * lang = qstrdup(qgetenv("LC_ALL" ).constData()); |
507 | if (!lang || lang[0] == 0 || strcmp(lang, "C" ) == 0) { |
508 | if (lang) delete [] lang; |
509 | lang = qstrdup(qgetenv("LC_CTYPE" ).constData()); |
510 | } |
511 | if (!lang || lang[0] == 0 || strcmp(lang, "C" ) == 0) { |
512 | if (lang) delete [] lang; |
513 | lang = qstrdup(qgetenv("LANG" ).constData()); |
514 | } |
515 | |
516 | // Now try these in order: |
517 | // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
518 | // 2. CODESET from lang if it contains a .CODESET part |
519 | // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
520 | // 4. locale (ditto) |
521 | // 5. check for "@euro" |
522 | |
523 | // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
524 | codeset = ctype ? strchr(ctype, '.') : 0; |
525 | if (codeset && *codeset == '.') { |
526 | ++codeset; |
527 | cd = iconv_open(to ? to : codeset, from ? from : codeset); |
528 | } |
529 | |
530 | // 2. CODESET from lang if it contains a .CODESET part |
531 | codeset = lang ? strchr(lang, '.') : 0; |
532 | if (cd == (iconv_t) -1 && codeset && *codeset == '.') { |
533 | ++codeset; |
534 | cd = iconv_open(to ? to : codeset, from ? from : codeset); |
535 | } |
536 | |
537 | // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
538 | if (cd == (iconv_t) -1 && ctype && *ctype != 0 && strcmp (ctype, "C" ) != 0) |
539 | cd = iconv_open(to ? to : ctype, from ? from : ctype); |
540 | |
541 | |
542 | // 4. locale (ditto) |
543 | if (cd == (iconv_t) -1 && lang && *lang != 0) |
544 | cd = iconv_open(to ? to : lang, from ? from : lang); |
545 | |
546 | // 5. "@euro" |
547 | if ((cd == (iconv_t) -1 && ctype && strstr(ctype, "@euro" )) || (lang && strstr(lang, "@euro" ))) |
548 | cd = iconv_open(to ? to : "ISO8859-15" , from ? from : "ISO8859-15" ); |
549 | |
550 | delete [] ctype; |
551 | delete [] lang; |
552 | } |
553 | |
554 | return cd; |
555 | } |
556 | |
557 | QT_END_NAMESPACE |
558 | |