/* Simple transformation functions.
   Copyright (C) 1997-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#include <byteswap.h>
20#include <dlfcn.h>
21#include <endian.h>
22#include <errno.h>
23#include <gconv.h>
24#include <stdint.h>
25#include <stdlib.h>
26#include <string.h>
27#include <wchar.h>
28#include <sys/param.h>
29#include <gconv_int.h>
30
31#define BUILTIN_ALIAS(s1, s2) /* nothing */
32#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
33 MinF, MaxF, MinT, MaxT) \
34 extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \
35 const unsigned char **, const unsigned char *, \
36 unsigned char **, size_t *, int, int);
37#include "gconv_builtin.h"
38
39
40#ifndef EILSEQ
41# define EILSEQ EINVAL
42#endif
43
44
/* Single-byte to INTERNAL conversion that recognizes ASCII only: a byte
   below 0x80 is returned unchanged, any other byte yields WEOF.  STEP is
   not used.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  return c < 0x80 ? (wint_t) c : WEOF;
}
55
56
57/* Transform from the internal, UCS4-like format, to UCS4. The
58 difference between the internal ucs4 format and the real UCS4
59 format is, if any, the endianness. The Unicode/ISO 10646 says that
60 unless some higher protocol specifies it differently, the byte
61 order is big endian.*/
62#define DEFINE_INIT 0
63#define DEFINE_FINI 0
64#define MIN_NEEDED_FROM 4
65#define MIN_NEEDED_TO 4
66#define FROM_DIRECTION 1
67#define FROM_LOOP internal_ucs4_loop
68#define TO_LOOP internal_ucs4_loop /* This is not used. */
69#define FUNCTION_NAME __gconv_transform_internal_ucs4
70#define ONE_DIRECTION 0
71
72
73static inline int
74__attribute ((always_inline))
75internal_ucs4_loop (struct __gconv_step *step,
76 struct __gconv_step_data *step_data,
77 const unsigned char **inptrp, const unsigned char *inend,
78 unsigned char **outptrp, const unsigned char *outend,
79 size_t *irreversible)
80{
81 const unsigned char *inptr = *inptrp;
82 unsigned char *outptr = *outptrp;
83 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
84 int result;
85
86#if __BYTE_ORDER == __LITTLE_ENDIAN
87 /* Sigh, we have to do some real work. */
88 size_t cnt;
89
90 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
91 {
92 uint32_t val = get32 (inptr);
93 put32 (outptr, __builtin_bswap32 (val));
94 }
95
96 *inptrp = inptr;
97 *outptrp = outptr;
98#elif __BYTE_ORDER == __BIG_ENDIAN
99 /* Simply copy the data. */
100 *inptrp = inptr + n_convert * 4;
101 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
102#else
103# error "This endianness is not supported."
104#endif
105
106 /* Determine the status. */
107 if (*inptrp == inend)
108 result = __GCONV_EMPTY_INPUT;
109 else if (*outptrp + 4 > outend)
110 result = __GCONV_FULL_OUTPUT;
111 else
112 result = __GCONV_INCOMPLETE_INPUT;
113
114 return result;
115}
116
117
/* Single-character variant of the INTERNAL -> UCS4 loop, for a character
   whose bytes are spread over several input buffers.  Bytes already
   collected live in STEP_DATA->__statep (low three bits of __count hold
   their number).  Input bytes are appended until four are available; if
   the input runs out first the new count is recorded and
   __GCONV_INCOMPLETE_INPUT is returned.  Otherwise the character is
   written big endian to *OUTPTRP, the count is cleared and __GCONV_OK is
   returned.  OUTEND is not consulted here.  */
static inline int
__attribute ((always_inline))
internal_ucs4_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *st = step_data->__statep;
  size_t have = st->__count & 7;

  /* Top up the pending bytes from the input.  */
  for (; *inptrp < inend && have < 4; ++*inptrp)
    st->__value.__wchb[have++] = **inptrp;

  if (__glibc_unlikely (have < 4))
    {
      /* Still incomplete: remember how many bytes are stored.  */
      st->__count = (st->__count & ~7) | have;
      return __GCONV_INCOMPLETE_INPUT;
    }

  unsigned char *out = *outptrp;
#if __BYTE_ORDER == __LITTLE_ENDIAN
  /* Reverse the host-order bytes to get big endian output.  */
  for (size_t i = 0; i < 4; ++i)
    out[i] = st->__value.__wchb[3 - i];
#elif __BYTE_ORDER == __BIG_ENDIAN
  /* Byte-wise copy, so alignment of the output does not matter.  */
  for (size_t i = 0; i < 4; ++i)
    out[i] = st->__value.__wchb[i];
#else
# error "This endianness is not supported."
#endif
  *outptrp = out + 4;

  /* The pending bytes have been consumed.  */
  st->__count &= ~7;

  return __GCONV_OK;
}
165
166#include <iconv/skeleton.c>
167
168
/* UCS4 -> INTERNAL.  Unlike the opposite direction the input values have
   to be validated here.

   The macros below configure the <iconv/skeleton.c> instance included
   after the two loop functions.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs4_internal_loop
#define TO_LOOP                 ucs4_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs4_internal
#define ONE_DIRECTION           0


/* Bulk loop: read big endian 4-byte units while a complete unit is
   available and the output has room for one.  Values above 0x7fffffff
   are invalid: they are skipped (counting in *IRREVERSIBLE) when
   __GCONV_IGNORE_ERRORS is set in STEP_DATA->__flags, otherwise
   __GCONV_ILLEGAL_INPUT is returned.  On normal termination the updated
   pointers are stored and the reason for stopping is returned.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop (struct __gconv_step *step,
                    struct __gconv_step_data *step_data,
                    const unsigned char **inptrp, const unsigned char *inend,
                    unsigned char **outptrp, const unsigned char *outend,
                    size_t *irreversible)
{
  const int flags = step_data->__flags;
  const unsigned char *src = *inptrp;
  unsigned char *dst = *outptrp;

  for (; src + 4 <= inend && dst + 4 <= outend; src += 4)
    {
      uint32_t wc = get32 (src);
#if __BYTE_ORDER == __LITTLE_ENDIAN
      wc = __builtin_bswap32 (wc);
#endif

      if (__glibc_likely (wc <= 0x7fffffff))
        {
          put32 (dst, wc);
          dst += sizeof (uint32_t);
          continue;
        }

      /* The value is too large.  No transliteration is attempted: this is
         not a character that merely cannot be represented, it is a
         genuine bug in the input since UCS4 does not allow such
         values.  */
      if (irreversible == NULL)
        /* We are transliterating, don't try to correct anything.  Note
           that the caller's pointers are left untouched on this path.  */
        return __GCONV_ILLEGAL_INPUT;

      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          *inptrp = src;
          *outptrp = dst;
          return __GCONV_ILLEGAL_INPUT;
        }

      /* Just ignore this character.  */
      ++*irreversible;
    }

  *inptrp = src;
  *outptrp = dst;

  /* Report why the loop stopped.  */
  if (*inptrp == inend)
    return __GCONV_EMPTY_INPUT;
  if (*outptrp + 4 > outend)
    return __GCONV_FULL_OUTPUT;
  return __GCONV_INCOMPLETE_INPUT;
}
241
242
/* Single-character variant of the UCS4 -> INTERNAL loop, for a character
   whose bytes are spread over several input buffers.

   Bytes already collected live in STEP_DATA->__statep; the low three bits
   of __count hold their number.  Input bytes are appended until four are
   available.

   Returns:
     __GCONV_INCOMPLETE_INPUT  fewer than four bytes are available; the
                               new count is recorded in the state.
     __GCONV_ILLEGAL_INPUT     the value exceeds 0x7fffffff and
                               __GCONV_IGNORE_ERRORS is not set; *INPTRP
                               is moved back over the bytes consumed by
                               this call.
     __GCONV_OK                the character was stored in host byte order
                               at *OUTPTRP (or was invalid and ignored);
                               the stored count is cleared.

   OUTEND and IRREVERSIBLE are not consulted here.  */
static inline int
__attribute ((always_inline))
ucs4_internal_loop_single (struct __gconv_step *step,
                           struct __gconv_step_data *step_data,
                           const unsigned char **inptrp,
                           const unsigned char *inend,
                           unsigned char **outptrp,
                           const unsigned char *outend,
                           size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* The input is big endian, so byte 0 is the most significant one.  The
     character is valid only if its top bit is clear, i.e. the byte is at
     most 0x7f; this matches the `inval > 0x7fffffff' test in
     ucs4_internal_loop.  */
  if (__glibc_unlikely (((unsigned char *) state->__value.__wchb)[0] > 0x7f))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          /* Give back the bytes taken from the input by this call; the
             ones stored by earlier calls cannot be returned.  */
          *inptrp -= cnt - (state->__count & 7);
          return __GCONV_ILLEGAL_INPUT;
        }
    }
  else
    {
#if __BYTE_ORDER == __LITTLE_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#elif __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
304
305#include <iconv/skeleton.c>
306
307
308/* Similarly for the little endian form. */
309#define DEFINE_INIT 0
310#define DEFINE_FINI 0
311#define MIN_NEEDED_FROM 4
312#define MIN_NEEDED_TO 4
313#define FROM_DIRECTION 1
314#define FROM_LOOP internal_ucs4le_loop
315#define TO_LOOP internal_ucs4le_loop /* This is not used. */
316#define FUNCTION_NAME __gconv_transform_internal_ucs4le
317#define ONE_DIRECTION 0
318
319
320static inline int
321__attribute ((always_inline))
322internal_ucs4le_loop (struct __gconv_step *step,
323 struct __gconv_step_data *step_data,
324 const unsigned char **inptrp, const unsigned char *inend,
325 unsigned char **outptrp, const unsigned char *outend,
326 size_t *irreversible)
327{
328 const unsigned char *inptr = *inptrp;
329 unsigned char *outptr = *outptrp;
330 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
331 int result;
332
333#if __BYTE_ORDER == __BIG_ENDIAN
334 /* Sigh, we have to do some real work. */
335 size_t cnt;
336
337 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
338 {
339 uint32_t val = get32 (inptr);
340 put32 (outptr, __builtin_bswap32 (val));
341 }
342
343 *inptrp = inptr;
344 *outptrp = outptr;
345#elif __BYTE_ORDER == __LITTLE_ENDIAN
346 /* Simply copy the data. */
347 *inptrp = inptr + n_convert * 4;
348 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
349#else
350# error "This endianness is not supported."
351#endif
352
353 /* Determine the status. */
354 if (*inptrp == inend)
355 result = __GCONV_EMPTY_INPUT;
356 else if (*outptrp + 4 > outend)
357 result = __GCONV_FULL_OUTPUT;
358 else
359 result = __GCONV_INCOMPLETE_INPUT;
360
361 return result;
362}
363
364
/* Single-character variant of the INTERNAL -> UCS4LE loop, for a
   character whose bytes are spread over several input buffers.  Bytes
   already collected live in STEP_DATA->__statep (low three bits of
   __count hold their number).  Input bytes are appended until four are
   available; if the input runs out first the new count is recorded and
   __GCONV_INCOMPLETE_INPUT is returned.  Otherwise the character is
   written little endian to *OUTPTRP, the count is cleared and __GCONV_OK
   is returned.  OUTEND is not consulted here.  */
static inline int
__attribute ((always_inline))
internal_ucs4le_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *st = step_data->__statep;
  size_t have = st->__count & 7;

  /* Top up the pending bytes from the input.  */
  for (; *inptrp < inend && have < 4; ++*inptrp)
    st->__value.__wchb[have++] = **inptrp;

  if (__glibc_unlikely (have < 4))
    {
      /* Still incomplete: remember how many bytes are stored.  */
      st->__count = (st->__count & ~7) | have;
      return __GCONV_INCOMPLETE_INPUT;
    }

  unsigned char *out = *outptrp;
#if __BYTE_ORDER == __BIG_ENDIAN
  /* Reverse the host-order bytes to get little endian output.  */
  for (size_t i = 0; i < 4; ++i)
    out[i] = st->__value.__wchb[3 - i];
#else
  /* Byte-wise copy, so alignment of the output does not matter.  */
  for (size_t i = 0; i < 4; ++i)
    out[i] = st->__value.__wchb[i];
#endif
  *outptrp = out + 4;

  /* The pending bytes have been consumed.  */
  st->__count &= ~7;

  return __GCONV_OK;
}
412
413#include <iconv/skeleton.c>
414
415
/* UCS4LE -> INTERNAL.  As for UCS4 -> INTERNAL the input values have to
   be validated.

   The macros below configure the <iconv/skeleton.c> instance included
   after the two loop functions.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs4le_internal_loop
#define TO_LOOP                 ucs4le_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs4le_internal
#define ONE_DIRECTION           0


/* Bulk loop: read little endian 4-byte units while a complete unit is
   available and the output has room for one.  Values above 0x7fffffff
   are invalid: they are skipped (counting in *IRREVERSIBLE) when
   __GCONV_IGNORE_ERRORS is set in STEP_DATA->__flags, otherwise
   __GCONV_ILLEGAL_INPUT is returned.  On normal termination the updated
   pointers are stored and the reason for stopping is returned; a
   truncated trailing unit is reported as incomplete input before a full
   output buffer is considered.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop (struct __gconv_step *step,
                      struct __gconv_step_data *step_data,
                      const unsigned char **inptrp, const unsigned char *inend,
                      unsigned char **outptrp, const unsigned char *outend,
                      size_t *irreversible)
{
  const int flags = step_data->__flags;
  const unsigned char *src = *inptrp;
  unsigned char *dst = *outptrp;

  for (; src + 4 <= inend && dst + 4 <= outend; src += 4)
    {
      uint32_t wc = get32 (src);
#if __BYTE_ORDER == __BIG_ENDIAN
      wc = __builtin_bswap32 (wc);
#endif

      if (__glibc_likely (wc <= 0x7fffffff))
        {
          put32 (dst, wc);
          dst += sizeof (uint32_t);
          continue;
        }

      /* The value is too large.  No transliteration is attempted: this is
         not a character that merely cannot be represented, it is a
         genuine bug in the input since UCS4 does not allow such
         values.  */
      if (irreversible == NULL)
        /* We are transliterating, don't try to correct anything.  Note
           that the caller's pointers are left untouched on this path.  */
        return __GCONV_ILLEGAL_INPUT;

      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          *inptrp = src;
          *outptrp = dst;
          return __GCONV_ILLEGAL_INPUT;
        }

      /* Just ignore this character.  */
      ++*irreversible;
    }

  *inptrp = src;
  *outptrp = dst;

  /* Report why the loop stopped.  */
  if (*inptrp == inend)
    return __GCONV_EMPTY_INPUT;
  if (*inptrp + 4 > inend)
    return __GCONV_INCOMPLETE_INPUT;

  /* A complete unit is pending, so the output must be what ran out.  */
  assert (*outptrp + 4 > outend);
  return __GCONV_FULL_OUTPUT;
}
490
491
/* Single-character variant of the UCS4LE -> INTERNAL loop, for a
   character whose bytes are spread over several input buffers.

   Bytes already collected live in STEP_DATA->__statep; the low three bits
   of __count hold their number.  Input bytes are appended until four are
   available.

   Returns:
     __GCONV_INCOMPLETE_INPUT  fewer than four bytes are available; the
                               new count is recorded in the state.
     __GCONV_ILLEGAL_INPUT     the value exceeds 0x7fffffff and
                               __GCONV_IGNORE_ERRORS is not set; *INPTRP
                               is moved back over the bytes consumed by
                               this call, as ucs4_internal_loop_single
                               does.
     __GCONV_OK                the character was stored in host byte order
                               at *OUTPTRP (or was invalid and ignored);
                               the stored count is cleared.

   OUTEND and IRREVERSIBLE are not consulted here.  */
static inline int
__attribute ((always_inline))
ucs4le_internal_loop_single (struct __gconv_step *step,
                             struct __gconv_step_data *step_data,
                             const unsigned char **inptrp,
                             const unsigned char *inend,
                             unsigned char **outptrp,
                             const unsigned char *outend,
                             size_t *irreversible)
{
  mbstate_t *state = step_data->__statep;
  int flags = step_data->__flags;
  size_t cnt = state->__count & 7;

  while (*inptrp < inend && cnt < 4)
    state->__value.__wchb[cnt++] = *(*inptrp)++;

  if (__glibc_unlikely (cnt < 4))
    {
      /* Still not enough bytes.  Store the ones in the input buffer.  */
      state->__count &= ~7;
      state->__count |= cnt;

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* The input is little endian, so byte 3 is the most significant one.
     The character is valid only if its top bit is clear, i.e. the byte is
     at most 0x7f; this matches the `inval > 0x7fffffff' test in
     ucs4le_internal_loop.  */
  if (__glibc_unlikely (((unsigned char *) state->__value.__wchb)[3] > 0x7f))
    {
      /* The value is too large.  We don't try transliteration here since
         this is not an error because of the lack of possibilities to
         represent the result.  This is a genuine bug in the input since
         UCS4 does not allow such values.  */
      if (!(flags & __GCONV_IGNORE_ERRORS))
        {
          /* Give back the bytes taken from the input by this call; the
             ones stored by earlier calls cannot be returned.  */
          *inptrp -= cnt - (state->__count & 7);
          return __GCONV_ILLEGAL_INPUT;
        }
    }
  else
    {
#if __BYTE_ORDER == __BIG_ENDIAN
      (*outptrp)[0] = state->__value.__wchb[3];
      (*outptrp)[1] = state->__value.__wchb[2];
      (*outptrp)[2] = state->__value.__wchb[1];
      (*outptrp)[3] = state->__value.__wchb[0];
#else
      (*outptrp)[0] = state->__value.__wchb[0];
      (*outptrp)[1] = state->__value.__wchb[1];
      (*outptrp)[2] = state->__value.__wchb[2];
      (*outptrp)[3] = state->__value.__wchb[3];
#endif

      *outptrp += 4;
    }

  /* Clear the state buffer.  */
  state->__count &= ~7;

  return __GCONV_OK;
}
550
551#include <iconv/skeleton.c>
552
553
/* ISO 646-IRV (ASCII) -> INTERNAL (UCS4-like).  The macros configure the
   <iconv/loop.c> and <iconv/skeleton.c> instances included right after
   this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         1
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ascii_internal_loop
#define TO_LOOP                 ascii_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ascii_internal
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: widen a single ASCII byte to a 32-bit unit.  */
#define BODY \
  { \
    if (__glibc_likely (*inptr <= '\x7f')) \
      { \
        /* A one byte sequence: store it as a 32-bit value.  */ \
        *((uint32_t *) outptr) = *inptr++; \
        outptr += sizeof (uint32_t); \
      } \
    else \
      { \
        /* The value is too large.  No transliteration is attempted: \
           this is not a character that merely cannot be represented, \
           it is a genuine bug in the input since ASCII does not allow \
           such values.  */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (1); \
      } \
  }
#define LOOP_NEED_FLAGS
586#include <iconv/loop.c>
587#include <iconv/skeleton.c>
588
589
/* INTERNAL (UCS4-like) -> ISO 646-IRV (ASCII).  The macros configure the
   <iconv/loop.c> and <iconv/skeleton.c> instances included right after
   this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           1
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ascii_loop
#define TO_LOOP                 internal_ascii_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ascii
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: narrow a 32-bit unit to a single ASCII byte.  */
#define BODY \
  { \
    const uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc <= 0x7f)) \
      { \
        /* It fits into one byte.  */ \
        *outptr++ = wc; \
        inptr += sizeof (uint32_t); \
      } \
    else \
      { \
        /* Not representable: run the tag handler first, then the \
           standard error handling.  */ \
        UNICODE_TAG_HANDLER (wc, 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
  }
#define LOOP_NEED_FLAGS
619#include <iconv/loop.c>
620#include <iconv/skeleton.c>
621
622
/* INTERNAL (UCS4-like) -> UTF-8.  The macros configure the
   <iconv/loop.c> and <iconv/skeleton.c> instances included right after
   this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           1
#define MAX_NEEDED_TO           6
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_utf8_loop
#define TO_LOOP                 internal_utf8_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_utf8
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: encode a 32-bit unit as 1 to 6 UTF-8 bytes.  */
#define BODY \
  { \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc < 0x80)) \
      /* It's a one byte sequence.  */ \
      *outptr++ = (unsigned char) wc; \
    else if (__glibc_unlikely (wc > 0x7fffffff \
                               || (wc >= 0xd800 && wc <= 0xdfff))) \
      { \
        /* Out of range, or a UTF-16 surrogate.  */ \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else \
      { \
        unsigned char *lead = outptr; \
        size_t nbytes = 2; \
        \
        /* Find the shortest form: N bytes carry 5 * N + 1 bits.  */ \
        while (nbytes < 6 \
               && (wc & (~(uint32_t)0 << (5 * nbytes + 1))) != 0) \
          ++nbytes; \
        \
        if (__glibc_unlikely (outptr + nbytes > outend)) \
          { \
            /* Too long.  */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        /* Length marker of the lead byte; its payload is added last.  */ \
        *lead = (unsigned char) (~0xff >> nbytes); \
        outptr += nbytes; \
        /* Fill the trail bytes from the last one backwards.  */ \
        do \
          { \
            lead[--nbytes] = 0x80 | (wc & 0x3f); \
            wc >>= 6; \
          } \
        while (nbytes > 1); \
        lead[0] |= wc; \
      } \
    \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
682#include <iconv/loop.c>
683#include <iconv/skeleton.c>
684
685
/* UTF-8 -> INTERNAL (UCS4-like).  The macros configure the
   <iconv/loop.c> and <iconv/skeleton.c> instances included at the end of
   this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         6
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               utf8_internal_loop
#define TO_LOOP                 utf8_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_utf8_internal
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: decode one UTF-8 sequence of 1 to 6 bytes into a
   32-bit unit.  */
#define BODY \
  { \
    /* Lead byte; the decoded character is accumulated here.  */ \
    uint32_t ch = *inptr; \
    \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence.  */ \
        ++inptr; \
      } \
    else \
      { \
        unsigned int cnt; \
        unsigned int i; \
        \
        /* Derive the sequence length from the lead byte and keep only \
           its payload bits.  */ \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* Two bytes.  0xc0 and 0xc1 are excluded: such a character \
               could have been represented using a single byte.  */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* Three bytes.  */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* Four bytes.  */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* Five bytes.  */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* Six bytes.  */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character.  This \
               is the next byte with (x & 0xc0) != 0x80, looking at no \
               more than five bytes in total.  */ \
            for (i = 1; \
                 inptr + i < inend \
                 && (*(inptr + i) & 0xc0) == 0x80 \
                 && i < 5; \
                 ++i) \
              ; \
            \
          errout: \
            /* Also reached by goto with I set by the code below.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input.  But before we report that \
               check that all the bytes are correct.  */ \
            for (i = 1; \
                 inptr + i < inend && (inptr[i] & 0xc0) == 0x80; \
                 ++i) \
              ; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            goto errout; \
          } \
        \
        /* Read the remaining bytes, stopping at the first one that is \
           not a trail byte.  */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t trail = inptr[i]; \
            \
            if ((trail & 0xc0) != 0x80) \
              /* This is an illegal encoding.  */ \
              break; \
            \
            ch = (ch << 6) | (trail & 0x3f); \
          } \
        \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes.  */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
            /* Do not accept UTF-16 surrogates.  */ \
            || (ch >= 0xd800 && ch <= 0xdfff)) \
          { \
            /* This is an illegal encoding.  */ \
            goto errout; \
          } \
        \
        inptr += cnt; \
      } \
    \
    /* Store the decoded character and advance the output.  */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
813
/* Save an incomplete UTF-8 sequence in STATE.  The bytes available so
   far are folded into a partial character value as they are consumed.
   The first byte in the buffer is assumed to be correct and to announce
   more bytes than the input buffer holds.  Afterwards the low byte of
   state->__count is the number of bytes seen, the bits above it the
   expected total length, and state->__value.__wch the partial value
   shifted as if the missing trail bytes were all zero.  */
#define STORE_REST \
  { \
    wint_t partial = **inptrp; \
    size_t total, missing; \
    \
    state->__count = inend - *inptrp; \
    \
    assert (partial != 0xc0 && partial != 0xc1); \
    if (partial >= 0xc2 && partial < 0xe0) \
      { \
        /* Two bytes.  0xc0 and 0xc1 cannot occur: such a character \
           could have been represented using a single byte.  */ \
        total = 2; \
        partial &= 0x1f; \
      } \
    else if (__glibc_likely ((partial & 0xf0) == 0xe0)) \
      { \
        /* Three bytes.  */ \
        total = 3; \
        partial &= 0x0f; \
      } \
    else if (__glibc_likely ((partial & 0xf8) == 0xf0)) \
      { \
        /* Four bytes.  */ \
        total = 4; \
        partial &= 0x07; \
      } \
    else if (__glibc_likely ((partial & 0xfc) == 0xf8)) \
      { \
        /* Five bytes.  */ \
        total = 5; \
        partial &= 0x03; \
      } \
    else \
      { \
        /* Six bytes.  */ \
        total = 6; \
        partial &= 0x01; \
      } \
    \
    /* The first byte is already consumed; fold in the trail bytes that \
       are present.  */ \
    for (missing = total - 1; ++(*inptrp) < inend; --missing) \
      partial = (partial << 6) | (**inptrp & 0x3f); \
    \
    /* Shift for the so far missing bytes.  */ \
    partial <<= missing * 6; \
    \
    /* Store the number of bytes expected for the entire sequence.  */ \
    state->__count |= total << 8; \
    \
    /* Store the value.  */ \
    state->__value.__wch = partial; \
  }
877
/* Rebuild in BYTEBUF the bytes of a partial UTF-8 sequence from the
   value and counts kept in STATE: the low byte of state->__count is the
   number of bytes seen (stored in INLEN), the bits above it the total
   sequence length.  */
#define UNPACK_BYTES \
  { \
    /* Lead byte markers for sequences of 2 to 6 bytes.  */ \
    static const unsigned char leadmark[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t value = state->__value.__wch; \
    size_t pos = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = leadmark[pos - 2]; \
    \
    /* Walk from the last trail position down to position 1.  Only \
       positions that were actually seen are written, but the value is \
       shifted for every position.  */ \
    do \
      { \
        --pos; \
        if (pos < inlen) \
          bytebuf[pos] = 0x80 | (value & 0x3f); \
        value >>= 6; \
      } \
    while (pos > 1); \
    \
    /* What is left are the payload bits of the lead byte.  */ \
    bytebuf[0] |= value; \
  }

/* Forget any partially read sequence.  */
#define CLEAR_STATE \
  state->__count = 0
901
902
903#include <iconv/loop.c>
904#include <iconv/skeleton.c>
905
906
/* UCS2 -> INTERNAL (UCS4-like).  The macros configure the
   <iconv/loop.c> and <iconv/skeleton.c> instances included right after
   this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         2
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs2_internal_loop
#define TO_LOOP                 ucs2_internal_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_ucs2_internal
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: widen a 16-bit unit to a 32-bit unit.  */
#define BODY \
  { \
    const uint16_t unit = get16 (inptr); \
    \
    if (__glibc_unlikely (unit >= 0xd800 && unit < 0xe000)) \
      { \
        /* Surrogate characters in UCS-2 input are not valid.  Reject \
           them.  (Catching this here is not security relevant.)  */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (2); \
      } \
    \
    *((uint32_t *) outptr) = unit; \
    inptr += 2; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
937#include <iconv/loop.c>
938#include <iconv/skeleton.c>
939
940
/* INTERNAL (UCS4-like) -> UCS2.  The macros configure the
   <iconv/loop.c> and <iconv/skeleton.c> instances included right after
   this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           2
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs2_loop
#define TO_LOOP                 internal_ucs2_loop /* This is not used.  */
#define FUNCTION_NAME           __gconv_transform_internal_ucs2
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: narrow a 32-bit unit to a 16-bit unit.  */
#define BODY \
  { \
    const uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_unlikely (wc >= 0x10000)) \
      { \
        /* Does not fit into 16 bits.  */ \
        UNICODE_TAG_HANDLER (wc, 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else if (__glibc_likely (wc < 0xd800 || wc >= 0xe000)) \
      { \
        put16 (outptr, wc); \
        outptr += sizeof (uint16_t); \
        inptr += 4; \
      } \
    else \
      { \
        /* Surrogate characters in UCS-4 input are not valid. \
           We must catch this, because the UCS-2 output might be \
           interpreted as UTF-16 by other programs.  If we let \
           surrogates pass through, attackers could make a security \
           hole exploit by synthesizing any desired plane 1-16 \
           character.  */ \
        /* RESULT is set before the ignore check, so it stays \
           __GCONV_ILLEGAL_INPUT even when the character is skipped.  */ \
        result = __GCONV_ILLEGAL_INPUT; \
        if (! ignore_errors_p ()) \
          break; \
        inptr += 4; \
        ++*irreversible; \
        continue; \
      } \
  }
#define LOOP_NEED_FLAGS
987#include <iconv/loop.c>
988#include <iconv/skeleton.c>
989
990
/* UCS2 in the other endianness -> INTERNAL (UCS4-like).  The macros
   configure the <iconv/loop.c> and <iconv/skeleton.c> instances included
   right after this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         2
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          1
#define FROM_LOOP               ucs2reverse_internal_loop
#define TO_LOOP                 ucs2reverse_internal_loop/* This is not used.*/
#define FUNCTION_NAME           __gconv_transform_ucs2reverse_internal
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: byte-swap a 16-bit unit and widen it to 32 bits.  */
#define BODY \
  { \
    const uint16_t unit = bswap_16 (get16 (inptr)); \
    \
    if (__glibc_likely (unit < 0xd800 || unit >= 0xe000)) \
      { \
        *((uint32_t *) outptr) = unit; \
        outptr += sizeof (uint32_t); \
        inptr += 2; \
      } \
    else if (ignore_errors_p ()) \
      { \
        /* Surrogate characters in UCS-2 input are not valid.  Skip \
           this one and count it.  (Catching this here is not security \
           relevant.)  */ \
        inptr += 2; \
        ++*irreversible; \
        continue; \
      } \
    else \
      { \
        /* Invalid surrogate and errors are not ignored: reject.  */ \
        result = __GCONV_ILLEGAL_INPUT; \
        break; \
      } \
  }
#define LOOP_NEED_FLAGS
1028#include <iconv/loop.c>
1029#include <iconv/skeleton.c>
1030
1031
/* INTERNAL (UCS4-like) -> UCS2 in the other endianness.  The macros
   configure the <iconv/loop.c> and <iconv/skeleton.c> instances included
   right after this section.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         4
#define MIN_NEEDED_TO           2
#define FROM_DIRECTION          1
#define FROM_LOOP               internal_ucs2reverse_loop
#define TO_LOOP                 internal_ucs2reverse_loop/* This is not used.*/
#define FUNCTION_NAME           __gconv_transform_internal_ucs2reverse
#define ONE_DIRECTION           1

#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One loop iteration: narrow a 32-bit unit to 16 bits and byte-swap it.  */
#define BODY \
  { \
    const uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_unlikely (wc >= 0x10000)) \
      { \
        /* Does not fit into 16 bits.  */ \
        UNICODE_TAG_HANDLER (wc, 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else if (__glibc_likely (wc < 0xd800 || wc >= 0xe000)) \
      { \
        put16 (outptr, bswap_16 (wc)); \
        outptr += sizeof (uint16_t); \
        inptr += 4; \
      } \
    else if (ignore_errors_p ()) \
      { \
        /* Surrogate characters in UCS-4 input are not valid. \
           We must catch this, because the UCS-2 output might be \
           interpreted as UTF-16 by other programs.  If we let \
           surrogates pass through, attackers could make a security \
           hole exploit by synthesizing any desired plane 1-16 \
           character.  Here the character is skipped and counted.  */ \
        inptr += 4; \
        ++*irreversible; \
        continue; \
      } \
    else \
      { \
        /* Invalid surrogate and errors are not ignored: reject.  */ \
        result = __GCONV_ILLEGAL_INPUT; \
        break; \
      } \
  }
#define LOOP_NEED_FLAGS
1079#include <iconv/loop.c>
1080#include <iconv/skeleton.c>
1081

/* Source code of glibc/iconv/gconv_simple.c.  */