utf-16.c source code [glibc/iconvdata/utf-16.c]

/* Conversion module for UTF-16.
   Copyright (C) 1999-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <byteswap.h>
20	#include <dlfcn.h>
21	#include <gconv.h>
22	#include <stddef.h>
23	#include <stdint.h>
24	#include <stdlib.h>
25	#include <string.h>
26
/* This is the Byte Order Mark character (BOM).  */
#define BOM	0xfeff
/* And in the other byte order: the value obtained when the two bytes
   of BOM are read swapped.  */
#define BOM_OE	0xfffe
31
32
/* Definitions used in the body of the `gconv' function.  */
#define FROM_LOOP		from_utf16_loop
#define TO_LOOP			to_utf16_loop
#define DEFINE_INIT		0
#define DEFINE_FINI		0
/* Byte counts per character: a UTF-16 character takes 2 or 4 bytes,
   the other side of the conversion always takes 4.  */
#define MIN_NEEDED_FROM		2
#define MAX_NEEDED_FROM		4
#define MIN_NEEDED_TO		4
#define ONE_DIRECTION		0
#define FROM_DIRECTION		(dir == from_utf16)
/* Set up the locals `dir', `var' and `swap' from the step data.

   On the first invocation (data->__invocation_counter == 0):
   - for plain UTF-16 when decoding, look at the first two input bytes;
     a BOM is skipped, a byte-swapped BOM is skipped and additionally
     sets __GCONV_SWAP in data->__flags;
   - for plain UTF-16 when encoding and not in internal use, write a
     BOM to the output first;
   - for UTF-16LE/UTF-16BE, set __GCONV_SWAP when the requested byte
     order differs from the host's BYTE_ORDER.

   The expansion may return early with __GCONV_EMPTY_INPUT,
   __GCONV_INCOMPLETE_INPUT or __GCONV_FULL_OUTPUT.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir; \
  enum variant var = ((struct utf16_data *) step->__data)->var; \
  if (__glibc_unlikely (data->__invocation_counter == 0)) \
    { \
      if (var == UTF_16) \
        { \
          if (FROM_DIRECTION) \
            { \
              /* We have to find out which byte order the file is \
                 encoded in.  */ \
              if (inptr + 2 > inend) \
                return (inptr == inend \
                        ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); \
              \
              if (get16u (inptr) == BOM) \
                /* Simply ignore the BOM character.  */ \
                *inptrp = inptr += 2; \
              else if (get16u (inptr) == BOM_OE) \
                { \
                  data->__flags |= __GCONV_SWAP; \
                  *inptrp = inptr += 2; \
                } \
            } \
          else if (!FROM_DIRECTION && !data->__internal_use) \
            { \
              /* Emit the Byte Order Mark.  */ \
              if (__glibc_unlikely (outbuf + 2 > outend)) \
                return __GCONV_FULL_OUTPUT; \
              \
              put16u (outbuf, BOM); \
              outbuf += 2; \
            } \
        } \
      else if ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN) \
               || (var == UTF_16BE && BYTE_ORDER == LITTLE_ENDIAN)) \
        data->__flags |= __GCONV_SWAP; \
    } \
  const int swap = data->__flags & __GCONV_SWAP;
/* Pass the computed `swap' flag on to both conversion loops.  */
#define EXTRA_LOOP_ARGS		, swap
83
84
/* Direction of the transformation.  */
enum direction
{
  illegal_dir,		/* No direction determined (initial value).  */
  to_utf16,		/* Internal format -> UTF-16.  */
  from_utf16		/* UTF-16 -> internal format.  */
};

/* Which flavour of UTF-16 the step was opened for.  */
enum variant
{
  illegal_var,		/* No variant determined (initial value).  */
  UTF_16,		/* "UTF-16//": byte order signalled by a BOM.  */
  UTF_16LE,		/* "UTF-16LE//".  */
  UTF_16BE		/* "UTF-16BE//".  */
};

/* Per-step state allocated by gconv_init and stored in step->__data.  */
struct utf16_data
{
  enum direction dir;
  enum variant var;
};
106
107
108	extern int gconv_init (struct __gconv_step *step);
109	int
110	gconv_init (struct __gconv_step *step)
111	{
112	/ Determine which direction. /
113	struct utf16_data *new_data;
114	enum direction dir = illegal_dir;
115	enum variant var = illegal_var;
116	int result;
117
118	if (__strcasecmp (s1: step->__from_name, s2: "UTF-16//") == `0`)
119	{
120	dir = from_utf16;
121	var = UTF_16;
122	}
123	else if (__strcasecmp (s1: step->__to_name, s2: "UTF-16//") == `0`)
124	{
125	dir = to_utf16;
126	var = UTF_16;
127	}
128	else if (__strcasecmp (s1: step->__from_name, s2: "UTF-16BE//") == `0`)
129	{
130	dir = from_utf16;
131	var = UTF_16BE;
132	}
133	else if (__strcasecmp (s1: step->__to_name, s2: "UTF-16BE//") == `0`)
134	{
135	dir = to_utf16;
136	var = UTF_16BE;
137	}
138	else if (__strcasecmp (s1: step->__from_name, s2: "UTF-16LE//") == `0`)
139	{
140	dir = from_utf16;
141	var = UTF_16LE;
142	}
143	else if (__strcasecmp (s1: step->__to_name, s2: "UTF-16LE//") == `0`)
144	{
145	dir = to_utf16;
146	var = UTF_16LE;
147	}
148
149	result = __GCONV_NOCONV;
150	if (__builtin_expect (dir, to_utf16) != illegal_dir)
151	{
152	new_data = (struct utf16_data ) malloc (size: sizeof* (struct utf16_data));
153
154	result = __GCONV_NOMEM;
155	if (new_data != NULL)
156	{
157	new_data->dir = dir;
158	new_data->var = var;
159	step->__data = new_data;
160
161	if (dir == from_utf16)
162	{
163	step->__min_needed_from = MIN_NEEDED_FROM;
164	step->__max_needed_from = MAX_NEEDED_FROM;
165	step->__min_needed_to = MIN_NEEDED_TO;
166	step->__max_needed_to = MIN_NEEDED_TO;
167	}
168	else
169	{
170	step->__min_needed_from = MIN_NEEDED_TO;
171	step->__max_needed_from = MIN_NEEDED_TO;
172	step->__min_needed_to = MIN_NEEDED_FROM;
173	step->__max_needed_to = MAX_NEEDED_FROM;
174	}
175
176	step->__stateful = `0`;
177
178	result = __GCONV_OK;
179	}
180	}
181
182	return result;
183	}
184
185
186	extern void gconv_end (struct __gconv_step *data);
187	void
188	gconv_end (struct __gconv_step *data)
189	{
190	free (ptr: data->__data);
191	}
192
193
194	/ Convert from the internal (UCS4-like) format to UTF-16. /
195	#define MIN_NEEDED_INPUT MIN_NEEDED_TO
196	#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
197	#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
198	#define LOOPFCT TO_LOOP
199	#define BODY \
200	{ \
201	uint32_t c = get32 (inptr); \
202	\
203	if (__glibc_unlikely (c >= 0xd800 && c < 0xe000)) \
204	{ \
205	/* Surrogate characters in UCS-4 input are not valid. \
206	We must catch this. If we let surrogates pass through, \
207	attackers could make a security hole exploit by \
208	synthesizing any desired plane 1-16 character. */ \
209	result = __GCONV_ILLEGAL_INPUT; \
210	if (! ignore_errors_p ()) \
211	break; \
212	inptr += 4; \
213	++*irreversible; \
214	continue; \
215	} \
216	\
217	if (swap) \
218	{ \
219	if (__glibc_unlikely (c >= 0x10000)) \
220	{ \
221	if (__glibc_unlikely (c >= 0x110000)) \
222	{ \
223	STANDARD_TO_LOOP_ERR_HANDLER (4); \
224	} \
225	\
226	/* Generate a surrogate character. */ \
227	if (__glibc_unlikely (outptr + 4 > outend)) \
228	{ \
229	/* Overflow in the output buffer. */ \
230	result = __GCONV_FULL_OUTPUT; \
231	break; \
232	} \
233	\
234	put16 (outptr, bswap_16 (0xd7c0 + (c >> 10))); \
235	outptr += 2; \
236	put16 (outptr, bswap_16 (0xdc00 + (c & 0x3ff))); \
237	} \
238	else \
239	put16 (outptr, bswap_16 (c)); \
240	} \
241	else \
242	{ \
243	if (__glibc_unlikely (c >= 0x10000)) \
244	{ \
245	if (__glibc_unlikely (c >= 0x110000)) \
246	{ \
247	STANDARD_TO_LOOP_ERR_HANDLER (4); \
248	} \
249	\
250	/* Generate a surrogate character. */ \
251	if (__glibc_unlikely (outptr + 4 > outend)) \
252	{ \
253	/* Overflow in the output buffer. */ \
254	result = __GCONV_FULL_OUTPUT; \
255	break; \
256	} \
257	\
258	put16 (outptr, 0xd7c0 + (c >> 10)); \
259	outptr += 2; \
260	put16 (outptr, 0xdc00 + (c & 0x3ff)); \
261	} \
262	else \
263	put16 (outptr, c); \
264	} \
265	outptr += 2; \
266	inptr += 4; \
267	}
268	#define LOOP_NEED_FLAGS
269	#define EXTRA_LOOP_DECLS \
270	, int swap
271	#include <iconv/loop.c>
272
273
274	/ Convert from UTF-16 to the internal (UCS4-like) format. /
275	#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
276	#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
277	#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
278	#define LOOPFCT FROM_LOOP
279	#define BODY \
280	{ \
281	uint16_t u1 = get16 (inptr); \
282	\
283	if (swap) \
284	{ \
285	u1 = bswap_16 (u1); \
286	\
287	if (__builtin_expect (u1 < 0xd800, 1) \|\| u1 > 0xdfff) \
288	{ \
289	/* No surrogate. */ \
290	put32 (outptr, u1); \
291	inptr += 2; \
292	} \
293	else \
294	{ \
295	uint16_t u2; \
296	\
297	if (__glibc_unlikely (u1 >= 0xdc00)) \
298	{ \
299	/* This is no valid first word for a surrogate. */ \
300	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
301	} \
302	\
303	/* It's a surrogate character. At least the first word says \
304	it is. */ \
305	if (__glibc_unlikely (inptr + 4 > inend)) \
306	{ \
307	/* We don't have enough input for another complete input \
308	character. */ \
309	result = __GCONV_INCOMPLETE_INPUT; \
310	break; \
311	} \
312	\
313	inptr += 2; \
314	u2 = bswap_16 (get16 (inptr)); \
315	if (__builtin_expect (u2 < 0xdc00, 0) \
316	\|\| __builtin_expect (u2 > 0xdfff, 0)) \
317	{ \
318	/* This is no valid second word for a surrogate. */ \
319	inptr -= 2; \
320	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
321	} \
322	\
323	put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00)); \
324	inptr += 2; \
325	} \
326	} \
327	else \
328	{ \
329	if (__builtin_expect (u1 < 0xd800, 1) \|\| u1 > 0xdfff) \
330	{ \
331	/* No surrogate. */ \
332	put32 (outptr, u1); \
333	inptr += 2; \
334	} \
335	else \
336	{ \
337	if (__glibc_unlikely (u1 >= 0xdc00)) \
338	{ \
339	/* This is no valid first word for a surrogate. */ \
340	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
341	} \
342	\
343	/* It's a surrogate character. At least the first word says \
344	it is. */ \
345	if (__glibc_unlikely (inptr + 4 > inend)) \
346	{ \
347	/* We don't have enough input for another complete input \
348	character. */ \
349	result = __GCONV_INCOMPLETE_INPUT; \
350	break; \
351	} \
352	\
353	inptr += 2; \
354	uint16_t u2 = get16 (inptr); \
355	if (__builtin_expect (u2 < 0xdc00, 0) \
356	\|\| __builtin_expect (u2 > 0xdfff, 0)) \
357	{ \
358	/* This is no valid second word for a surrogate. */ \
359	inptr -= 2; \
360	STANDARD_FROM_LOOP_ERR_HANDLER (2); \
361	} \
362	\
363	put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00)); \
364	inptr += 2; \
365	} \
366	} \
367	outptr += 4; \
368	}
369	#define LOOP_NEED_FLAGS
370	#define EXTRA_LOOP_DECLS \
371	, int swap
372	#include <iconv/loop.c>
373
374
/* Now define the toplevel functions.  */
376	#include <iconv/skeleton.c>
377

source code of glibc/iconvdata/utf-16.c