1 | /* bug 19727: Testing UTF conversions with UTF16 surrogates as input. |
2 | Copyright (C) 2016-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <stdio.h> |
20 | #include <stdlib.h> |
21 | #include <errno.h> |
22 | #include <string.h> |
23 | #include <inttypes.h> |
24 | #include <iconv.h> |
25 | #include <byteswap.h> |
26 | |
/* Convert the INBUFLEN bytes at INBUF from charset FROM to charset TO
   with a single iconv call and compare the outcome with EXP_ERRNO.

   If EXP_ERRNO is 0 the call must not return (size_t) -1 and must leave
   errno at 0.  Otherwise the call must return (size_t) -1 and set errno
   to EXP_ERRNO.  LINE is only used in diagnostics so that a failure can
   be traced back to the invoking test case.

   Returns the number of failed checks (0 if everything matched), or 1
   if no converter from FROM to TO could be opened.  Diagnostics are
   written to stdout.  */
static int
run_conversion (const char *from, const char *to, char *inbuf, size_t inbuflen,
                int exp_errno, int line)
{
  char outbuf[16];
  char *inptr = inbuf;
  size_t inlen = inbuflen;
  char *outptr = outbuf;
  size_t outlen = sizeof (outbuf);
  int fails = 0;

  iconv_t cd = iconv_open (to, from);
  if (cd == (iconv_t) -1)
    {
      printf ("line %d: cannot convert from %s to %s: %m\n", line, from, to);
      return 1;
    }

  /* Clear errno first so that the "success" case can verify that iconv
     left it untouched, and save it right after the call because the
     stdio calls below may overwrite it.  */
  errno = 0;
  size_t n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  int e = errno;

  if (exp_errno == 0)
    {
      if (n == (size_t) -1)
        {
          puts ("n should be >= 0, but n == -1");
          fails++;
        }

      if (e != 0)
        {
          printf ("errno should be 0: 'Success', but errno == %d: '%s'\n",
                  e, strerror (e));
          fails++;
        }
    }
  else
    {
      if (n != (size_t) -1)
        {
          /* N is a size_t, so it has to be printed with %zu.  */
          printf ("n should be -1, but n == %zu\n", n);
          fails++;
        }

      if (e != exp_errno)
        {
          printf ("errno should be %d: '%s', but errno == %d: '%s'\n",
                  exp_errno, strerror (exp_errno), e, strerror (e));
          fails++;
        }
    }

  iconv_close (cd);

  if (fails > 0)
    {
      printf ("Errors in line %d while converting %s to %s.\n\n",
              line, from, to);
    }

  return fails;
}
98 | |
/* Number of elements in the array A (must be a real array, not a
   pointer).  */
#define BUG19727_NELEMS(a) (sizeof (a) / sizeof ((a)[0]))

/* Test conversion of one 32-bit code unit given by its big-endian bytes
   B0..B3 ...

   ... from INTERNAL to UTF-xx[LE|BE]:
   Converting from UCS4 to UTF-xx[LE|BE] first converts UCS4 to INTERNAL
   without checking for UTF-16 surrogate values
   and then converts from INTERNAL to UTF-xx[LE|BE].
   The latter conversion has to report an error in those cases.

   ... from UTF-32[LE|BE] to INTERNAL:
   Converting directly from UTF-32LE to UTF-8|16 is needed,
   because e.g. s390x has iconv-modules which converts directly.

   ERR is the expected errno (0 for success), LINE identifies the test
   case in diagnostics.  Returns the accumulated number of failures
   reported by run_conversion.  */
static int
run_ucs4_utf32_input (unsigned char b0, unsigned char b1, unsigned char b2,
                      unsigned char b3, int err, int line)
{
  static const char *const ucs4_targets[] =
    { "UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE" };
  static const char *const utf32_targets[] =
    { "WCHAR_T", "UTF-8", "UTF-16LE", "UTF-16BE" };
  char be[4] = { (char) b0, (char) b1, (char) b2, (char) b3 };
  char le[4] = { (char) b3, (char) b2, (char) b1, (char) b0 };
  int fails = 0;

  /* UCS4 is fed with the big-endian byte order.  */
  for (size_t i = 0; i < BUG19727_NELEMS (ucs4_targets); i++)
    fails += run_conversion ("UCS4", ucs4_targets[i], be, 4, err, line);

  for (size_t i = 0; i < BUG19727_NELEMS (utf32_targets); i++)
    fails += run_conversion ("UTF-32BE", utf32_targets[i], be, 4, err, line);

  for (size_t i = 0; i < BUG19727_NELEMS (utf32_targets); i++)
    fails += run_conversion ("UTF-32LE", utf32_targets[i], le, 4, err, line);

  return fails;
}

/* Test conversion from UTF16[LE|BE] to INTERNAL.
   Converting directly from UTF-16 to UTF-8|32 is needed,
   because e.g. s390x has iconv-modules which converts directly.

   B0 B1 and B2 B3 are the big-endian bytes of two UTF-16 code units.
   Use LEN == 2 or 4 to pass one or two of them to iconv.  ERR is the
   expected errno (0 for success), LINE identifies the test case in
   diagnostics.  Returns the accumulated number of failures reported by
   run_conversion.  */
static int
run_utf16_input (unsigned char b0, unsigned char b1, unsigned char b2,
                 unsigned char b3, size_t len, int err, int line)
{
  static const char *const targets[] =
    { "WCHAR_T", "UTF-8", "UTF-32LE", "UTF-32BE" };
  char be[4] = { (char) b0, (char) b1, (char) b2, (char) b3 };
  /* Little-endian input swaps the bytes within each 16-bit unit.  */
  char le[4] = { (char) b1, (char) b0, (char) b3, (char) b2 };
  int fails = 0;

  for (size_t i = 0; i < BUG19727_NELEMS (targets); i++)
    fails += run_conversion ("UTF-16BE", targets[i], be, len, err, line);

  for (size_t i = 0; i < BUG19727_NELEMS (targets); i++)
    fails += run_conversion ("UTF-16LE", targets[i], le, len, err, line);

  return fails;
}

/* Test conversion of the three-byte UTF-8 sequence B0 B1 B2 from UTF-8
   to INTERNAL.
   Converting directly from UTF-8 to UTF-16|32 is needed,
   because e.g. s390x has iconv-modules which converts directly.

   ERR is the expected errno (0 for success), LINE identifies the test
   case in diagnostics.  Returns the accumulated number of failures
   reported by run_conversion.  */
static int
run_utf8_3byte_input (unsigned char b0, unsigned char b1, unsigned char b2,
                      int err, int line)
{
  static const char *const targets[] =
    { "WCHAR_T", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE" };
  char seq[3] = { (char) b0, (char) b1, (char) b2 };
  int fails = 0;

  for (size_t i = 0; i < BUG19727_NELEMS (targets); i++)
    fails += run_conversion ("UTF-8", targets[i], seq, 3, err, line);

  return fails;
}

/* Test entry point: returns EXIT_SUCCESS if every conversion behaved as
   expected and EXIT_FAILURE otherwise.  */
static int
do_test (void)
{
  int fails = 0;

  /* This test runs iconv() with UTF character in range of an UTF16 surrogate.
     UTF-16 high surrogate is in range 0xD800..0xDBFF and
     UTF-16 low surrogate is in range 0xDC00..0xDFFF.
     Converting from or to UTF-xx has to report errors in those cases.
     In UTF-16, a surrogate pair with a high surrogate in front of a low
     surrogate is valid.  */

  /* Use UCS4/UTF32 input of 0xD7FF.  */
  fails += run_ucs4_utf32_input (0x0, 0x0, 0xD7, 0xFF, 0, __LINE__);

  /* Use UCS4/UTF32 input of 0xD800.  */
  fails += run_ucs4_utf32_input (0x0, 0x0, 0xD8, 0x00, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDBFF.  */
  fails += run_ucs4_utf32_input (0x0, 0x0, 0xDB, 0xFF, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDC00.  */
  fails += run_ucs4_utf32_input (0x0, 0x0, 0xDC, 0x00, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDFFF.  */
  fails += run_ucs4_utf32_input (0x0, 0x0, 0xDF, 0xFF, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xE000.  */
  fails += run_ucs4_utf32_input (0x0, 0x0, 0xE0, 0x00, 0, __LINE__);


  /* Use UTF16 input of 0xD7FF.  */
  fails += run_utf16_input (0xD7, 0xFF, 0xD7, 0xFF, 4, 0, __LINE__);

  /* Use [single] UTF16 high surrogate 0xD800 [with a valid character behind].
     And check an UTF16 surrogate pair [without valid low surrogate].  */
  fails += run_utf16_input (0xD8, 0x0, 0x0, 0x0, 2, EINVAL, __LINE__);
  fails += run_utf16_input (0xD8, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xD8, 0x0, 0xD8, 0x0, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xD8, 0x0, 0xE0, 0x0, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);

  /* Use [single] UTF16 high surrogate 0xDBFF [with a valid character behind].
     And check an UTF16 surrogate pair [without valid low surrogate].  */
  fails += run_utf16_input (0xDB, 0xFF, 0x0, 0x0, 2, EINVAL, __LINE__);
  fails += run_utf16_input (0xDB, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDB, 0xFF, 0xDB, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDB, 0xFF, 0xE0, 0x0, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);

  /* Use single UTF16 low surrogate 0xDC00 [with a valid character behind].
     And check an UTF16 surrogate pair [without valid high surrogate].  */
  fails += run_utf16_input (0xDC, 0x0, 0x0, 0x0, 2, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDC, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
  fails += run_utf16_input (0xD7, 0xFF, 0xDC, 0x0, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDC, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xE0, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);

  /* Use single UTF16 low surrogate 0xDFFF [with a valid character behind].
     And check an UTF16 surrogate pair [without valid high surrogate].  */
  fails += run_utf16_input (0xDF, 0xFF, 0x0, 0x0, 2, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDF, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
  fails += run_utf16_input (0xD7, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xDF, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
  fails += run_utf16_input (0xE0, 0x0, 0xDF, 0xFF, 4, EILSEQ, __LINE__);

  /* Use UTF16 input of 0xE000.  */
  fails += run_utf16_input (0xE0, 0x0, 0xE0, 0x0, 4, 0, __LINE__);


  /* Use UTF-8 input of 0xD7FF.  */
  fails += run_utf8_3byte_input (0xED, 0x9F, 0xBF, 0, __LINE__);

  /* Use UTF-8 input of 0xD800.  */
  fails += run_utf8_3byte_input (0xED, 0xA0, 0x80, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDBFF.  */
  fails += run_utf8_3byte_input (0xED, 0xAF, 0xBF, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDC00.  */
  fails += run_utf8_3byte_input (0xED, 0xB0, 0x80, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDFFF.  */
  fails += run_utf8_3byte_input (0xED, 0xBF, 0xBF, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xF000.  */
  fails += run_utf8_3byte_input (0xEF, 0x80, 0x80, 0, __LINE__);

  return fails > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}
261 | |
262 | #define TEST_FUNCTION do_test () |
263 | #include "../test-skeleton.c" |
264 | |