1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * unicode.c |
4 | * |
5 | * PURPOSE |
6 | * Routines for converting between UTF-8 and OSTA Compressed Unicode. |
7 | * Also handles filename mangling |
8 | * |
9 | * DESCRIPTION |
10 | * OSTA Compressed Unicode is explained in the OSTA UDF specification. |
11 | * http://www.osta.org/ |
12 | * UTF-8 is explained in IETF RFC 3629. |
13 | * https://www.rfc-editor.org/rfc/rfc3629 |
14 | * |
15 | */ |
16 | |
17 | #include "udfdecl.h" |
18 | |
19 | #include <linux/kernel.h> |
20 | #include <linux/string.h> /* for memset */ |
21 | #include <linux/nls.h> |
22 | #include <linux/crc-itu-t.h> |
23 | #include <linux/slab.h> |
24 | |
25 | #include "udf_sb.h" |
26 | |
/* Offset added when a surrogate pair is combined into a single code point */
#define PLANE_SIZE		0x10000
/* Largest value accepted as a valid Unicode code point */
#define UNICODE_MAX		0x10ffff

/*
 * UTF-16 surrogate handling. A 16-bit unit u is a surrogate when
 * (u & SURROGATE_MASK) == SURROGATE_PAIR. SURROGATE_LOW is set in the low
 * (trailing) half of a pair, and each half carries SURROGATE_CHAR_BITS bits
 * of payload, extracted with SURROGATE_CHAR_MASK.
 */
#define SURROGATE_MASK		0xfffff800
#define SURROGATE_PAIR		0x0000d800
#define SURROGATE_LOW		0x00000400
#define SURROGATE_CHAR_BITS	10
#define SURROGATE_CHAR_MASK	((1 << SURROGATE_CHAR_BITS) - 1)

/* Characters used when translating names */
#define ILLEGAL_CHAR_MARK	'_'	/* replaces characters that cannot be converted */
#define EXT_MARK		'.'	/* separates the name from its extension */
#define CRC_MARK		'#'	/* introduces the generated CRC */

/* Number of trailing input chars searched for EXT_MARK */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5
41 | |
42 | static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len, |
43 | int str_i_idx, int u_ch, unicode_t *ret) |
44 | { |
45 | unicode_t c; |
46 | int start_idx = str_i_idx; |
47 | |
48 | /* Expand OSTA compressed Unicode to Unicode */ |
49 | c = str_i[str_i_idx++]; |
50 | if (u_ch > 1) |
51 | c = (c << 8) | str_i[str_i_idx++]; |
52 | if ((c & SURROGATE_MASK) == SURROGATE_PAIR) { |
53 | unicode_t next; |
54 | |
55 | /* Trailing surrogate char */ |
56 | if (str_i_idx >= str_i_max_len) { |
57 | c = UNICODE_MAX + 1; |
58 | goto out; |
59 | } |
60 | |
61 | /* Low surrogate must follow the high one... */ |
62 | if (c & SURROGATE_LOW) { |
63 | c = UNICODE_MAX + 1; |
64 | goto out; |
65 | } |
66 | |
67 | WARN_ON_ONCE(u_ch != 2); |
68 | next = str_i[str_i_idx++] << 8; |
69 | next |= str_i[str_i_idx++]; |
70 | if ((next & SURROGATE_MASK) != SURROGATE_PAIR || |
71 | !(next & SURROGATE_LOW)) { |
72 | c = UNICODE_MAX + 1; |
73 | goto out; |
74 | } |
75 | |
76 | c = PLANE_SIZE + |
77 | ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) + |
78 | (next & SURROGATE_CHAR_MASK); |
79 | } |
80 | out: |
81 | *ret = c; |
82 | return str_i_idx - start_idx; |
83 | } |
84 | |
85 | |
86 | static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, |
87 | int *str_o_idx, |
88 | const uint8_t *str_i, int str_i_max_len, |
89 | int *str_i_idx, |
90 | int u_ch, int *needsCRC, |
91 | int (*conv_f)(wchar_t, unsigned char *, int), |
92 | int translate) |
93 | { |
94 | unicode_t c; |
95 | int illChar = 0; |
96 | int len, gotch = 0; |
97 | |
98 | while (!gotch && *str_i_idx < str_i_max_len) { |
99 | if (*str_o_idx >= str_o_max_len) { |
100 | *needsCRC = 1; |
101 | return gotch; |
102 | } |
103 | |
104 | len = get_utf16_char(str_i, str_i_max_len, str_i_idx: *str_i_idx, u_ch, |
105 | ret: &c); |
106 | /* These chars cannot be converted. Replace them. */ |
107 | if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) || |
108 | (translate && c == '/')) { |
109 | illChar = 1; |
110 | if (!translate) |
111 | gotch = 1; |
112 | } else if (illChar) |
113 | break; |
114 | else |
115 | gotch = 1; |
116 | *str_i_idx += len; |
117 | } |
118 | if (illChar) { |
119 | *needsCRC = 1; |
120 | c = ILLEGAL_CHAR_MARK; |
121 | gotch = 1; |
122 | } |
123 | if (gotch) { |
124 | if (conv_f) { |
125 | len = conv_f(c, &str_o[*str_o_idx], |
126 | str_o_max_len - *str_o_idx); |
127 | } else { |
128 | len = utf32_to_utf8(u: c, s: &str_o[*str_o_idx], |
129 | maxlen: str_o_max_len - *str_o_idx); |
130 | if (len < 0) |
131 | len = -ENAMETOOLONG; |
132 | } |
133 | /* Valid character? */ |
134 | if (len >= 0) |
135 | *str_o_idx += len; |
136 | else if (len == -ENAMETOOLONG) { |
137 | *needsCRC = 1; |
138 | gotch = 0; |
139 | } else { |
140 | str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK; |
141 | *needsCRC = 1; |
142 | } |
143 | } |
144 | return gotch; |
145 | } |
146 | |
147 | static int udf_name_from_CS0(struct super_block *sb, |
148 | uint8_t *str_o, int str_max_len, |
149 | const uint8_t *ocu, int ocu_len, |
150 | int translate) |
151 | { |
152 | uint32_t c; |
153 | uint8_t cmp_id; |
154 | int idx, len; |
155 | int u_ch; |
156 | int needsCRC = 0; |
157 | int ext_i_len, ext_max_len; |
158 | int str_o_len = 0; /* Length of resulting output */ |
159 | int ext_o_len = 0; /* Extension output length */ |
160 | int ext_crc_len = 0; /* Extension output length if used with CRC */ |
161 | int i_ext = -1; /* Extension position in input buffer */ |
162 | int o_crc = 0; /* Rightmost possible output pos for CRC+ext */ |
163 | unsigned short valueCRC; |
164 | uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; |
165 | uint8_t crc[CRC_LEN]; |
166 | int (*conv_f)(wchar_t, unsigned char *, int); |
167 | |
168 | if (str_max_len <= 0) |
169 | return 0; |
170 | |
171 | if (ocu_len == 0) { |
172 | memset(str_o, 0, str_max_len); |
173 | return 0; |
174 | } |
175 | |
176 | if (UDF_SB(sb)->s_nls_map) |
177 | conv_f = UDF_SB(sb)->s_nls_map->uni2char; |
178 | else |
179 | conv_f = NULL; |
180 | |
181 | cmp_id = ocu[0]; |
182 | if (cmp_id != 8 && cmp_id != 16) { |
183 | memset(str_o, 0, str_max_len); |
184 | pr_err("unknown compression code (%u)\n" , cmp_id); |
185 | return -EINVAL; |
186 | } |
187 | u_ch = cmp_id >> 3; |
188 | |
189 | ocu++; |
190 | ocu_len--; |
191 | |
192 | if (ocu_len % u_ch) { |
193 | pr_err("incorrect filename length (%d)\n" , ocu_len + 1); |
194 | return -EINVAL; |
195 | } |
196 | |
197 | if (translate) { |
198 | /* Look for extension */ |
199 | for (idx = ocu_len - u_ch, ext_i_len = 0; |
200 | (idx >= 0) && (ext_i_len < EXT_SIZE); |
201 | idx -= u_ch, ext_i_len++) { |
202 | c = ocu[idx]; |
203 | if (u_ch > 1) |
204 | c = (c << 8) | ocu[idx + 1]; |
205 | |
206 | if (c == EXT_MARK) { |
207 | if (ext_i_len) |
208 | i_ext = idx; |
209 | break; |
210 | } |
211 | } |
212 | if (i_ext >= 0) { |
213 | /* Convert extension */ |
214 | ext_max_len = min_t(int, sizeof(ext), str_max_len); |
215 | ext[ext_o_len++] = EXT_MARK; |
216 | idx = i_ext + u_ch; |
217 | while (udf_name_conv_char(str_o: ext, str_o_max_len: ext_max_len, str_o_idx: &ext_o_len, |
218 | str_i: ocu, str_i_max_len: ocu_len, str_i_idx: &idx, |
219 | u_ch, needsCRC: &needsCRC, |
220 | conv_f, translate)) { |
221 | if ((ext_o_len + CRC_LEN) < str_max_len) |
222 | ext_crc_len = ext_o_len; |
223 | } |
224 | } |
225 | } |
226 | |
227 | idx = 0; |
228 | while (1) { |
229 | if (translate && (idx == i_ext)) { |
230 | if (str_o_len > (str_max_len - ext_o_len)) |
231 | needsCRC = 1; |
232 | break; |
233 | } |
234 | |
235 | if (!udf_name_conv_char(str_o, str_o_max_len: str_max_len, str_o_idx: &str_o_len, |
236 | str_i: ocu, str_i_max_len: ocu_len, str_i_idx: &idx, |
237 | u_ch, needsCRC: &needsCRC, conv_f, translate)) |
238 | break; |
239 | |
240 | if (translate && |
241 | (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) |
242 | o_crc = str_o_len; |
243 | } |
244 | |
245 | if (translate) { |
246 | if (str_o_len > 0 && str_o_len <= 2 && str_o[0] == '.' && |
247 | (str_o_len == 1 || str_o[1] == '.')) |
248 | needsCRC = 1; |
249 | if (needsCRC) { |
250 | str_o_len = o_crc; |
251 | valueCRC = crc_itu_t(crc: 0, buffer: ocu, len: ocu_len); |
252 | crc[0] = CRC_MARK; |
253 | crc[1] = hex_asc_upper_hi(valueCRC >> 8); |
254 | crc[2] = hex_asc_upper_lo(valueCRC >> 8); |
255 | crc[3] = hex_asc_upper_hi(valueCRC); |
256 | crc[4] = hex_asc_upper_lo(valueCRC); |
257 | len = min_t(int, CRC_LEN, str_max_len - str_o_len); |
258 | memcpy(&str_o[str_o_len], crc, len); |
259 | str_o_len += len; |
260 | ext_o_len = ext_crc_len; |
261 | } |
262 | if (ext_o_len > 0) { |
263 | memcpy(&str_o[str_o_len], ext, ext_o_len); |
264 | str_o_len += ext_o_len; |
265 | } |
266 | } |
267 | |
268 | return str_o_len; |
269 | } |
270 | |
271 | static int udf_name_to_CS0(struct super_block *sb, |
272 | uint8_t *ocu, int ocu_max_len, |
273 | const uint8_t *str_i, int str_len) |
274 | { |
275 | int i, len; |
276 | unsigned int max_val; |
277 | int u_len, u_ch; |
278 | unicode_t uni_char; |
279 | int (*conv_f)(const unsigned char *, int, wchar_t *); |
280 | |
281 | if (ocu_max_len <= 0) |
282 | return 0; |
283 | |
284 | if (UDF_SB(sb)->s_nls_map) |
285 | conv_f = UDF_SB(sb)->s_nls_map->char2uni; |
286 | else |
287 | conv_f = NULL; |
288 | |
289 | memset(ocu, 0, ocu_max_len); |
290 | ocu[0] = 8; |
291 | max_val = 0xff; |
292 | u_ch = 1; |
293 | |
294 | try_again: |
295 | u_len = 1; |
296 | for (i = 0; i < str_len; i += len) { |
297 | /* Name didn't fit? */ |
298 | if (u_len + u_ch > ocu_max_len) |
299 | return 0; |
300 | if (conv_f) { |
301 | wchar_t wchar; |
302 | |
303 | len = conv_f(&str_i[i], str_len - i, &wchar); |
304 | if (len > 0) |
305 | uni_char = wchar; |
306 | } else { |
307 | len = utf8_to_utf32(s: &str_i[i], len: str_len - i, |
308 | pu: &uni_char); |
309 | } |
310 | /* Invalid character, deal with it */ |
311 | if (len <= 0 || uni_char > UNICODE_MAX) { |
312 | len = 1; |
313 | uni_char = '?'; |
314 | } |
315 | |
316 | if (uni_char > max_val) { |
317 | unicode_t c; |
318 | |
319 | if (max_val == 0xff) { |
320 | max_val = 0xffff; |
321 | ocu[0] = 0x10; |
322 | u_ch = 2; |
323 | goto try_again; |
324 | } |
325 | /* |
326 | * Use UTF-16 encoding for chars outside we |
327 | * cannot encode directly. |
328 | */ |
329 | if (u_len + 2 * u_ch > ocu_max_len) |
330 | return 0; |
331 | |
332 | uni_char -= PLANE_SIZE; |
333 | c = SURROGATE_PAIR | |
334 | ((uni_char >> SURROGATE_CHAR_BITS) & |
335 | SURROGATE_CHAR_MASK); |
336 | ocu[u_len++] = (uint8_t)(c >> 8); |
337 | ocu[u_len++] = (uint8_t)(c & 0xff); |
338 | uni_char = SURROGATE_PAIR | SURROGATE_LOW | |
339 | (uni_char & SURROGATE_CHAR_MASK); |
340 | } |
341 | |
342 | if (max_val == 0xffff) |
343 | ocu[u_len++] = (uint8_t)(uni_char >> 8); |
344 | ocu[u_len++] = (uint8_t)(uni_char & 0xff); |
345 | } |
346 | |
347 | return u_len; |
348 | } |
349 | |
/*
 * Convert CS0 dstring to output charset. Warning: This function may truncate
 * input string if it is too long as it is used for informational strings only
 * and it is better to truncate the string than to refuse mounting a media.
 *
 * @sb:    superblock, passed through to udf_name_from_CS0()
 * @utf_o: output buffer
 * @o_len: size of @utf_o in bytes
 * @ocu_i: dstring field; its last byte (ocu_i[i_len - 1]) holds the number
 *         of bytes in use, and ocu_i[0] is the compression ID
 * @i_len: size of the dstring field in bytes
 *
 * Returns whatever udf_name_from_CS0() returns with translation disabled.
 */
int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
		      const uint8_t *ocu_i, int i_len)
{
	int s_len = 0;

	if (i_len > 0) {
		s_len = ocu_i[i_len - 1];
		if (s_len >= i_len) {
			pr_warn("incorrect dstring lengths (%d/%d),"
				" truncating\n", s_len, i_len);
			s_len = i_len - 1;
			/*
			 * 2-byte encoding? Need to round properly: the data
			 * after the compression ID byte must be a whole
			 * number of 16-bit characters, so drop a trailing odd
			 * byte (and nothing else).
			 */
			if (ocu_i[0] == 16)
				s_len -= (s_len - 1) & 1;
		}
	}

	return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0);
}
374 | |
/*
 * Convert an on-disk CS0 filename to the output charset, with filename
 * translation enabled.
 *
 * @sb:    superblock, passed through to udf_name_from_CS0()
 * @sname: CS0 encoded name
 * @slen:  length of @sname in bytes
 * @dname: output buffer
 * @dlen:  size of @dname in bytes
 *
 * Returns the length of the converted name, 0 when @dlen <= 0, -EIO when
 * @slen is 0, -EINVAL when the conversion yields an empty name, or the
 * negative error returned by udf_name_from_CS0().
 */
int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
{
	int ret;

	if (!slen)
		return -EIO;

	if (dlen <= 0)
		return 0;

	ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1);
	/* Zero length filename isn't valid... */
	if (ret == 0)
		ret = -EINVAL;
	return ret;
}
392 | |
/*
 * Encode a filename as CS0 for storing on disk.
 *
 * @sb:    superblock, passed through to udf_name_to_CS0()
 * @sname: name in the input charset
 * @slen:  length of @sname in bytes
 * @dname: output buffer for the CS0 name
 * @dlen:  size of @dname in bytes
 *
 * Returns the value of udf_name_to_CS0(): the encoded length in bytes, or 0
 * when @dlen <= 0 or the name does not fit.
 */
int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
		     uint8_t *dname, int dlen)
{
	return udf_name_to_CS0(sb, dname, dlen, sname, slen);
}
398 | |
399 | |