1 | /* gunicollate.c - Collation |
2 | * |
3 | * Copyright 2001,2005 Red Hat, Inc. |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2.1 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Lesser General Public License |
16 | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
17 | */ |
18 | |
19 | #include "config.h" |
20 | |
21 | #include <locale.h> |
22 | #include <string.h> |
23 | #ifdef HAVE_WCHAR_H |
24 | #include <wchar.h> |
25 | #endif |
26 | |
27 | #ifdef HAVE_CARBON |
28 | #include <CoreServices/CoreServices.h> |
29 | #endif |
30 | |
31 | #include "gmem.h" |
32 | #include "gunicode.h" |
33 | #include "gunicodeprivate.h" |
34 | #include "gstring.h" |
35 | #include "gstrfuncs.h" |
36 | #include "gtestutils.h" |
37 | #include "gcharset.h" |
38 | #include "gconvert.h" |
39 | |
40 | #if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__) |
41 | #define GUNICHAR_EQUALS_WCHAR_T 1 |
42 | #endif |
43 | |
44 | #ifdef _MSC_VER |
45 | /* Workaround for bug in MSVCR80.DLL */ |
46 | static gsize |
47 | msc_strxfrm_wrapper (char *string1, |
48 | const char *string2, |
49 | gsize count) |
50 | { |
51 | if (!string1 || count <= 0) |
52 | { |
53 | char tmp; |
54 | |
55 | return strxfrm (&tmp, string2, 1); |
56 | } |
57 | return strxfrm (string1, string2, count); |
58 | } |
59 | #define strxfrm msc_strxfrm_wrapper |
60 | #endif |
61 | |
62 | /** |
63 | * g_utf8_collate: |
64 | * @str1: a UTF-8 encoded string |
65 | * @str2: a UTF-8 encoded string |
66 | * |
67 | * Compares two strings for ordering using the linguistically |
68 | * correct rules for the [current locale][setlocale]. |
69 | * When sorting a large number of strings, it will be significantly |
70 | * faster to obtain collation keys with g_utf8_collate_key() and |
71 | * compare the keys with strcmp() when sorting instead of sorting |
72 | * the original strings. |
73 | * |
74 | * Returns: < 0 if @str1 compares before @str2, |
75 | * 0 if they compare equal, > 0 if @str1 compares after @str2. |
76 | **/ |
77 | gint |
78 | g_utf8_collate (const gchar *str1, |
79 | const gchar *str2) |
80 | { |
81 | gint result; |
82 | |
83 | #ifdef HAVE_CARBON |
84 | |
85 | UniChar *str1_utf16; |
86 | UniChar *str2_utf16; |
87 | glong len1; |
88 | glong len2; |
89 | SInt32 retval = 0; |
90 | |
91 | g_return_val_if_fail (str1 != NULL, 0); |
92 | g_return_val_if_fail (str2 != NULL, 0); |
93 | |
94 | str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL); |
95 | str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL); |
96 | |
97 | UCCompareTextDefault (kUCCollateStandardOptions, |
98 | str1_utf16, len1, str2_utf16, len2, |
99 | NULL, &retval); |
100 | result = retval; |
101 | |
102 | g_free (str2_utf16); |
103 | g_free (str1_utf16); |
104 | |
105 | #elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T) |
106 | |
107 | gunichar *str1_norm; |
108 | gunichar *str2_norm; |
109 | |
110 | g_return_val_if_fail (str1 != NULL, 0); |
111 | g_return_val_if_fail (str2 != NULL, 0); |
112 | |
113 | str1_norm = _g_utf8_normalize_wc (str: str1, max_len: -1, mode: G_NORMALIZE_ALL_COMPOSE); |
114 | str2_norm = _g_utf8_normalize_wc (str: str2, max_len: -1, mode: G_NORMALIZE_ALL_COMPOSE); |
115 | |
116 | result = wcscoll (s1: (wchar_t *)str1_norm, s2: (wchar_t *)str2_norm); |
117 | |
118 | g_free (mem: str1_norm); |
119 | g_free (mem: str2_norm); |
120 | |
121 | #else |
122 | |
123 | const gchar *charset; |
124 | gchar *str1_norm; |
125 | gchar *str2_norm; |
126 | |
127 | g_return_val_if_fail (str1 != NULL, 0); |
128 | g_return_val_if_fail (str2 != NULL, 0); |
129 | |
130 | str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE); |
131 | str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE); |
132 | |
133 | if (g_get_charset (&charset)) |
134 | { |
135 | result = strcoll (str1_norm, str2_norm); |
136 | } |
137 | else |
138 | { |
139 | gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8" , NULL, NULL, NULL); |
140 | gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8" , NULL, NULL, NULL); |
141 | |
142 | if (str1_locale && str2_locale) |
143 | result = strcoll (str1_locale, str2_locale); |
144 | else if (str1_locale) |
145 | result = -1; |
146 | else if (str2_locale) |
147 | result = 1; |
148 | else |
149 | result = strcmp (str1_norm, str2_norm); |
150 | |
151 | g_free (str1_locale); |
152 | g_free (str2_locale); |
153 | } |
154 | |
155 | g_free (str1_norm); |
156 | g_free (str2_norm); |
157 | |
158 | #endif |
159 | |
160 | return result; |
161 | } |
162 | |
163 | #if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T) |
164 | /* We need UTF-8 encoding of numbers to encode the weights if |
165 | * we are using wcsxfrm. However, we aren't encoding Unicode |
166 | * characters, so we can't simply use g_unichar_to_utf8. |
167 | * |
168 | * The following routine is taken (with modification) from GNU |
169 | * libc's strxfrm routine: |
170 | * |
171 | * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc. |
172 | * Written by Ulrich Drepper <drepper@cygnus.com>, 1995. |
173 | */ |
174 | static inline int |
175 | utf8_encode (char *buf, wchar_t val) |
176 | { |
177 | int retval; |
178 | |
179 | if (val < 0x80) |
180 | { |
181 | if (buf) |
182 | *buf++ = (char) val; |
183 | retval = 1; |
184 | } |
185 | else |
186 | { |
187 | int step; |
188 | |
189 | for (step = 2; step < 6; ++step) |
190 | if ((val & (~(guint32)0 << (5 * step + 1))) == 0) |
191 | break; |
192 | retval = step; |
193 | |
194 | if (buf) |
195 | { |
196 | *buf = (unsigned char) (~0xff >> step); |
197 | --step; |
198 | do |
199 | { |
200 | buf[step] = 0x80 | (val & 0x3f); |
201 | val >>= 6; |
202 | } |
203 | while (--step > 0); |
204 | *buf |= val; |
205 | } |
206 | } |
207 | |
208 | return retval; |
209 | } |
210 | #endif |
211 | |
212 | #ifdef HAVE_CARBON |
213 | |
214 | static gchar * |
215 | collate_key_to_string (UCCollationValue *key, |
216 | gsize key_len) |
217 | { |
218 | gchar *result; |
219 | gsize result_len; |
220 | long *lkey = (long *) key; |
221 | |
222 | /* UCCollationValue format: |
223 | * |
224 | * UCCollateOptions (32/64 bits) |
225 | * SizeInBytes (32/64 bits) |
226 | * Value (8 bits arrey) |
227 | * |
228 | * UCCollateOptions: ordering option mask of the collator |
229 | * used to create the key. Size changes on 32-bit / 64-bit |
230 | * hosts. On 64-bits also the extra half-word seems to have |
231 | * some extra (unknown) meaning. |
232 | * SizeInBytes: size of the whole structure, in bytes |
233 | * (including UCCollateOptions and SizeInBytes fields). Size |
234 | * changes on 32-bit & 64-bit hosts. |
235 | * Value: array of bytes containing the comparison weights. |
236 | * Seems to have several sub-strings separated by \001 and \002 |
237 | * chars. Also, experience shows this is directly strcmp-able. |
238 | */ |
239 | |
240 | result_len = lkey[1]; |
241 | result = g_malloc (result_len + 1); |
242 | memcpy (result, &lkey[2], result_len); |
243 | result[result_len] = '\0'; |
244 | |
245 | return result; |
246 | } |
247 | |
248 | static gchar * |
249 | carbon_collate_key_with_collator (const gchar *str, |
250 | gssize len, |
251 | CollatorRef collator) |
252 | { |
253 | UniChar *str_utf16 = NULL; |
254 | glong len_utf16; |
255 | OSStatus ret; |
256 | UCCollationValue staticbuf[512]; |
257 | UCCollationValue *freeme = NULL; |
258 | UCCollationValue *buf; |
259 | ItemCount buf_len; |
260 | ItemCount key_len; |
261 | ItemCount try_len; |
262 | gchar *result = NULL; |
263 | |
264 | str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL); |
265 | try_len = len_utf16 * 5 + 2; |
266 | |
267 | if (try_len <= sizeof staticbuf) |
268 | { |
269 | buf = staticbuf; |
270 | buf_len = sizeof staticbuf; |
271 | } |
272 | else |
273 | { |
274 | freeme = g_new (UCCollationValue, try_len); |
275 | buf = freeme; |
276 | buf_len = try_len; |
277 | } |
278 | |
279 | ret = UCGetCollationKey (collator, str_utf16, len_utf16, |
280 | buf_len, &key_len, buf); |
281 | |
282 | if (ret == kCollateBufferTooSmall) |
283 | { |
284 | freeme = g_renew (UCCollationValue, freeme, try_len * 2); |
285 | buf = freeme; |
286 | buf_len = try_len * 2; |
287 | ret = UCGetCollationKey (collator, str_utf16, len_utf16, |
288 | buf_len, &key_len, buf); |
289 | } |
290 | |
291 | if (ret == 0) |
292 | result = collate_key_to_string (buf, key_len); |
293 | else |
294 | result = g_strdup ("" ); |
295 | |
296 | g_free (freeme); |
297 | g_free (str_utf16); |
298 | return result; |
299 | } |
300 | |
301 | static gchar * |
302 | carbon_collate_key (const gchar *str, |
303 | gssize len) |
304 | { |
305 | static CollatorRef collator; |
306 | |
307 | if (G_UNLIKELY (!collator)) |
308 | { |
309 | UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator); |
310 | |
311 | if (!collator) |
312 | { |
313 | static gboolean been_here; |
314 | if (!been_here) |
315 | g_warning ("%s: UCCreateCollator failed" , G_STRLOC); |
316 | been_here = TRUE; |
317 | return g_strdup ("" ); |
318 | } |
319 | } |
320 | |
321 | return carbon_collate_key_with_collator (str, len, collator); |
322 | } |
323 | |
324 | static gchar * |
325 | carbon_collate_key_for_filename (const gchar *str, |
326 | gssize len) |
327 | { |
328 | static CollatorRef collator; |
329 | |
330 | if (G_UNLIKELY (!collator)) |
331 | { |
332 | /* http://developer.apple.com/qa/qa2004/qa1159.html */ |
333 | UCCreateCollator (NULL, 0, |
334 | kUCCollateComposeInsensitiveMask |
335 | | kUCCollateWidthInsensitiveMask |
336 | | kUCCollateCaseInsensitiveMask |
337 | | kUCCollateDigitsOverrideMask |
338 | | kUCCollateDigitsAsNumberMask |
339 | | kUCCollatePunctuationSignificantMask, |
340 | &collator); |
341 | |
342 | if (!collator) |
343 | { |
344 | static gboolean been_here; |
345 | if (!been_here) |
346 | g_warning ("%s: UCCreateCollator failed" , G_STRLOC); |
347 | been_here = TRUE; |
348 | return g_strdup ("" ); |
349 | } |
350 | } |
351 | |
352 | return carbon_collate_key_with_collator (str, len, collator); |
353 | } |
354 | |
355 | #endif /* HAVE_CARBON */ |
356 | |
357 | /** |
358 | * g_utf8_collate_key: |
359 | * @str: a UTF-8 encoded string. |
360 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
361 | * |
362 | * Converts a string into a collation key that can be compared |
363 | * with other collation keys produced by the same function using |
364 | * strcmp(). |
365 | * |
366 | * The results of comparing the collation keys of two strings |
367 | * with strcmp() will always be the same as comparing the two |
368 | * original keys with g_utf8_collate(). |
369 | * |
370 | * Note that this function depends on the [current locale][setlocale]. |
371 | * |
372 | * Returns: a newly allocated string. This string should |
373 | * be freed with g_free() when you are done with it. |
374 | **/ |
375 | gchar * |
376 | g_utf8_collate_key (const gchar *str, |
377 | gssize len) |
378 | { |
379 | gchar *result; |
380 | |
381 | #ifdef HAVE_CARBON |
382 | |
383 | g_return_val_if_fail (str != NULL, NULL); |
384 | result = carbon_collate_key (str, len); |
385 | |
386 | #elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T) |
387 | |
388 | gsize xfrm_len; |
389 | gunichar *str_norm; |
390 | wchar_t *result_wc; |
391 | gsize i; |
392 | gsize result_len = 0; |
393 | |
394 | g_return_val_if_fail (str != NULL, NULL); |
395 | |
396 | str_norm = _g_utf8_normalize_wc (str, max_len: len, mode: G_NORMALIZE_ALL_COMPOSE); |
397 | |
398 | xfrm_len = wcsxfrm (NULL, s2: (wchar_t *)str_norm, n: 0); |
399 | result_wc = g_new (wchar_t, xfrm_len + 1); |
400 | wcsxfrm (s1: result_wc, s2: (wchar_t *)str_norm, n: xfrm_len + 1); |
401 | |
402 | for (i = 0; i < xfrm_len; i++) |
403 | result_len += utf8_encode (NULL, val: result_wc[i]); |
404 | |
405 | result = g_malloc (n_bytes: result_len + 1); |
406 | result_len = 0; |
407 | for (i = 0; i < xfrm_len; i++) |
408 | result_len += utf8_encode (buf: result + result_len, val: result_wc[i]); |
409 | |
410 | result[result_len] = '\0'; |
411 | |
412 | g_free (mem: result_wc); |
413 | g_free (mem: str_norm); |
414 | |
415 | return result; |
416 | #else |
417 | |
418 | gsize xfrm_len; |
419 | const gchar *charset; |
420 | gchar *str_norm; |
421 | |
422 | g_return_val_if_fail (str != NULL, NULL); |
423 | |
424 | str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE); |
425 | |
426 | result = NULL; |
427 | |
428 | if (g_get_charset (&charset)) |
429 | { |
430 | xfrm_len = strxfrm (NULL, str_norm, 0); |
431 | if (xfrm_len < G_MAXINT - 2) |
432 | { |
433 | result = g_malloc (xfrm_len + 1); |
434 | strxfrm (result, str_norm, xfrm_len + 1); |
435 | } |
436 | } |
437 | else |
438 | { |
439 | gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8" , NULL, NULL, NULL); |
440 | |
441 | if (str_locale) |
442 | { |
443 | xfrm_len = strxfrm (NULL, str_locale, 0); |
444 | if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2) |
445 | { |
446 | g_free (str_locale); |
447 | str_locale = NULL; |
448 | } |
449 | } |
450 | if (str_locale) |
451 | { |
452 | result = g_malloc (xfrm_len + 2); |
453 | result[0] = 'A'; |
454 | strxfrm (result + 1, str_locale, xfrm_len + 1); |
455 | |
456 | g_free (str_locale); |
457 | } |
458 | } |
459 | |
460 | if (!result) |
461 | { |
462 | xfrm_len = strlen (str_norm); |
463 | result = g_malloc (xfrm_len + 2); |
464 | result[0] = 'B'; |
465 | memcpy (result + 1, str_norm, xfrm_len); |
466 | result[xfrm_len+1] = '\0'; |
467 | } |
468 | |
469 | g_free (str_norm); |
470 | #endif |
471 | |
472 | return result; |
473 | } |
474 | |
475 | /* This is a collation key that is very very likely to sort before any |
476 | * collation key that libc strxfrm generates. We use this before any |
477 | * special case (dot or number) to make sure that its sorted before |
478 | * anything else. |
479 | */ |
480 | #define COLLATION_SENTINEL "\1\1\1" |
481 | |
482 | /** |
483 | * g_utf8_collate_key_for_filename: |
484 | * @str: a UTF-8 encoded string. |
485 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
486 | * |
487 | * Converts a string into a collation key that can be compared |
488 | * with other collation keys produced by the same function using strcmp(). |
489 | * |
490 | * In order to sort filenames correctly, this function treats the dot '.' |
491 | * as a special case. Most dictionary orderings seem to consider it |
492 | * insignificant, thus producing the ordering "event.c" "eventgenerator.c" |
493 | * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we |
494 | * would like to treat numbers intelligently so that "file1" "file10" "file5" |
495 | * is sorted as "file1" "file5" "file10". |
496 | * |
497 | * Note that this function depends on the [current locale][setlocale]. |
498 | * |
499 | * Returns: a newly allocated string. This string should |
500 | * be freed with g_free() when you are done with it. |
501 | * |
502 | * Since: 2.8 |
503 | */ |
504 | gchar * |
505 | g_utf8_collate_key_for_filename (const gchar *str, |
506 | gssize len) |
507 | { |
508 | #ifndef HAVE_CARBON |
509 | GString *result; |
510 | GString *append; |
511 | const gchar *p; |
512 | const gchar *prev; |
513 | const gchar *end; |
514 | gchar *collate_key; |
515 | gint digits; |
516 | gint leading_zeros; |
517 | |
518 | /* |
519 | * How it works: |
520 | * |
521 | * Split the filename into collatable substrings which do |
522 | * not contain [.0-9] and special-cased substrings. The collatable |
523 | * substrings are run through the normal g_utf8_collate_key() and the |
524 | * resulting keys are concatenated with keys generated from the |
525 | * special-cased substrings. |
526 | * |
527 | * Special cases: Dots are handled by replacing them with '\1' which |
528 | * implies that short dot-delimited substrings are before long ones, |
529 | * e.g. |
530 | * |
531 | * a\1a (a.a) |
532 | * a-\1a (a-.a) |
533 | * aa\1a (aa.a) |
534 | * |
535 | * Numbers are handled by prepending to each number d-1 superdigits |
536 | * where d = number of digits in the number and SUPERDIGIT is a |
537 | * character with an integer value higher than any digit (for instance |
538 | * ':'). This ensures that single-digit numbers are sorted before |
539 | * double-digit numbers which in turn are sorted separately from |
540 | * triple-digit numbers, etc. To avoid strange side-effects when |
541 | * sorting strings that already contain SUPERDIGITs, a '\2' |
542 | * is also prepended, like this |
543 | * |
544 | * file\21 (file1) |
545 | * file\25 (file5) |
546 | * file\2:10 (file10) |
547 | * file\2:26 (file26) |
548 | * file\2::100 (file100) |
549 | * file:foo (file:foo) |
550 | * |
551 | * This has the side-effect of sorting numbers before everything else (except |
552 | * dots), but this is probably OK. |
553 | * |
554 | * Leading digits are ignored when doing the above. To discriminate |
555 | * numbers which differ only in the number of leading digits, we append |
556 | * the number of leading digits as a byte at the very end of the collation |
557 | * key. |
558 | * |
559 | * To try avoid conflict with any collation key sequence generated by libc we |
560 | * start each switch to a special cased part with a sentinel that hopefully |
561 | * will sort before anything libc will generate. |
562 | */ |
563 | |
564 | if (len < 0) |
565 | len = strlen (s: str); |
566 | |
567 | result = g_string_sized_new (dfl_size: len * 2); |
568 | append = g_string_sized_new (dfl_size: 0); |
569 | |
570 | end = str + len; |
571 | |
572 | /* No need to use utf8 functions, since we're only looking for ascii chars */ |
573 | for (prev = p = str; p < end; p++) |
574 | { |
575 | switch (*p) |
576 | { |
577 | case '.': |
578 | if (prev != p) |
579 | { |
580 | collate_key = g_utf8_collate_key (str: prev, len: p - prev); |
581 | g_string_append (string: result, val: collate_key); |
582 | g_free (mem: collate_key); |
583 | } |
584 | |
585 | g_string_append (string: result, COLLATION_SENTINEL "\1" ); |
586 | |
587 | /* skip the dot */ |
588 | prev = p + 1; |
589 | break; |
590 | |
591 | case '0': |
592 | case '1': |
593 | case '2': |
594 | case '3': |
595 | case '4': |
596 | case '5': |
597 | case '6': |
598 | case '7': |
599 | case '8': |
600 | case '9': |
601 | if (prev != p) |
602 | { |
603 | collate_key = g_utf8_collate_key (str: prev, len: p - prev); |
604 | g_string_append (string: result, val: collate_key); |
605 | g_free (mem: collate_key); |
606 | } |
607 | |
608 | g_string_append (string: result, COLLATION_SENTINEL "\2" ); |
609 | |
610 | prev = p; |
611 | |
612 | /* write d-1 colons */ |
613 | if (*p == '0') |
614 | { |
615 | leading_zeros = 1; |
616 | digits = 0; |
617 | } |
618 | else |
619 | { |
620 | leading_zeros = 0; |
621 | digits = 1; |
622 | } |
623 | |
624 | while (++p < end) |
625 | { |
626 | if (*p == '0' && !digits) |
627 | ++leading_zeros; |
628 | else if (g_ascii_isdigit(*p)) |
629 | ++digits; |
630 | else |
631 | { |
632 | /* count an all-zero sequence as |
633 | * one digit plus leading zeros |
634 | */ |
635 | if (!digits) |
636 | { |
637 | ++digits; |
638 | --leading_zeros; |
639 | } |
640 | break; |
641 | } |
642 | } |
643 | |
644 | while (digits > 1) |
645 | { |
646 | g_string_append_c (result, ':'); |
647 | --digits; |
648 | } |
649 | |
650 | if (leading_zeros > 0) |
651 | { |
652 | g_string_append_c (append, (char)leading_zeros); |
653 | prev += leading_zeros; |
654 | } |
655 | |
656 | /* write the number itself */ |
657 | g_string_append_len (string: result, val: prev, len: p - prev); |
658 | |
659 | prev = p; |
660 | --p; /* go one step back to avoid disturbing outer loop */ |
661 | break; |
662 | |
663 | default: |
664 | /* other characters just accumulate */ |
665 | break; |
666 | } |
667 | } |
668 | |
669 | if (prev != p) |
670 | { |
671 | collate_key = g_utf8_collate_key (str: prev, len: p - prev); |
672 | g_string_append (string: result, val: collate_key); |
673 | g_free (mem: collate_key); |
674 | } |
675 | |
676 | g_string_append (string: result, val: append->str); |
677 | g_string_free (string: append, TRUE); |
678 | |
679 | return g_string_free (string: result, FALSE); |
680 | #else /* HAVE_CARBON */ |
681 | return carbon_collate_key_for_filename (str, len); |
682 | #endif |
683 | } |
684 | |