1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4// Don't define it while compiling this module, or USERS of Qt will
5// not be able to link.
6#ifdef QT_NO_CAST_FROM_ASCII
7# undef QT_NO_CAST_FROM_ASCII
8#endif
9#ifdef QT_NO_CAST_TO_ASCII
10# undef QT_NO_CAST_TO_ASCII
11#endif
12#include "qchar.h"
13
14#include "qdatastream.h"
15
16#include "qunicodetables_p.h"
17#include "qunicodetables.cpp"
18
19#include <algorithm>
20
21QT_BEGIN_NAMESPACE
22
// Yields an int with only bit number `bit` set. The classification functions
// in this file combine these bits into masks of QChar::Category values.
#define FLAG(bit) (1 << (bit))
24
25/*!
26 \class QLatin1Char
27 \inmodule QtCore
28 \reentrant
29 \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
30
31 \ingroup string-processing
32
    This class is only useful to construct a QChar from an 8-bit character.
34
35 \sa QChar, QLatin1StringView, QString
36*/
37
38/*!
39 \fn const char QLatin1Char::toLatin1() const
40
41 Converts a Latin-1 character to an 8-bit ASCII representation of the character.
42*/
43
44/*!
45 \fn QLatin1Char::unicode() const
46
    Converts a Latin-1 character to a 16-bit-encoded Unicode representation
48 of the character.
49*/
50
51/*!
52 \fn QLatin1Char::QLatin1Char(char c)
53
54 Constructs a Latin-1 character for \a c. This constructor should be
55 used when the encoding of the input character is known to be Latin-1.
56*/
57
58/*!
59 \class QChar
60 \inmodule QtCore
61 \brief The QChar class provides a 16-bit Unicode character.
62
63 \ingroup string-processing
64 \reentrant
65
66 In Qt, Unicode characters are 16-bit entities without any markup
67 or structure. This class represents such an entity. It is
68 lightweight, so it can be used everywhere. Most compilers treat
69 it like an \c{unsigned short}.
70
71 QChar provides a full complement of testing/classification
72 functions, converting to and from other formats, converting from
73 composed to decomposed Unicode, and trying to compare and
74 case-convert if you ask it to.
75
76 The classification functions include functions like those in the
77 standard C++ header \<cctype\> (formerly \<ctype.h\>), but
78 operating on the full range of Unicode characters, not just for the ASCII
79 range. They all return true if the character is a certain type of character;
80 otherwise they return false. These classification functions are
81 isNull() (returns \c true if the character is '\\0'), isPrint()
82 (true if the character is any sort of printable character,
    including whitespace), isPunct() (any sort of punctuation),
84 isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
85 sort of numeric character, not just 0-9), isLetterOrNumber(), and
86 isDigit() (decimal digits). All of these are wrappers around
87 category() which return the Unicode-defined category of each
88 character. Some of these also calculate the derived properties
89 (for example isSpace() returns \c true if the character is of category
90 Separator_* or an exceptional code point from Other_Control category).
91
92 QChar also provides direction(), which indicates the "natural"
93 writing direction of this character. The joiningType() function
    indicates how the character joins with its neighbors (needed
95 mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
96 whether the character needs to be mirrored when it is printed in
    its "unnatural" writing direction.
98
99 Composed Unicode characters (like \a ring) can be converted to
100 decomposed Unicode ("a" followed by "ring above") by using decomposition().
101
102 In Unicode, comparison is not necessarily possible and case
103 conversion is very difficult at best. Unicode, covering the
104 "entire" world, also includes most of the world's case and
105 sorting problems. operator==() and friends will do comparison
106 based purely on the numeric Unicode value (code point) of the
107 characters, and toUpper() and toLower() will do case changes when
108 the character has a well-defined uppercase/lowercase equivalent.
109 For locale-dependent comparisons, use QString::localeAwareCompare().
110
111 The conversion functions include unicode() (to a scalar),
112 toLatin1() (to scalar, but converts all non-Latin-1 characters to
113 0), row() (gives the Unicode row), cell() (gives the Unicode
114 cell), digitValue() (gives the integer value of any of the
115 numerous digit characters), and a host of constructors.
116
117 QChar provides constructors and cast operators that make it easy
118 to convert to and from traditional 8-bit \c{char}s. If you
119 defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
120 explained in the QString documentation, you will need to
121 explicitly call fromLatin1(), or use QLatin1Char,
122 to construct a QChar from an 8-bit \c char, and you will need to
123 call toLatin1() to get the 8-bit value back.
124
125 Starting with Qt 6.0, most QChar constructors are \c explicit. This
126 is done to avoid dangerous mistakes when accidentally mixing
127 integral types and strings. You can opt-out (and make these
128 constructors implicit) by defining the macro \c
129 QT_IMPLICIT_QCHAR_CONSTRUCTION.
130
131 For more information see
132 \l{https://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
133
134 \sa Unicode, QString, QLatin1Char
135*/
136
137/*!
138 \enum QChar::UnicodeVersion
139
140 Specifies which version of the \l{Unicode standard} introduced a certain
141 character.
142
143 \value Unicode_1_1 Version 1.1
144 \value Unicode_2_0 Version 2.0
145 \value Unicode_2_1_2 Version 2.1.2
146 \value Unicode_3_0 Version 3.0
147 \value Unicode_3_1 Version 3.1
148 \value Unicode_3_2 Version 3.2
149 \value Unicode_4_0 Version 4.0
150 \value Unicode_4_1 Version 4.1
151 \value Unicode_5_0 Version 5.0
152 \value Unicode_5_1 Version 5.1
153 \value Unicode_5_2 Version 5.2
154 \value Unicode_6_0 Version 6.0
155 \value Unicode_6_1 Version 6.1
156 \value Unicode_6_2 Version 6.2
157 \value [since 5.3] Unicode_6_3 Version 6.3
158 \value [since 5.5] Unicode_7_0 Version 7.0
159 \value [since 5.6] Unicode_8_0 Version 8.0
160 \value [since 5.11] Unicode_9_0 Version 9.0
161 \value [since 5.11] Unicode_10_0 Version 10.0
162 \value [since 5.15] Unicode_11_0 Version 11.0
163 \value [since 5.15] Unicode_12_0 Version 12.0
164 \value [since 5.15] Unicode_12_1 Version 12.1
165 \value [since 5.15] Unicode_13_0 Version 13.0
166 \value [since 6.3] Unicode_14_0 Version 14.0
167 \value [since 6.5] Unicode_15_0 Version 15.0
168 \value Unicode_Unassigned The value is not assigned to any character
169 in version 8.0 of Unicode.
170
171 \sa unicodeVersion(), currentUnicodeVersion()
172*/
173
174/*!
175 \enum QChar::Category
176
177 This enum maps the Unicode character categories.
178
179 The following characters are normative in Unicode:
180
181 \value Mark_NonSpacing Unicode class name Mn
182
183 \value Mark_SpacingCombining Unicode class name Mc
184
185 \value Mark_Enclosing Unicode class name Me
186
187 \value Number_DecimalDigit Unicode class name Nd
188
189 \value Number_Letter Unicode class name Nl
190
191 \value Number_Other Unicode class name No
192
193 \value Separator_Space Unicode class name Zs
194
195 \value Separator_Line Unicode class name Zl
196
197 \value Separator_Paragraph Unicode class name Zp
198
199 \value Other_Control Unicode class name Cc
200
201 \value Other_Format Unicode class name Cf
202
203 \value Other_Surrogate Unicode class name Cs
204
205 \value Other_PrivateUse Unicode class name Co
206
207 \value Other_NotAssigned Unicode class name Cn
208
209
210 The following categories are informative in Unicode:
211
212 \value Letter_Uppercase Unicode class name Lu
213
214 \value Letter_Lowercase Unicode class name Ll
215
216 \value Letter_Titlecase Unicode class name Lt
217
218 \value Letter_Modifier Unicode class name Lm
219
220 \value Letter_Other Unicode class name Lo
221
222 \value Punctuation_Connector Unicode class name Pc
223
224 \value Punctuation_Dash Unicode class name Pd
225
226 \value Punctuation_Open Unicode class name Ps
227
228 \value Punctuation_Close Unicode class name Pe
229
230 \value Punctuation_InitialQuote Unicode class name Pi
231
232 \value Punctuation_FinalQuote Unicode class name Pf
233
234 \value Punctuation_Other Unicode class name Po
235
236 \value Symbol_Math Unicode class name Sm
237
238 \value Symbol_Currency Unicode class name Sc
239
240 \value Symbol_Modifier Unicode class name Sk
241
242 \value Symbol_Other Unicode class name So
243
244 \sa category()
245*/
246
247/*!
248 \enum QChar::Script
249 \since 5.1
250
251 This enum type defines the Unicode script property values.
252
253 For details about the Unicode script property values see
254 \l{https://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
255
256 In order to conform to C/C++ naming conventions "Script_" is prepended
257 to the codes used in the Unicode Standard.
258
259 \value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
260 \value Script_Inherited For characters that may be used with multiple scripts
261 and that inherit their script from the preceding characters.
262 These include nonspacing marks, enclosing marks,
263 and zero width joiner/non-joiner characters.
264 \value Script_Common For characters that may be used with multiple scripts
265 and that do not inherit their script from the preceding characters.
266
267 \value [since 5.11] Script_Adlam
268 \value [since 5.6] Script_Ahom
269 \value [since 5.6] Script_AnatolianHieroglyphs
270 \value Script_Arabic
271 \value Script_Armenian
272 \value Script_Avestan
273 \value Script_Balinese
274 \value Script_Bamum
275 \value [since 5.5] Script_BassaVah
276 \value Script_Batak
277 \value Script_Bengali
278 \value [since 5.11] Script_Bhaiksuki
279 \value Script_Bopomofo
280 \value Script_Brahmi
281 \value Script_Braille
282 \value Script_Buginese
283 \value Script_Buhid
284 \value Script_CanadianAboriginal
285 \value Script_Carian
286 \value [since 5.5] Script_CaucasianAlbanian
287 \value Script_Chakma
288 \value Script_Cham
289 \value Script_Cherokee
290 \value [since 5.15] Script_Chorasmian
291 \value Script_Coptic
292 \value Script_Cuneiform
293 \value Script_Cypriot
294 \value [since 6.3] Script_CyproMinoan
295 \value Script_Cyrillic
296 \value Script_Deseret
297 \value Script_Devanagari
298 \value [since 5.15] Script_DivesAkuru
299 \value [since 5.15] Script_Dogra
300 \value [since 5.5] Script_Duployan
301 \value Script_EgyptianHieroglyphs
302 \value [since 5.5] Script_Elbasan
303 \value [since 5.15] Script_Elymaic
304 \value Script_Ethiopic
305 \value Script_Georgian
306 \value Script_Glagolitic
307 \value Script_Gothic
308 \value [since 5.5] Script_Grantha
309 \value Script_Greek
310 \value Script_Gujarati
311 \value [since 5.15] Script_GunjalaGondi
312 \value Script_Gurmukhi
313 \value Script_Han
314 \value Script_Hangul
315 \value [since 5.15] Script_HanifiRohingya
316 \value Script_Hanunoo
317 \value [since 5.6] Script_Hatran
318 \value Script_Hebrew
319 \value Script_Hiragana
320 \value Script_ImperialAramaic
321 \value Script_InscriptionalPahlavi
322 \value Script_InscriptionalParthian
323 \value Script_Javanese
324 \value Script_Kaithi
325 \value Script_Kannada
326 \value Script_Katakana
327 \value [since 6.5] Script_Kawi
328 \value Script_KayahLi
329 \value Script_Kharoshthi
330 \value [since 5.15] Script_KhitanSmallScript
331 \value Script_Khmer
332 \value [since 5.5] Script_Khojki
333 \value [since 5.5] Script_Khudawadi
334 \value Script_Lao
335 \value Script_Latin
336 \value Script_Lepcha
337 \value Script_Limbu
338 \value [since 5.5] Script_LinearA
339 \value Script_LinearB
340 \value Script_Lisu
341 \value Script_Lycian
342 \value Script_Lydian
343 \value [since 5.5] Script_Mahajani
344 \value [since 5.15] Script_Makasar
345 \value Script_Malayalam
346 \value Script_Mandaic
347 \value [since 5.5] Script_Manichaean
348 \value [since 5.11] Script_Marchen
349 \value [since 5.11] Script_MasaramGondi
350 \value [since 5.15] Script_Medefaidrin
351 \value Script_MeeteiMayek
352 \value [since 5.5] Script_MendeKikakui
353 \value Script_MeroiticCursive
354 \value Script_MeroiticHieroglyphs
355 \value Script_Miao
356 \value [since 5.5] Script_Modi
357 \value Script_Mongolian
358 \value [since 5.5] Script_Mro
359 \value [since 5.6] Script_Multani
360 \value Script_Myanmar
361 \value [since 5.5] Script_Nabataean
362 \value [since 6.3] Script_NagMundari
363 \value [since 5.15] Script_Nandinagari
364 \value [since 5.11] Script_Newa
365 \value Script_NewTaiLue
366 \value Script_Nko
367 \value [since 5.11] Script_Nushu
368 \value [since 5.15] Script_NyiakengPuachueHmong
369 \value Script_Ogham
370 \value Script_OlChiki
371 \value [since 5.6] Script_OldHungarian
372 \value Script_OldItalic
373 \value [since 5.5] Script_OldNorthArabian
374 \value [since 5.5] Script_OldPermic
375 \value Script_OldPersian
376 \value [since 5.15] Script_OldSogdian
377 \value Script_OldSouthArabian
378 \value Script_OldTurkic
379 \value [since 6.3] Script_OldUyghur
380 \value Script_Oriya
381 \value [since 5.11] Script_Osage
382 \value Script_Osmanya
383 \value [since 5.5] Script_PahawhHmong
384 \value [since 5.5] Script_Palmyrene
385 \value [since 5.5] Script_PauCinHau
386 \value Script_PhagsPa
387 \value Script_Phoenician
388 \value [since 5.5] Script_PsalterPahlavi
389 \value Script_Rejang
390 \value Script_Runic
391 \value Script_Samaritan
392 \value Script_Saurashtra
393 \value Script_Sharada
394 \value Script_Shavian
395 \value [since 5.5] Script_Siddham
396 \value [since 5.6] Script_SignWriting
397 \value Script_Sinhala
398 \value [since 5.15] Script_Sogdian
399 \value Script_SoraSompeng
400 \value [since 5.11] Script_Soyombo
401 \value Script_Sundanese
402 \value Script_SylotiNagri
403 \value Script_Syriac
404 \value Script_Tagalog
405 \value Script_Tagbanwa
406 \value Script_TaiLe
407 \value Script_TaiTham
408 \value Script_TaiViet
409 \value Script_Takri
410 \value Script_Tamil
411 \value [since 5.11] Script_Tangut
412 \value [since 6.3] Script_Tangsa
413 \value Script_Telugu
414 \value Script_Thaana
415 \value Script_Thai
416 \value Script_Tibetan
417 \value Script_Tifinagh
418 \value [since 5.5] Script_Tirhuta
419 \value [since 6.3] Script_Toto
420 \value Script_Ugaritic
421 \value Script_Vai
422 \value [since 6.3] Script_Vithkuqi
423 \value [since 5.15] Script_Wancho
424 \value [since 5.5] Script_WarangCiti
425 \value [since 5.15] Script_Yezidi
426 \value Script_Yi
427 \value [since 5.11] Script_ZanabazarSquare
428
429 \omitvalue ScriptCount
430
431 \sa script()
432*/
433
434/*!
435 \enum QChar::Direction
436
437 This enum type defines the Unicode direction attributes. See the
438 \l{https://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode
439 Standard} for a description of the values.
440
441 In order to conform to C/C++ naming conventions "Dir" is prepended
442 to the codes used in the Unicode Standard.
443
444 \value DirAL
445 \value DirAN
446 \value DirB
447 \value DirBN
448 \value DirCS
449 \value DirEN
450 \value DirES
451 \value DirET
452 \value [since 5.3] DirFSI
453 \value DirL
454 \value DirLRE
455 \value [since 5.3] DirLRI
456 \value DirLRO
457 \value DirNSM
458 \value DirON
459 \value DirPDF
460 \value [since 5.3] DirPDI
461 \value DirR
462 \value DirRLE
463 \value [since 5.3] DirRLI
464 \value DirRLO
465 \value DirS
466 \value DirWS
467
468 \sa direction()
469*/
470
471/*!
472 \enum QChar::Decomposition
473
474 This enum type defines the Unicode decomposition attributes. See
475 the \l{Unicode standard} for a description of the values.
476
477 \value NoDecomposition
478 \value Canonical
479 \value Circle
480 \value Compat
481 \value Final
482 \value Font
483 \value Fraction
484 \value Initial
485 \value Isolated
486 \value Medial
487 \value Narrow
488 \value NoBreak
489 \value Small
490 \value Square
491 \value Sub
492 \value Super
493 \value Vertical
494 \value Wide
495
496 \sa decomposition()
497*/
498
499/*!
500 \enum QChar::JoiningType
    \since 5.3
502
503 This enum type defines the Unicode joining type attributes. See the
504 \l{Unicode standard} for a description of the values.
505
506 In order to conform to C/C++ naming conventions "Joining_" is prepended
507 to the codes used in the Unicode Standard.
508
509 \value Joining_None
510 \value Joining_Causing
511 \value Joining_Dual
512 \value Joining_Right
513 \value Joining_Left
514 \value Joining_Transparent
515
516 \sa joiningType()
517*/
518
519/*!
520 \enum QChar::CombiningClass
521
522 \internal
523
524 This enum type defines names for some of the Unicode combining
525 classes. See the \l{Unicode Standard} for a description of the values.
526
527 \value Combining_Above
528 \value Combining_AboveAttached
529 \value Combining_AboveLeft
530 \value Combining_AboveLeftAttached
531 \value Combining_AboveRight
532 \value Combining_AboveRightAttached
533 \value Combining_Below
534 \value Combining_BelowAttached
535 \value Combining_BelowLeft
536 \value Combining_BelowLeftAttached
537 \value Combining_BelowRight
538 \value Combining_BelowRightAttached
539 \value Combining_DoubleAbove
540 \value Combining_DoubleBelow
541 \value Combining_IotaSubscript
542 \value Combining_Left
543 \value Combining_LeftAttached
544 \value Combining_Right
545 \value Combining_RightAttached
546*/
547
548/*!
549 \enum QChar::SpecialCharacter
550
551 \value Null A QChar with this value isNull().
552 \value Tabulation Character tabulation.
553 \value LineFeed
554 \value FormFeed
555 \value CarriageReturn
556 \value Space
557 \value Nbsp Non-breaking space.
558 \value SoftHyphen
559 \value ReplacementCharacter The character shown when a font has no glyph
560 for a certain codepoint. A special question mark character is often
561 used. Codecs use this codepoint when input data cannot be
562 represented in Unicode.
563 \value ObjectReplacementCharacter Used to represent an object such as an
564 image when such objects cannot be presented.
565 \value ByteOrderMark
566 \value ByteOrderSwapped
567 \value ParagraphSeparator
568 \value LineSeparator
569 \value [since 6.2] VisualTabCharacter Used to represent a tabulation as a horizontal arrow.
570 \value LastValidCodePoint
571*/
572
573/*!
574 \fn void QChar::setCell(uchar cell)
575 \internal
576*/
577
578/*!
579 \fn void QChar::setRow(uchar row)
580 \internal
581*/
582
583/*!
584 \fn QChar::QChar()
585
586 Constructs a null QChar ('\\0').
587
588 \sa isNull()
589*/
590
591/*!
592 \fn QChar::QChar(QLatin1Char ch)
593
594 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
595*/
596
597/*!
598 \fn QChar::QChar(SpecialCharacter ch)
599
600 Constructs a QChar for the predefined character value \a ch.
601*/
602
603/*!
604 \fn QChar::QChar(char16_t ch)
605 \since 5.10
606
607 Constructs a QChar corresponding to the UTF-16 character \a ch.
608*/
609
610/*!
611 \fn QChar::QChar(wchar_t ch)
612 \since 5.10
613
614 Constructs a QChar corresponding to the wide character \a ch.
615
616 \note This constructor is only available on Windows.
617*/
618
619/*!
620 \fn QChar::QChar(char ch)
621
622 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
623
624 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
625 is defined.
626
627 \sa QT_NO_CAST_FROM_ASCII
628*/
629
630/*!
631 \fn QChar::QChar(uchar ch)
632
633 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
634
635 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
636 or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
637
638 \sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
639*/
640
641/*!
642 \fn QChar::QChar(uchar cell, uchar row)
643
644 Constructs a QChar for Unicode cell \a cell in row \a row.
645
646 \sa cell(), row()
647*/
648
649/*!
650 \fn QChar::QChar(ushort code)
651
652 Constructs a QChar for the character with Unicode code point \a code.
653*/
654
655/*!
656 \fn QChar::QChar(short code)
657
658 Constructs a QChar for the character with Unicode code point \a code.
659*/
660
661/*!
662 \fn QChar::QChar(uint code)
663
664 Constructs a QChar for the character with Unicode code point \a code.
665*/
666
667/*!
668 \fn QChar::QChar(int code)
669
670 Constructs a QChar for the character with Unicode code point \a code.
671*/
672
673/*!
674 \fn static QChar QChar::fromUcs2(char16_t c)
675 \since 6.0
676
677 Constructs a QChar from UTF-16 character \a c.
678
679 \sa fromUcs4()
680*/
681
682/*!
683 \fn static auto QChar::fromUcs4(char32_t c)
684 \since 6.0
685
686 Returns an anonymous struct that
687 \list
688 \li contains a \c{char16_t chars[2]} array,
689 \li can be implicitly converted to a QStringView, and
690 \li iterated over with a C++11 ranged for loop.
691 \endlist
692
693 If \a c requires surrogates, \c{chars[0]} contains the high surrogate
694 and \c{chars[1]} the low surrogate, and the QStringView has size 2.
695 Otherwise, \c{chars[0]} contains \a c and \c{chars[1]} is
696 \l{QChar::isNull}{null}, and the QStringView has size 1.
697
698 This allows easy use of the result:
699
700 \code
701 QString s;
702 s += QChar::fromUcs4(ch);
703 \endcode
704
705 \code
706 for (char16_t c16 : QChar::fromUcs4(ch))
707 use(c16);
708 \endcode
709
710 \sa fromUcs2(), requiresSurrogates()
711*/
712
713/*!
714 \fn bool QChar::isNull() const
715
716 Returns \c true if the character is the Unicode character 0x0000
717 ('\\0'); otherwise returns \c false.
718*/
719
720/*!
721 \fn uchar QChar::cell() const
722
723 Returns the cell (least significant byte) of the Unicode character.
724
725 \sa row()
726*/
727
728/*!
729 \fn uchar QChar::row() const
730
731 Returns the row (most significant byte) of the Unicode character.
732
733 \sa cell()
734*/
735
736/*!
737 \fn bool QChar::isPrint() const
738
739 Returns \c true if the character is a printable character; otherwise
740 returns \c false. This is any character not of category Other_*.
741
742 Note that this gives no indication of whether the character is
743 available in a particular font.
744*/
745
746/*!
747 \overload
748 \since 5.0
749
750 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
751 a printable character; otherwise returns \c false.
752 This is any character not of category Other_*.
753
754 Note that this gives no indication of whether the character is
755 available in a particular font.
756
757 \note Before Qt 6, this function took a \c uint argument.
758*/
759bool QChar::isPrint(char32_t ucs4) noexcept
760{
761 if (ucs4 > LastValidCodePoint)
762 return false;
763 const int test = FLAG(Other_Control) |
764 FLAG(Other_Format) |
765 FLAG(Other_Surrogate) |
766 FLAG(Other_PrivateUse) |
767 FLAG(Other_NotAssigned);
768 return !(FLAG(qGetProp(ucs4)->category) & test);
769}
770
771/*!
772 \fn bool QChar::isSpace() const
773
774 Returns \c true if the character is a separator character
775 (Separator_* categories or certain code points from Other_Control category);
776 otherwise returns \c false.
777*/
778
779/*!
780 \fn bool QChar::isSpace(char32_t ucs4)
781 \overload
782 \since 5.0
783
784 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
785 a separator character (Separator_* categories or certain code points
786 from Other_Control category); otherwise returns \c false.
787
788 \note Before Qt 6, this function took a \c uint argument.
789*/
790
791/*!
792 \internal
793*/
794bool QT_FASTCALL QChar::isSpace_helper(char32_t ucs4) noexcept
795{
796 if (ucs4 > LastValidCodePoint)
797 return false;
798 const int test = FLAG(Separator_Space) |
799 FLAG(Separator_Line) |
800 FLAG(Separator_Paragraph);
801 return FLAG(qGetProp(ucs4)->category) & test;
802}
803
804/*!
805 \fn bool QChar::isMark() const
806
807 Returns \c true if the character is a mark (Mark_* categories);
808 otherwise returns \c false.
809
810 See QChar::Category for more information regarding marks.
811*/
812
813/*!
814 \overload
815 \since 5.0
816
817 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
818 a mark (Mark_* categories); otherwise returns \c false.
819
820 \note Before Qt 6, this function took a \c uint argument.
821*/
822bool QChar::isMark(char32_t ucs4) noexcept
823{
824 if (ucs4 > LastValidCodePoint)
825 return false;
826 const int test = FLAG(Mark_NonSpacing) |
827 FLAG(Mark_SpacingCombining) |
828 FLAG(Mark_Enclosing);
829 return FLAG(qGetProp(ucs4)->category) & test;
830}
831
832/*!
833 \fn bool QChar::isPunct() const
834
835 Returns \c true if the character is a punctuation mark (Punctuation_*
836 categories); otherwise returns \c false.
837*/
838
839/*!
840 \overload
841 \since 5.0
842
843 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
844 a punctuation mark (Punctuation_* categories); otherwise returns \c false.
845
846 \note Before Qt 6, this function took a \c uint argument.
847*/
848bool QChar::isPunct(char32_t ucs4) noexcept
849{
850 if (ucs4 > LastValidCodePoint)
851 return false;
852 const int test = FLAG(Punctuation_Connector) |
853 FLAG(Punctuation_Dash) |
854 FLAG(Punctuation_Open) |
855 FLAG(Punctuation_Close) |
856 FLAG(Punctuation_InitialQuote) |
857 FLAG(Punctuation_FinalQuote) |
858 FLAG(Punctuation_Other);
859 return FLAG(qGetProp(ucs4)->category) & test;
860}
861
862/*!
863 \fn bool QChar::isSymbol() const
864
865 Returns \c true if the character is a symbol (Symbol_* categories);
866 otherwise returns \c false.
867*/
868
869/*!
870 \overload
871 \since 5.0
872
873 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
874 a symbol (Symbol_* categories); otherwise returns \c false.
875
876 \note Before Qt 6, this function took a \c uint argument.
877*/
878bool QChar::isSymbol(char32_t ucs4) noexcept
879{
880 if (ucs4 > LastValidCodePoint)
881 return false;
882 const int test = FLAG(Symbol_Math) |
883 FLAG(Symbol_Currency) |
884 FLAG(Symbol_Modifier) |
885 FLAG(Symbol_Other);
886 return FLAG(qGetProp(ucs4)->category) & test;
887}
888
889/*!
890 \fn bool QChar::isLetter() const
891
892 Returns \c true if the character is a letter (Letter_* categories);
893 otherwise returns \c false.
894*/
895
896/*!
897 \fn bool QChar::isLetter(char32_t ucs4)
898 \overload
899 \since 5.0
900
901 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
902 a letter (Letter_* categories); otherwise returns \c false.
903
904 \note Before Qt 6, this function took a \c uint argument.
905*/
906
907/*!
908 \internal
909*/
910bool QT_FASTCALL QChar::isLetter_helper(char32_t ucs4) noexcept
911{
912 if (ucs4 > LastValidCodePoint)
913 return false;
914 const int test = FLAG(Letter_Uppercase) |
915 FLAG(Letter_Lowercase) |
916 FLAG(Letter_Titlecase) |
917 FLAG(Letter_Modifier) |
918 FLAG(Letter_Other);
919 return FLAG(qGetProp(ucs4)->category) & test;
920}
921
922/*!
923 \fn bool QChar::isNumber() const
924
925 Returns \c true if the character is a number (Number_* categories,
926 not just 0-9); otherwise returns \c false.
927
928 \sa isDigit()
929*/
930
931/*!
932 \fn bool QChar::isNumber(char32_t ucs4)
933 \overload
934 \since 5.0
935
936 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
937 a number (Number_* categories, not just 0-9); otherwise returns \c false.
938
939 \note Before Qt 6, this function took a \c uint argument.
940
941 \sa isDigit()
942*/
943
944/*!
945 \internal
946*/
947bool QT_FASTCALL QChar::isNumber_helper(char32_t ucs4) noexcept
948{
949 if (ucs4 > LastValidCodePoint)
950 return false;
951 const int test = FLAG(Number_DecimalDigit) |
952 FLAG(Number_Letter) |
953 FLAG(Number_Other);
954 return FLAG(qGetProp(ucs4)->category) & test;
955}
956
957/*!
958 \fn bool QChar::isLetterOrNumber() const
959
960 Returns \c true if the character is a letter or number (Letter_* or
961 Number_* categories); otherwise returns \c false.
962*/
963
964/*!
965 \fn bool QChar::isLetterOrNumber(char32_t ucs4)
966 \overload
967 \since 5.0
968
969 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
970 a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
971
972 \note Before Qt 6, this function took a \c uint argument.
973*/
974
975/*!
976 \internal
977*/
978bool QT_FASTCALL QChar::isLetterOrNumber_helper(char32_t ucs4) noexcept
979{
980 if (ucs4 > LastValidCodePoint)
981 return false;
982 const int test = FLAG(Letter_Uppercase) |
983 FLAG(Letter_Lowercase) |
984 FLAG(Letter_Titlecase) |
985 FLAG(Letter_Modifier) |
986 FLAG(Letter_Other) |
987 FLAG(Number_DecimalDigit) |
988 FLAG(Number_Letter) |
989 FLAG(Number_Other);
990 return FLAG(qGetProp(ucs4)->category) & test;
991}
992
993/*!
994 \fn bool QChar::isDigit() const
995
996 Returns \c true if the character is a decimal digit
997 (Number_DecimalDigit); otherwise returns \c false.
998
999 \sa isNumber()
1000*/
1001
1002/*!
1003 \fn bool QChar::isDigit(char32_t ucs4)
1004 \overload
1005 \since 5.0
1006
1007 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
1008 a decimal digit (Number_DecimalDigit); otherwise returns \c false.
1009
1010 \note Before Qt 6, this function took a \c uint argument.
1011
1012 \sa isNumber()
1013*/
1014
1015/*!
1016 \fn bool QChar::isNonCharacter() const
1017 \since 5.0
1018
1019 Returns \c true if the QChar is a non-character; false otherwise.
1020
1021 Unicode has a certain number of code points that are classified
1022 as "non-characters:" that is, they can be used for internal purposes
1023 in applications but cannot be used for text interchange.
    Those are the last two entries in each Unicode Plane ([0xfffe..0xffff],
1025 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1026*/
1027
1028/*!
1029 \fn bool QChar::isHighSurrogate() const
1030
1031 Returns \c true if the QChar is the high part of a UTF16 surrogate
    (that is, if its code point is in range [0xd800..0xdbff]); false otherwise.
1033*/
1034
1035/*!
1036 \fn bool QChar::isLowSurrogate() const
1037
1038 Returns \c true if the QChar is the low part of a UTF16 surrogate
    (that is, if its code point is in range [0xdc00..0xdfff]); false otherwise.
1040*/
1041
1042/*!
1043 \fn bool QChar::isSurrogate() const
1044 \since 5.0
1045
1046 Returns \c true if the QChar contains a code point that is in either
1047 the high or the low part of the UTF-16 surrogate range
    (that is, if its code point is in range [0xd800..0xdfff]); false otherwise.
1049*/
1050
1051/*!
1052 \fn static bool QChar::isNonCharacter(char32_t ucs4)
1053 \overload
1054 \since 5.0
1055
1056 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1057 is a non-character; false otherwise.
1058
1059 Unicode has a certain number of code points that are classified
1060 as "non-characters:" that is, they can be used for internal purposes
1061 in applications but cannot be used for text interchange.
    Those are the last two entries in each Unicode Plane ([0xfffe..0xffff],
1063 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1064
1065 \note Before Qt 6, this function took a \c uint argument.
1066*/
1067
1068/*!
1069 \fn static bool QChar::isHighSurrogate(char32_t ucs4)
1070 \overload
1071
1072 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1073 is the high part of a UTF16 surrogate
    (that is, if its code point is in range [0xd800..0xdbff]); false otherwise.
1075
1076 \note Before Qt 6, this function took a \c uint argument.
1077*/
1078
1079/*!
1080 \fn static bool QChar::isLowSurrogate(char32_t ucs4)
1081 \overload
1082
1083 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1084 is the low part of a UTF16 surrogate
    (that is, if its code point is in range [0xdc00..0xdfff]); false otherwise.
1086
1087 \note Before Qt 6, this function took a \c uint argument.
1088*/
1089
1090/*!
1091 \fn static bool QChar::isSurrogate(char32_t ucs4)
1092 \overload
1093 \since 5.0
1094
1095 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1096 contains a code point that is in either the high or the low part of the
1097 UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]);
1098 false otherwise.
1099
1100 \note Before Qt 6, this function took a \c uint argument.
1101*/
1102
1103/*!
1104 \fn static bool QChar::requiresSurrogates(char32_t ucs4)
1105
1106 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1107 can be split into the high and low parts of a UTF16 surrogate
1108 (for example if its code point is greater than or equal to 0x10000);
1109 false otherwise.
1110
1111 \note Before Qt 6, this function took a \c uint argument.
1112*/
1113
1114/*!
1115 \fn static char32_t QChar::surrogateToUcs4(char16_t high, char16_t low)
1116
1117 Converts a UTF16 surrogate pair with the given \a high and \a low values
1118 to its UCS-4-encoded code point.
1119
1120 \note Before Qt 6, this function took \c ushort arguments and returned \c uint.
1121*/
1122
1123/*!
1124 \fn static char32_t QChar::surrogateToUcs4(QChar high, QChar low)
1125 \overload
1126
1127 Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1128
1129 \note Before Qt 6, this function returned \c uint.
1130*/
1131
1132/*!
1133 \fn static char16_t QChar::highSurrogate(char32_t ucs4)
1134
1135 Returns the high surrogate part of a UCS-4-encoded code point.
1136 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1137
1138 \note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1139*/
1140
1141/*!
1142 \fn static char16_t QChar::lowSurrogate(char32_t ucs4)
1143
1144 Returns the low surrogate part of a UCS-4-encoded code point.
1145 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1146
1147 \note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1148*/
1149
1150/*!
1151 \fn int QChar::digitValue() const
1152
1153 Returns the numeric value of the digit, or -1 if the character is not a digit.
1154*/
1155
1156/*!
1157 \overload
1158 Returns the numeric value of the digit specified by the UCS-4-encoded
1159 character, \a ucs4, or -1 if the character is not a digit.
1160
1161 \note Before Qt 6, this function took a \c uint argument.
1162*/
1163int QChar::digitValue(char32_t ucs4) noexcept
1164{
1165 if (ucs4 > LastValidCodePoint)
1166 return -1;
1167 return qGetProp(ucs4)->digitValue;
1168}
1169
1170/*!
1171 \fn QChar::Category QChar::category() const
1172
1173 Returns the character's category.
1174*/
1175
1176/*!
1177 \overload
1178 Returns the category of the UCS-4-encoded character specified by \a ucs4.
1179
1180 \note Before Qt 6, this function took a \c uint argument.
1181*/
1182QChar::Category QChar::category(char32_t ucs4) noexcept
1183{
1184 if (ucs4 > LastValidCodePoint)
1185 return QChar::Other_NotAssigned;
1186 return (QChar::Category) qGetProp(ucs4)->category;
1187}
1188
1189/*!
1190 \fn QChar::Direction QChar::direction() const
1191
1192 Returns the character's direction.
1193*/
1194
1195/*!
1196 \overload
1197 Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1198
1199 \note Before Qt 6, this function took a \c uint argument.
1200*/
1201QChar::Direction QChar::direction(char32_t ucs4) noexcept
1202{
1203 if (ucs4 > LastValidCodePoint)
1204 return QChar::DirL;
1205 return (QChar::Direction) qGetProp(ucs4)->direction;
1206}
1207
1208/*!
1209 \fn QChar::JoiningType QChar::joiningType() const
1210 \since 5.3
1211
1212 Returns information about the joining type attributes of the character
1213 (needed for certain languages such as Arabic or Syriac).
1214*/
1215
1216/*!
1217 \overload
1218 \since 5.3
1219
1220 Returns information about the joining type attributes of the UCS-4-encoded
1221 character specified by \a ucs4
1222 (needed for certain languages such as Arabic or Syriac).
1223
1224 \note Before Qt 6, this function took a \c uint argument.
1225*/
1226QChar::JoiningType QChar::joiningType(char32_t ucs4) noexcept
1227{
1228 if (ucs4 > LastValidCodePoint)
1229 return QChar::Joining_None;
1230 return QChar::JoiningType(qGetProp(ucs4)->joining);
1231}
1232
1233/*!
1234 \fn bool QChar::hasMirrored() const
1235
1236 Returns \c true if the character should be reversed if the text
1237 direction is reversed; otherwise returns \c false.
1238
1239 A bit faster equivalent of (ch.mirroredChar() != ch).
1240
1241 \sa mirroredChar()
1242*/
1243
1244/*!
1245 \overload
1246 \since 5.0
1247
1248 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1249 should be reversed if the text direction is reversed; otherwise returns \c false.
1250
1251 A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1252
1253 \note Before Qt 6, this function took a \c uint argument.
1254
1255 \sa mirroredChar()
1256*/
1257bool QChar::hasMirrored(char32_t ucs4) noexcept
1258{
1259 if (ucs4 > LastValidCodePoint)
1260 return false;
1261 return qGetProp(ucs4)->mirrorDiff != 0;
1262}
1263
1264/*!
1265 \fn bool QChar::isLower() const
1266
1267 Returns \c true if the character is a lowercase letter, for example
1268 category() is Letter_Lowercase.
1269
1270 \sa isUpper(), toLower(), toUpper()
1271*/
1272
1273/*!
1274 \fn static bool QChar::isLower(char32_t ucs4)
1275 \overload
1276 \since 5.0
1277
1278 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1279 is a lowercase letter, for example category() is Letter_Lowercase.
1280
1281 \note Before Qt 6, this function took a \c uint argument.
1282
1283 \sa isUpper(), toLower(), toUpper()
1284*/
1285
1286/*!
1287 \fn bool QChar::isUpper() const
1288
1289 Returns \c true if the character is an uppercase letter, for example
1290 category() is Letter_Uppercase.
1291
1292 \sa isLower(), toUpper(), toLower()
1293*/
1294
1295/*!
1296 \fn static bool QChar::isUpper(char32_t ucs4)
1297 \overload
1298 \since 5.0
1299
1300 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1301 is an uppercase letter, for example category() is Letter_Uppercase.
1302
1303 \note Before Qt 6, this function took a \c uint argument.
1304
1305 \sa isLower(), toUpper(), toLower()
1306*/
1307
1308/*!
1309 \fn bool QChar::isTitleCase() const
1310
1311 Returns \c true if the character is a titlecase letter, for example
1312 category() is Letter_Titlecase.
1313
1314 \sa isLower(), toUpper(), toLower(), toTitleCase()
1315*/
1316
1317/*!
1318 \fn static bool QChar::isTitleCase(char32_t ucs4)
1319 \overload
1320 \since 5.0
1321
1322 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1323 is a titlecase letter, for example category() is Letter_Titlecase.
1324
1325 \note Before Qt 6, this function took a \c uint argument.
1326
1327 \sa isLower(), toUpper(), toLower(), toTitleCase()
1328*/
1329/*!
1330 \fn QChar QChar::mirroredChar() const
1331
1332 Returns the mirrored character if this character is a mirrored
1333 character; otherwise returns the character itself.
1334
1335 \sa hasMirrored()
1336*/
1337
1338/*!
1339 \overload
1340 Returns the mirrored character if the UCS-4-encoded character specified
1341 by \a ucs4 is a mirrored character; otherwise returns the character itself.
1342
1343 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1344
1345 \sa hasMirrored()
1346*/
1347char32_t QChar::mirroredChar(char32_t ucs4) noexcept
1348{
1349 if (ucs4 > LastValidCodePoint)
1350 return ucs4;
1351 return ucs4 + qGetProp(ucs4)->mirrorDiff;
1352}
1353
1354// Constants for Hangul (de)composition, see UAX #15:
1355static constexpr char32_t Hangul_SBase = 0xac00;
1356static constexpr char32_t Hangul_LBase = 0x1100;
1357static constexpr char32_t Hangul_VBase = 0x1161;
1358static constexpr char32_t Hangul_TBase = 0x11a7;
1359static constexpr quint32 Hangul_LCount = 19;
1360static constexpr quint32 Hangul_VCount = 21;
1361static constexpr quint32 Hangul_TCount = 28;
1362static constexpr quint32 Hangul_NCount = Hangul_VCount * Hangul_TCount;
1363static constexpr quint32 Hangul_SCount = Hangul_LCount * Hangul_NCount;
1364
1365// buffer has to have a length of 3. It's needed for Hangul decomposition
1366static const QChar * QT_FASTCALL decompositionHelper(
1367 char32_t ucs4, qsizetype *length, QChar::Decomposition *tag, QChar *buffer)
1368{
1369 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1370 // compute Hangul syllable decomposition as per UAX #15
1371 const char32_t SIndex = ucs4 - Hangul_SBase;
1372 buffer[0] = QChar(Hangul_LBase + SIndex / Hangul_NCount); // L
1373 buffer[1] = QChar(Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount); // V
1374 buffer[2] = QChar(Hangul_TBase + SIndex % Hangul_TCount); // T
1375 *length = buffer[2].unicode() == Hangul_TBase ? 2 : 3;
1376 *tag = QChar::Canonical;
1377 return buffer;
1378 }
1379
1380 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1381 if (index == 0xffff) {
1382 *length = 0;
1383 *tag = QChar::NoDecomposition;
1384 return nullptr;
1385 }
1386
1387 const unsigned short *decomposition = uc_decomposition_map+index;
1388 *tag = QChar::Decomposition((*decomposition) & 0xff);
1389 *length = (*decomposition) >> 8;
1390 return reinterpret_cast<const QChar *>(decomposition + 1);
1391}
1392
1393/*!
1394 Decomposes a character into its constituent parts. Returns an empty string
1395 if no decomposition exists.
1396*/
1397QString QChar::decomposition() const
1398{
1399 return QChar::decomposition(ucs4: ucs);
1400}
1401
1402/*!
1403 \overload
1404 Decomposes the UCS-4-encoded character specified by \a ucs4 into its
1405 constituent parts. Returns an empty string if no decomposition exists.
1406
1407 \note Before Qt 6, this function took a \c uint argument.
1408*/
1409QString QChar::decomposition(char32_t ucs4)
1410{
1411 QChar buffer[3];
1412 qsizetype length;
1413 QChar::Decomposition tag;
1414 const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1415 return QString(d, length);
1416}
1417
1418/*!
1419 \fn QChar::Decomposition QChar::decompositionTag() const
1420
1421 Returns the tag defining the decomposition of the character. Returns
1422 QChar::NoDecomposition if no decomposition exists.
1423*/
1424
1425/*!
1426 \overload
1427 Returns the tag defining the decomposition of the UCS-4-encoded character
1428 specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1429
1430 \note Before Qt 6, this function took a \c uint argument.
1431*/
1432QChar::Decomposition QChar::decompositionTag(char32_t ucs4) noexcept
1433{
1434 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1435 return QChar::Canonical;
1436 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1437 if (index == 0xffff)
1438 return QChar::NoDecomposition;
1439 return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1440}
1441
1442/*!
1443 \fn unsigned char QChar::combiningClass() const
1444
1445 Returns the combining class for the character as defined in the
1446 Unicode standard. This is mainly useful as a positioning hint for
1447 marks attached to a base character.
1448
1449 The Qt text rendering engine uses this information to correctly
1450 position non-spacing marks around a base character.
1451*/
1452
1453/*!
1454 \overload
1455 Returns the combining class for the UCS-4-encoded character specified by
1456 \a ucs4, as defined in the Unicode standard.
1457
1458 \note Before Qt 6, this function took a \c uint argument.
1459*/
1460unsigned char QChar::combiningClass(char32_t ucs4) noexcept
1461{
1462 if (ucs4 > LastValidCodePoint)
1463 return 0;
1464 return (unsigned char) qGetProp(ucs4)->combiningClass;
1465}
1466
1467/*!
1468 \fn QChar::Script QChar::script() const
1469 \since 5.1
1470
1471 Returns the Unicode script property value for this character.
1472*/
1473
1474/*!
1475 \overload
1476 \since 5.1
1477
1478 Returns the Unicode script property value for the character specified in
1479 its UCS-4-encoded form as \a ucs4.
1480
1481 \note Before Qt 6, this function took a \c uint argument.
1482*/
1483QChar::Script QChar::script(char32_t ucs4) noexcept
1484{
1485 if (ucs4 > LastValidCodePoint)
1486 return QChar::Script_Unknown;
1487 return (QChar::Script) qGetProp(ucs4)->script;
1488}
1489
1490/*!
1491 \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1492
1493 Returns the Unicode version that introduced this character.
1494*/
1495
1496/*!
1497 \overload
1498 Returns the Unicode version that introduced the character specified in
1499 its UCS-4-encoded form as \a ucs4.
1500
1501 \note Before Qt 6, this function took a \c uint argument.
1502*/
1503QChar::UnicodeVersion QChar::unicodeVersion(char32_t ucs4) noexcept
1504{
1505 if (ucs4 > LastValidCodePoint)
1506 return QChar::Unicode_Unassigned;
1507 return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1508}
1509
1510/*!
1511 Returns the most recent supported Unicode version.
1512*/
1513QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1514{
1515 return UNICODE_DATA_VERSION;
1516}
1517
1518static auto fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept
1519{
1520 struct R {
1521 char16_t chars[MaxSpecialCaseLength + 1];
1522 qint8 sz;
1523
1524 // iterable
1525 auto begin() const { return chars; }
1526 auto end() const { return chars + sz; }
1527 // QStringView-compatible
1528 auto data() const { return chars; }
1529 auto size() const { return sz; }
1530 } result;
1531 Q_ASSERT(uc <= QChar::LastValidCodePoint);
1532
1533 auto pp = result.chars;
1534
1535 const auto fold = qGetProp(ucs4: uc)->cases[which];
1536 const auto caseDiff = fold.diff;
1537
1538 if (Q_UNLIKELY(fold.special)) {
1539 const auto *specialCase = specialCaseMap + caseDiff;
1540 auto length = *specialCase++;
1541 while (length--)
1542 *pp++ = *specialCase++;
1543 } else {
1544 // so far, case conversion never changes planes (guaranteed by the qunicodetables generator)
1545 for (char16_t c : QChar::fromUcs4(c: uc + caseDiff))
1546 *pp++ = c;
1547 }
1548 result.sz = pp - result.chars;
1549 return result;
1550}
1551
1552template <typename T>
1553Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1554{
1555 const auto fold = qGetProp(uc)->cases[which];
1556
1557 if (Q_UNLIKELY(fold.special)) {
1558 const ushort *specialCase = specialCaseMap + fold.diff;
1559 // so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1560 return *specialCase == 1 ? specialCase[1] : uc;
1561 }
1562
1563 return uc + fold.diff;
1564}
1565
1566/*!
1567 \fn QChar QChar::toLower() const
1568
1569 Returns the lowercase equivalent if the character is uppercase or titlecase;
1570 otherwise returns the character itself.
1571*/
1572
1573/*!
1574 \overload
1575 Returns the lowercase equivalent of the UCS-4-encoded character specified
1576 by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1577 the character itself.
1578
1579 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1580*/
1581char32_t QChar::toLower(char32_t ucs4) noexcept
1582{
1583 if (ucs4 > LastValidCodePoint)
1584 return ucs4;
1585 return convertCase_helper(uc: ucs4, which: QUnicodeTables::LowerCase);
1586}
1587
1588/*!
1589 \fn QChar QChar::toUpper() const
1590
1591 Returns the uppercase equivalent if the character is lowercase or titlecase;
1592 otherwise returns the character itself.
1593*/
1594
1595/*!
1596 \overload
1597 Returns the uppercase equivalent of the UCS-4-encoded character specified
1598 by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1599 the character itself.
1600
1601 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1602*/
1603char32_t QChar::toUpper(char32_t ucs4) noexcept
1604{
1605 if (ucs4 > LastValidCodePoint)
1606 return ucs4;
1607 return convertCase_helper(uc: ucs4, which: QUnicodeTables::UpperCase);
1608}
1609
1610/*!
1611 \fn QChar QChar::toTitleCase() const
1612
1613 Returns the title case equivalent if the character is lowercase or uppercase;
1614 otherwise returns the character itself.
1615*/
1616
1617/*!
1618 \overload
1619 Returns the title case equivalent of the UCS-4-encoded character specified
1620 by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1621 the character itself.
1622
1623 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1624*/
1625char32_t QChar::toTitleCase(char32_t ucs4) noexcept
1626{
1627 if (ucs4 > LastValidCodePoint)
1628 return ucs4;
1629 return convertCase_helper(uc: ucs4, which: QUnicodeTables::TitleCase);
1630}
1631
1632static inline char32_t foldCase(const char16_t *ch, const char16_t *start)
1633{
1634 char32_t ucs4 = *ch;
1635 if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(ucs4: *(ch - 1)))
1636 ucs4 = QChar::surrogateToUcs4(high: *(ch - 1), low: ucs4);
1637 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1638}
1639
1640static inline char32_t foldCase(char32_t ch, char32_t &last) noexcept
1641{
1642 char32_t ucs4 = ch;
1643 if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(ucs4: last))
1644 ucs4 = QChar::surrogateToUcs4(high: last, low: ucs4);
1645 last = ch;
1646 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1647}
1648
1649static inline char16_t foldCase(char16_t ch) noexcept
1650{
1651 return convertCase_helper(uc: ch, which: QUnicodeTables::CaseFold);
1652}
1653
1654static inline QChar foldCase(QChar ch) noexcept
1655{
1656 return QChar(foldCase(ch: ch.unicode()));
1657}
1658
1659/*!
1660 \fn QChar QChar::toCaseFolded() const
1661
1662 Returns the case folded equivalent of the character.
1663 For most Unicode characters this is the same as toLower().
1664*/
1665
1666/*!
1667 \overload
1668 Returns the case folded equivalent of the UCS-4-encoded character specified
1669 by \a ucs4. For most Unicode characters this is the same as toLower().
1670
1671 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1672*/
1673char32_t QChar::toCaseFolded(char32_t ucs4) noexcept
1674{
1675 if (ucs4 > LastValidCodePoint)
1676 return ucs4;
1677 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1678}
1679
1680/*!
1681 \fn char QChar::toLatin1() const
1682
1683 Returns the Latin-1 character equivalent to the QChar, or 0. This
1684 is mainly useful for non-internationalized software.
1685
1686 \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1687 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1688
1689 \sa unicode()
1690*/
1691
1692/*!
1693 \fn QChar QChar::fromLatin1(char c)
1694
1695 Converts the Latin-1 character \a c to its equivalent QChar. This
1696 is mainly useful for non-internationalized software.
1697
1698 An alternative is to use QLatin1Char.
1699
1700 \sa toLatin1(), unicode()
1701*/
1702
1703#ifndef QT_NO_DATASTREAM
1704/*!
1705 \relates QChar
1706
1707 Writes the char \a chr to the stream \a out.
1708
1709 \sa {Serializing Qt Data Types}
1710*/
1711QDataStream &operator<<(QDataStream &out, QChar chr)
1712{
1713 out << quint16(chr.unicode());
1714 return out;
1715}
1716
1717/*!
1718 \relates QChar
1719
1720 Reads a char from the stream \a in into char \a chr.
1721
1722 \sa {Serializing Qt Data Types}
1723*/
1724QDataStream &operator>>(QDataStream &in, QChar &chr)
1725{
1726 quint16 u;
1727 in >> u;
1728 chr.unicode() = char16_t(u);
1729 return in;
1730}
1731#endif // QT_NO_DATASTREAM
1732
1733/*!
1734 \fn QChar::unicode()
1735
1736 Returns a reference to the numeric Unicode value of the QChar.
1737*/
1738
1739/*!
1740 \fn QChar::unicode() const
1741
1742 Returns the numeric Unicode value of the QChar.
1743*/
1744
1745/*****************************************************************************
1746 Documentation of QChar related functions
1747 *****************************************************************************/
1748
1749/*!
1750 \fn bool QChar::operator==(QChar c1, QChar c2)
1751
1752 Returns \c true if \a c1 and \a c2 are the same Unicode character;
1753 otherwise returns \c false.
1754*/
1755
1756/*!
1757 \fn bool QChar::operator!=(QChar c1, QChar c2)
1758
1759 Returns \c true if \a c1 and \a c2 are not the same Unicode
1760 character; otherwise returns \c false.
1761*/
1762
1763/*!
1764 \fn bool QChar::operator<=(QChar c1, QChar c2)
1765
1766 Returns \c true if the numeric Unicode value of \a c1 is less than
1767 or equal to that of \a c2; otherwise returns \c false.
1768*/
1769
1770/*!
1771 \fn bool QChar::operator>=(QChar c1, QChar c2)
1772
1773 Returns \c true if the numeric Unicode value of \a c1 is greater than
1774 or equal to that of \a c2; otherwise returns \c false.
1775*/
1776
1777/*!
1778 \fn bool QChar::operator<(QChar c1, QChar c2)
1779
1780 Returns \c true if the numeric Unicode value of \a c1 is less than
1781 that of \a c2; otherwise returns \c false.
1782*/
1783
1784/*!
1785 \fn bool QChar::operator>(QChar c1, QChar c2)
1786
1787 Returns \c true if the numeric Unicode value of \a c1 is greater than
1788 that of \a c2; otherwise returns \c false.
1789*/
1790
1791/*!
1792 \fn Qt::Literals::StringLiterals::operator""_L1(char ch)
1793
1794 \relates QLatin1Char
1795 \since 6.4
1796
1797 Literal operator that creates a QLatin1Char out of \a ch.
1798
1799 The following code creates a QLatin1Char:
1800 \code
1801 using namespace Qt::Literals::StringLiterals;
1802
1803 auto ch = 'a'_L1;
1804 \endcode
1805
1806 \sa Qt::Literals::StringLiterals
1807*/
1808
1809// ---------------------------------------------------------------------------
1810
1811
1812static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, qsizetype from)
1813{
1814 qsizetype length;
1815 QChar::Decomposition tag;
1816 QChar buffer[3];
1817
1818 QString &s = *str;
1819
1820 const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1821 const unsigned short *uc = utf16 + s.size();
1822 while (uc != utf16 + from) {
1823 char32_t ucs4 = *(--uc);
1824 if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1825 ushort high = *(uc - 1);
1826 if (QChar(high).isHighSurrogate()) {
1827 --uc;
1828 ucs4 = QChar::surrogateToUcs4(high, low: ucs4);
1829 }
1830 }
1831
1832 if (QChar::unicodeVersion(ucs4) > version)
1833 continue;
1834
1835 const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1836 if (!d || (canonical && tag != QChar::Canonical))
1837 continue;
1838
1839 qsizetype pos = uc - utf16;
1840 s.replace(i: pos, len: QChar::requiresSurrogates(ucs4) ? 2 : 1, s: d, slen: length);
1841 // since the replace invalidates the pointers and we do decomposition recursive
1842 utf16 = reinterpret_cast<unsigned short *>(s.data());
1843 uc = utf16 + pos + length;
1844 }
1845}
1846
1847
1848struct UCS2Pair {
1849 ushort u1;
1850 ushort u2;
1851};
1852
1853inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1854{ return ligature1.u1 < ligature2.u1; }
1855inline bool operator<(ushort u1, const UCS2Pair &ligature)
1856{ return u1 < ligature.u1; }
1857inline bool operator<(const UCS2Pair &ligature, ushort u1)
1858{ return ligature.u1 < u1; }
1859
1860struct UCS2SurrogatePair {
1861 UCS2Pair p1;
1862 UCS2Pair p2;
1863};
1864
1865inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1866{ return QChar::surrogateToUcs4(high: ligature1.p1.u1, low: ligature1.p1.u2) < QChar::surrogateToUcs4(high: ligature2.p1.u1, low: ligature2.p1.u2); }
1867inline bool operator<(char32_t u1, const UCS2SurrogatePair &ligature)
1868{ return u1 < QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2); }
1869inline bool operator<(const UCS2SurrogatePair &ligature, char32_t u1)
1870{ return QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2) < u1; }
1871
1872static char32_t inline ligatureHelper(char32_t u1, char32_t u2)
1873{
1874 if (u1 >= Hangul_LBase && u1 < Hangul_SBase + Hangul_SCount) {
1875 // compute Hangul syllable composition as per UAX #15
1876 // hangul L-V pair
1877 const char32_t LIndex = u1 - Hangul_LBase;
1878 if (LIndex < Hangul_LCount) {
1879 const char32_t VIndex = u2 - Hangul_VBase;
1880 if (VIndex < Hangul_VCount)
1881 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1882 }
1883 // hangul LV-T pair
1884 const char32_t SIndex = u1 - Hangul_SBase;
1885 if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1886 const char32_t TIndex = u2 - Hangul_TBase;
1887 if (TIndex < Hangul_TCount && TIndex)
1888 return u1 + TIndex;
1889 }
1890 }
1891
1892 const unsigned short index = GET_LIGATURE_INDEX(u2);
1893 if (index == 0xffff)
1894 return 0;
1895 const unsigned short *ligatures = uc_ligature_map+index;
1896 ushort length = *ligatures++;
1897 if (QChar::requiresSurrogates(ucs4: u1)) {
1898 const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1899 const UCS2SurrogatePair *r = std::lower_bound(first: data, last: data + length, val: u1);
1900 if (r != data + length && QChar::surrogateToUcs4(high: r->p1.u1, low: r->p1.u2) == u1)
1901 return QChar::surrogateToUcs4(high: r->p2.u1, low: r->p2.u2);
1902 } else {
1903 const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1904 const UCS2Pair *r = std::lower_bound(first: data, last: data + length, val: ushort(u1));
1905 if (r != data + length && r->u1 == ushort(u1))
1906 return r->u2;
1907 }
1908
1909 return 0;
1910}
1911
1912static void composeHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1913{
1914 QString &s = *str;
1915
1916 if (from < 0 || s.size() - from < 2)
1917 return;
1918
1919 char32_t stcode = 0; // starter code point
1920 qsizetype starter = -1; // starter position
1921 qsizetype next = -1; // to prevent i == next
1922 int lastCombining = 255; // to prevent combining > lastCombining
1923
1924 qsizetype pos = from;
1925 while (pos < s.size()) {
1926 qsizetype i = pos;
1927 char32_t uc = s.at(i: pos).unicode();
1928 if (QChar(uc).isHighSurrogate() && pos < s.size()-1) {
1929 ushort low = s.at(i: pos+1).unicode();
1930 if (QChar(low).isLowSurrogate()) {
1931 uc = QChar::surrogateToUcs4(high: uc, low);
1932 ++pos;
1933 }
1934 }
1935
1936 const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
1937 if (p->unicodeVersion > version) {
1938 starter = -1;
1939 next = -1; // to prevent i == next
1940 lastCombining = 255; // to prevent combining > lastCombining
1941 ++pos;
1942 continue;
1943 }
1944
1945 int combining = p->combiningClass;
1946 if ((i == next || combining > lastCombining) && starter >= from) {
1947 // allowed to form ligature with S
1948 char32_t ligature = ligatureHelper(u1: stcode, u2: uc);
1949 if (ligature) {
1950 stcode = ligature;
1951 QChar *d = s.data();
1952 // ligatureHelper() never changes planes
1953 qsizetype j = 0;
1954 for (QChar ch : QChar::fromUcs4(c: ligature))
1955 d[starter + j++] = ch;
1956 s.remove(i, len: j);
1957 continue;
1958 }
1959 }
1960 if (combining == 0) {
1961 starter = i;
1962 stcode = uc;
1963 next = pos + 1;
1964 }
1965 lastCombining = combining;
1966
1967 ++pos;
1968 }
1969}
1970
1971
1972static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1973{
1974 QString &s = *str;
1975 const qsizetype l = s.size()-1;
1976
1977 char32_t u1, u2;
1978 char16_t c1, c2;
1979
1980 qsizetype pos = from;
1981 while (pos < l) {
1982 qsizetype p2 = pos+1;
1983 u1 = s.at(i: pos).unicode();
1984 if (QChar::isHighSurrogate(ucs4: u1)) {
1985 const char16_t low = s.at(i: p2).unicode();
1986 if (QChar::isLowSurrogate(ucs4: low)) {
1987 u1 = QChar::surrogateToUcs4(high: u1, low);
1988 if (p2 >= l)
1989 break;
1990 ++p2;
1991 }
1992 }
1993 c1 = 0;
1994
1995 advance:
1996 u2 = s.at(i: p2).unicode();
1997 if (QChar::isHighSurrogate(ucs4: u2) && p2 < l) {
1998 const char16_t low = s.at(i: p2+1).unicode();
1999 if (QChar::isLowSurrogate(ucs4: low)) {
2000 u2 = QChar::surrogateToUcs4(high: u2, low);
2001 ++p2;
2002 }
2003 }
2004
2005 c2 = 0;
2006 {
2007 const QUnicodeTables::Properties *p = qGetProp(ucs4: u2);
2008 if (p->unicodeVersion <= version)
2009 c2 = p->combiningClass;
2010 }
2011 if (c2 == 0) {
2012 pos = p2+1;
2013 continue;
2014 }
2015
2016 if (c1 == 0) {
2017 const QUnicodeTables::Properties *p = qGetProp(ucs4: u1);
2018 if (p->unicodeVersion <= version)
2019 c1 = p->combiningClass;
2020 }
2021
2022 if (c1 > c2) {
2023 QChar *uc = s.data();
2024 qsizetype p = pos;
2025 // exchange characters
2026 for (QChar ch : QChar::fromUcs4(c: u2))
2027 uc[p++] = ch;
2028 for (QChar ch : QChar::fromUcs4(c: u1))
2029 uc[p++] = ch;
2030 if (pos > 0)
2031 --pos;
2032 if (pos > 0 && s.at(i: pos).isLowSurrogate())
2033 --pos;
2034 } else {
2035 ++pos;
2036 if (QChar::requiresSurrogates(ucs4: u1))
2037 ++pos;
2038
2039 u1 = u2;
2040 c1 = c2; // != 0
2041 p2 = pos + 1;
2042 if (QChar::requiresSurrogates(ucs4: u1))
2043 ++p2;
2044 if (p2 > l)
2045 break;
2046
2047 goto advance;
2048 }
2049 }
2050}
2051
2052// returns true if the text is in a desired Normalization Form already; false otherwise.
2053// sets lastStable to the position of the last stable code point
2054static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, qsizetype from, qsizetype *lastStable)
2055{
2056 static_assert(QString::NormalizationForm_D == 0);
2057 static_assert(QString::NormalizationForm_C == 1);
2058 static_assert(QString::NormalizationForm_KD == 2);
2059 static_assert(QString::NormalizationForm_KC == 3);
2060
2061 enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
2062
2063 const auto *string = reinterpret_cast<const char16_t *>(str->constData());
2064 qsizetype length = str->size();
2065
2066 // this avoids one out of bounds check in the loop
2067 while (length > from && QChar::isHighSurrogate(ucs4: string[length - 1]))
2068 --length;
2069
2070 uchar lastCombining = 0;
2071 for (qsizetype i = from; i < length; ++i) {
2072 qsizetype pos = i;
2073 char32_t uc = string[i];
2074 if (uc < 0x80) {
2075 // ASCII characters are stable code points
2076 lastCombining = 0;
2077 *lastStable = pos;
2078 continue;
2079 }
2080
2081 if (QChar::isHighSurrogate(ucs4: uc)) {
2082 ushort low = string[i + 1];
2083 if (!QChar::isLowSurrogate(ucs4: low)) {
2084 // treat surrogate like stable code point
2085 lastCombining = 0;
2086 *lastStable = pos;
2087 continue;
2088 }
2089 ++i;
2090 uc = QChar::surrogateToUcs4(high: uc, low);
2091 }
2092
2093 const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
2094
2095 if (p->combiningClass < lastCombining && p->combiningClass > 0)
2096 return false;
2097
2098 const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03;
2099 if (check != NFQC_YES)
2100 return false; // ### can we quick check NFQC_MAYBE ?
2101
2102 lastCombining = p->combiningClass;
2103 if (lastCombining == 0)
2104 *lastStable = pos;
2105 }
2106
2107 if (length != str->size()) // low surrogate parts at the end of text
2108 *lastStable = str->size() - 1;
2109
2110 return true;
2111}
2112
2113/*!
2114 \macro QT_IMPLICIT_QCHAR_CONSTRUCTION
2115 \since 6.0
2116 \relates QChar
2117
2118 Defining this macro makes certain QChar constructors implicit
2119 rather than explicit. They are explicit by default in order to enforce safe conversions:
2120
2121 \badcode
2122
2123 QString str = getString();
2124 if (str == 123) {
2125 // Oops, meant str == "123". By default does not compile,
2126 // *unless* this macro is defined, in which case, it's interpreted
2127 // as `if (str == QChar(123))`, that is, `if (str == '{')`.
2128 // Likely, not what we meant.
2129 }
2130
2131 \endcode
2132
2133 This macro is provided to keep existing code working; it is
2134 recommended to instead use explicit conversions and/or QLatin1Char.
2135 For instance:
2136
2137 \code
2138
2139 QChar c1 = 'x'; // OK, unless QT_NO_CAST_FROM_ASCII is defined
2140 QChar c2 = u'x'; // always OK, recommended
2141 QChar c3 = QLatin1Char('x'); // always OK, recommended
2142
2143 // from int to 1 UTF-16 code unit: must guarantee that the input is <= 0xFFFF
2144 QChar c4 = 120; // compile error, unless QT_IMPLICIT_QCHAR_CONSTRUCTION is defined
2145 QChar c5(120); // OK (direct initialization)
2146 auto c6 = QChar(120); // ditto
2147
2148 // from int/char32_t to 1/2 UTF-16 code units:
2149 // 𝄞 'MUSICAL SYMBOL G CLEF' (U+1D11E)
2150 auto c7 = QChar(0x1D11E); // compiles, but undefined behavior at runtime
2151 auto c8 = QChar::fromUcs4(0x1D11E); // always OK
2152 auto c9 = QChar::fromUcs4(U'\U0001D11E'); // always OK
2153 // => use c8/c9 as QStringView objects
2154
2155 \endcode
2156
2157 \sa QLatin1Char, QChar::fromUcs4, QT_NO_CAST_FROM_ASCII
2158*/
2159
2160QT_END_NAMESPACE
2161

source code of qtbase/src/corelib/text/qchar.cpp