1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qunicodetools_p.h"
5
6#include "qunicodetables_p.h"
7#include "qvarlengtharray.h"
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
// Single-bit mask helper: expands to an int in which only bit number x is set.
// The shift is performed on a signed int literal, so x must stay below the
// bit width of int.
#define FLAG(x) (1 << (x))
15
16QT_BEGIN_NAMESPACE
17
18using namespace Qt::StringLiterals;
19
// Switch consulted by getWordBreaks(): when it is non-zero, U+002E FULL STOP
// and U+003A COLON are forced to the MidNumLet / MidLetter word break classes.
// With QT_BUILD_INTERNAL it is a regular variable carrying Q_AUTOTEST_EXPORT
// (presumably so that autotests can toggle it - confirm against the tests);
// otherwise it is a compile-time constant 0.
#ifdef QT_BUILD_INTERNAL
Q_CONSTINIT Q_AUTOTEST_EXPORT
#else
constexpr
#endif
int qt_initcharattributes_default_algorithm_only = 0;
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
45using GBTableEntryType = quint16;
46
47// Check that we have enough bits in the table (in case
48// NumGraphemeBreakClasses grows too much).
49static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50 "Internal error: increase the size in bits of GBTableEntryType");
51
52// GB9, GB9a
53static const GBTableEntryType Extend_SpacingMark_ZWJ =
54 FLAG(QUnicodeTables::GraphemeBreak_Extend)
55 | FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
56 | FLAG(QUnicodeTables::GraphemeBreak_ZWJ);
57
58static const GBTableEntryType HardBreak = 0u;
59
60static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
61 Extend_SpacingMark_ZWJ, // Any
62 FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
63 HardBreak, // LF
64 HardBreak, // Control
65 Extend_SpacingMark_ZWJ, // Extend
66 Extend_SpacingMark_ZWJ, // ZWJ
67 Extend_SpacingMark_ZWJ, // RegionalIndicator
68 (Extend_SpacingMark_ZWJ
69 | FLAG(QUnicodeTables::GraphemeBreak_Any)
70 | FLAG(QUnicodeTables::GraphemeBreak_Prepend)
71 | FLAG(QUnicodeTables::GraphemeBreak_L)
72 | FLAG(QUnicodeTables::GraphemeBreak_V)
73 | FLAG(QUnicodeTables::GraphemeBreak_T)
74 | FLAG(QUnicodeTables::GraphemeBreak_LV)
75 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
76 | FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
77 | FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
78 ), // Prepend
79 Extend_SpacingMark_ZWJ, // SpacingMark
80 (Extend_SpacingMark_ZWJ
81 | FLAG(QUnicodeTables::GraphemeBreak_L)
82 | FLAG(QUnicodeTables::GraphemeBreak_V)
83 | FLAG(QUnicodeTables::GraphemeBreak_LV)
84 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
85 ), // L
86 (Extend_SpacingMark_ZWJ
87 | FLAG(QUnicodeTables::GraphemeBreak_V)
88 | FLAG(QUnicodeTables::GraphemeBreak_T)
89 ), // V
90 (Extend_SpacingMark_ZWJ
91 | FLAG(QUnicodeTables::GraphemeBreak_T)
92 ), // T
93 (Extend_SpacingMark_ZWJ
94 | FLAG(QUnicodeTables::GraphemeBreak_V)
95 | FLAG(QUnicodeTables::GraphemeBreak_T)
96 ), // LV
97 (Extend_SpacingMark_ZWJ
98 | FLAG(QUnicodeTables::GraphemeBreak_T)
99 ), // LVT
100 Extend_SpacingMark_ZWJ // Extended_Pictographic
101};
102
103static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
104 QUnicodeTables::GraphemeBreakClass second)
105{
106 return (breakTable[first] & FLAG(second)) == 0;
107}
108
109// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110// so we need to store some local state.
111enum class State : uchar {
112 Normal,
113 GB11_ExtPicExt, // saw a Extend after a Extended_Pictographic
114 GB11_ExtPicExtZWJ, // saw a ZWG after a Extended_Pictographic and zero or more Extend
115 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116};
117
118} // namespace GB
119
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
122 QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
123 GB::State state = GB::State::Normal;
124 for (qsizetype i = 0; i != len; ++i) {
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(ucs4: low)) {
130 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
131 ++i;
132 }
133 }
134
135 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
138 bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
139 bool handled = false;
140
141 switch (state) {
142 case GB::State::Normal:
143 break; // will deal with it below
144
145 case GB::State::GB11_ExtPicExt:
146 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
147 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
152 state = GB::State::GB11_ExtPicExtZWJ;
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
156 state = GB::State::Normal;
157 }
158 break;
159
160 case GB::State::GB11_ExtPicExtZWJ:
161 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
162 if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
163 shouldBreak = false;
164 handled = true;
165 }
166
167 state = GB::State::Normal;
168 break;
169
170 case GB::State::GB12_13_RI:
171 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
172 if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
173 shouldBreak = false;
174 handled = true;
175 }
176
177 state = GB::State::Normal;
178 break;
179 }
180
181 if (!handled) {
182 Q_ASSERT(state == GB::State::Normal);
183 if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
184 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
185 state = GB::State::GB11_ExtPicExt;
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
188 state = GB::State::GB11_ExtPicExtZWJ;
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192 state = GB::State::GB12_13_RI;
193 }
194 }
195
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
namespace WB {

// Verdicts stored in the word break pair table below; they are interpreted by
// getWordBreaks().
enum Action {
    NoBreak, // no boundary between the two classes
    Break,   // boundary between the two classes
    Lookup,  // decided by look-ahead: no boundary only if the next class that is
             // not Extend/ZWJ/Format equals the class on the left-hand side
    LookupW  // like Lookup, but HebrewLetter and ALetter are accepted as well
};

// Indexed as breakTable[class of previous code point][class of current code point].
// Rows and columns follow the order of QUnicodeTables::WordBreakClass.
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
//    Any      CR       LF       Newline  Extend   ZWJ      Format   RI       Katakana HLetter  ALetter  SQuote   DQuote   MidNumLet MidLetter MidNum  Numeric  ExtNumLet WSeg
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Any
    { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // CR
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // LF
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Newline
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Extend
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // ZWJ
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Format
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // RegionalIndicator
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak, Break   }, // Katakana
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break  , NoBreak, NoBreak, Break   }, // HebrewLetter
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak, LookupW, Break  , LookupW, LookupW, Break  , NoBreak, NoBreak, Break   }, // ALetter
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // SingleQuote
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // DoubleQuote
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNumLet
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidLetter
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNum
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak, Lookup , Break  , Lookup , Break  , Lookup , NoBreak, NoBreak, Break   }, // Numeric
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break   }, // ExtendNumLet
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak }, // WSegSpace
};

} // namespace WB
239
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
246 QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(ucs4: low)) {
255 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
256 ++i;
257 }
258 }
259
260 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
262 if (qt_initcharattributes_default_algorithm_only) {
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated like if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
268 ncls = QUnicodeTables::WordBreak_MidNumLet;
269 else if (ucs4 == 0x003A) // COLON
270 ncls = QUnicodeTables::WordBreak_MidLetter;
271 }
272
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
276 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277 && prop->graphemeBreakClass
278 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
284 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
285 // WB4: X(Extend|Format)* -> X
286 real_cls = ncls;
287 continue;
288 }
289 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290 // WB15/WB16: break between pairs of Regional indicator
291 ncls = QUnicodeTables::WordBreak_Any;
292 }
293 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(ucs4: low)) {
306 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
312 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
314 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
327 if (action != WB::NoBreak) {
328 action = WB::Break;
329 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
342 switch (cls) {
343 case QUnicodeTables::WordBreak_Katakana:
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
347 case QUnicodeTables::WordBreak_HebrewLetter:
348 case QUnicodeTables::WordBreak_ALetter:
349 case QUnicodeTables::WordBreak_Numeric:
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
366namespace SB {
367
368enum State {
369 Initial,
370 Lower,
371 Upper,
372 LUATerm,
373 ATerm,
374 ATermC,
375 ACS,
376 STerm,
377 STermC,
378 SCS,
379 BAfterC,
380 BAfter,
381 Break,
382 Lookup
383};
384
385static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
386// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
387 { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
388 { Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
389 { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
390
391 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401};
402
403} // namespace SB
404
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(ucs4: low)) {
414 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
415 ++i;
416 }
417 }
418
419 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
422 Q_ASSERT(state <= SB::BAfter);
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
425 state = SB::Break;
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(ucs4: low)) {
431 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
437 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
438 switch (tcls) {
439 case QUnicodeTables::SentenceBreak_Any:
440 case QUnicodeTables::SentenceBreak_Extend:
441 case QUnicodeTables::SentenceBreak_Sp:
442 case QUnicodeTables::SentenceBreak_Numeric:
443 case QUnicodeTables::SentenceBreak_SContinue:
444 case QUnicodeTables::SentenceBreak_Close:
445 continue;
446 case QUnicodeTables::SentenceBreak_Lower:
447 i = lookahead;
448 state = SB::Initial;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
458 state = SB::breakTable[SB::Initial][ncls];
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
namespace LB {

namespace NS { // Number Sequence

// LB25 recommends to not break lines inside numbers of the form
// described by the following regular expression:
//  (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?

// Result of feeding one more class into the number sequence matcher;
// interpreted by getLineBreaks().
enum Action {
    None,     // not inside a number sequence
    Start,    // the current character starts a (potential) number sequence
    Continue, // the current character extends the running sequence
    Break     // the running sequence ended in front of the current character
};

// Line break classes collapsed into the groups that appear in the
// expression above; XX stands for everything else.
enum Class {
    XX,
    PRPO,
    OPHY,
    NU,
    SYIS,
    CLCP
};

// Indexed as actionTable[group of previous character][group of current character].
static const uchar actionTable[CLCP + 1][CLCP + 1] = {
//     XX       PRPO      OPHY       NU       SYIS      CLCP
    { None    , Start   , Start   , Start   , None    , None     }, // XX
    { None    , Start   , Continue, Continue, None    , None     }, // PRPO
    { None    , Start   , Start   , Continue, None    , None     }, // OPHY
    { Break   , Break   , Break   , Continue, Continue, Continue }, // NU
    { Break   , Break   , Break   , Continue, Continue, Continue }, // SYIS
    { Break   , Continue, Break   , Break   , Break   , Break    }, // CLCP
};

// Maps a line break class (plus the general category of the character) onto
// the number sequence group used to index actionTable.
inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
{
    switch (lbc) {
    case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
        // resolve AI math symbols in numerical context to IS
        // (only the AL case label is active; math symbols of class AL end up in SYIS)
        if (category == QChar::Symbol_Math)
            return SYIS;
        break;
    case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
        return PRPO;
    case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
        return OPHY;
    case QUnicodeTables::LineBreak_NU:
        return NU;
    case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
        return SYIS;
    case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
        return CLCP;
    default:
        break;
    }
    return XX;
}

} // namespace NS

/* In order to support the tailored implementation of LB25 properly
   the following changes were made in the pair table to allow breaks
   where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
   (CL)(PO) from IB to DB
   (CP)(PO) from IB to DB
   (CL)(PR) from IB to DB
   (CP)(PR) from IB to DB
   (PO)(OP) from IB to DB
   (PR)(OP) from IB to DB
   (IS)(NU) from IB to DB
   (SY)(NU) from IB to DB
*/

/* In order to implement LB21a properly a special rule HH has been introduced and
   the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
   (HL)(HY|BA) from IB to CI
   (HY|BA)(!CB) from DB to HH
*/

// Verdicts stored in the line break pair table; the two-letter aliases keep
// the table compact. getLineBreaks() interprets them as follows:
//   PB - never a break opportunity
//   DB - always a break opportunity
//   IB - break opportunity only when the previous character was SP
//   CI - like IB, but without SP in between the current character is skipped
//        and the class on the left-hand side is kept
//   CP - never a break opportunity; without SP in between the class on the
//        left-hand side is kept
//   HH - break opportunity unless the previous character was HL
//   IN - like IB, but an East Asian width of F, W or H on the current
//        character always allows the break
enum Action {
    ProhibitedBreak, PB = ProhibitedBreak,
    DirectBreak, DB = DirectBreak,
    IndirectBreak, IB = IndirectBreak,
    CombiningIndirectBreak, CI = CombiningIndirectBreak,
    CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
    ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
    IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
};

// Indexed as breakTable[class before the boundary][class after the boundary].
// Only the line break classes numerically below LineBreak_ZWJ have a row and
// a column. In the column header, CP and IN name line break classes, not the
// Action aliases of the same spelling.
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
/*         OP  CL  CP  QU  GL  NS  EX  SY  IS  PR  PO  NU  AL  HL  ID  IN  HY  BA  BB  B2  ZW  CM  WJ  H2  H3  JL  JV  JT  RI  CB  EB  EM*/
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* NU */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* AL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* HL */ { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
};

// The following line break classes are not treated by the pair table
// and must be resolved outside:
//  AI, BK, CB, CJ, CR, LF, NL, ZWJ, SA, SG, SP, XX

} // namespace LB
603
604static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
605{
606 qsizetype nestart = 0;
607 LB::NS::Class nelast = LB::NS::XX;
608
609 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
610 QUnicodeTables::LineBreakClass cls = lcls;
611 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(ucs4: U'\n');
612
613 for (qsizetype i = 0; i != len; ++i) {
614 qsizetype pos = i;
615 char32_t ucs4 = string[i];
616 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
617 ushort low = string[i + 1];
618 if (QChar::isLowSurrogate(ucs4: low)) {
619 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
620 ++i;
621 }
622 }
623
624 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
625 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
626 QUnicodeTables::LineBreakClass tcls;
627
628 if (options & QUnicodeTools::HangulLineBreakTailoring) {
629 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
630 && ncls <= QUnicodeTables::LineBreak_JT)
631 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
632 ) {
633 // LB27: use SPACE for line breaking
634 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
635 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
636 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
637 ncls = QUnicodeTables::LineBreak_AL;
638 } else {
639 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
640 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
641 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
642 if (FLAG(prop->category) & test)
643 ncls = QUnicodeTables::LineBreak_CM;
644 }
645 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
646 // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
647 if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
648 ncls = QUnicodeTables::LineBreak_AL;
649 }
650 }
651 }
652
653 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
654 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
655 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
656 if (FLAG(prop->category) & test)
657 ncls = QUnicodeTables::LineBreak_CM;
658 }
659
660 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
661 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
662 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
663 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
664 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
665 cls = QUnicodeTables::LineBreak_AL;
666 goto next_no_cls_update;
667 }
668 goto next;
669 }
670
671 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
672 if (ncls > QUnicodeTables::LineBreak_SP)
673 goto next; // LB6: x(BK|CR|LF|NL)
674 goto next_no_cls_update; // LB7: xSP
675 }
676
677 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
678 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
679 if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
680 // don't update anything
681 goto next_no_cls_update;
682 }
683
684 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
685 // LB8a: ZWJ x
686 goto next;
687 }
688
689 // LB25: do not break lines inside numbers
690 {
691 LB::NS::Class necur = LB::NS::toClass(lbc: ncls, category: (QChar::Category)prop->category);
692 switch (LB::NS::actionTable[nelast][necur]) {
693 case LB::NS::Break:
694 // do not change breaks before and after the expression
695 for (qsizetype j = nestart + 1; j < pos; ++j)
696 attributes[j].lineBreak = false;
697 Q_FALLTHROUGH();
698 case LB::NS::None:
699 nelast = LB::NS::XX; // reset state
700 break;
701 case LB::NS::Start:
702 nestart = i;
703 Q_FALLTHROUGH();
704 default:
705 nelast = necur;
706 break;
707 }
708 }
709
710 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
711 // LB30a
712 ncls = QUnicodeTables::LineBreak_SP;
713 goto next;
714 }
715
716 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
717 && lastProp->category == QChar::Other_NotAssigned
718 && lastProp->graphemeBreakClass
719 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
720 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
721 goto next;
722 }
723
724 // for South East Asian chars that require a complex analysis, the Unicode
725 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
726 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
727 cls = QUnicodeTables::LineBreak_AL;
728
729 tcls = cls;
730 if (tcls == QUnicodeTables::LineBreak_CM || tcls == QUnicodeTables::LineBreak_ZWJ)
731 // LB10
732 tcls = QUnicodeTables::LineBreak_AL;
733 switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
734 case LB::DirectBreak:
735 attributes[pos].lineBreak = true;
736 break;
737 case LB::IndirectBreak:
738 if (lcls == QUnicodeTables::LineBreak_SP)
739 attributes[pos].lineBreak = true;
740 break;
741 case LB::CombiningIndirectBreak:
742 if (lcls != QUnicodeTables::LineBreak_SP)
743 goto next_no_cls_update;
744 attributes[pos].lineBreak = true;
745 break;
746 case LB::CombiningProhibitedBreak:
747 if (lcls != QUnicodeTables::LineBreak_SP)
748 goto next_no_cls_update;
749 break;
750 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
751 if (lcls != QUnicodeTables::LineBreak_HL)
752 attributes[pos].lineBreak = true;
753 break;
754 case LB::IndirectBreakIfNarrow:
755 switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
756 default:
757 if (lcls != QUnicodeTables::LineBreak_SP)
758 break;
759 Q_FALLTHROUGH();
760 case QUnicodeTables::EastAsianWidth::F:
761 case QUnicodeTables::EastAsianWidth::W:
762 case QUnicodeTables::EastAsianWidth::H:
763 attributes[pos].lineBreak = true;
764 break;
765 }
766 break;
767 case LB::ProhibitedBreak:
768 // nothing to do
769 default:
770 break;
771 }
772
773 next:
774 cls = ncls;
775 lastProp = prop;
776 next_no_cls_update:
777 lcls = ncls;
778 }
779
780 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
781 // LB25: do not break lines inside numbers
782 for (qsizetype j = nestart + 1; j < len; ++j)
783 attributes[j].lineBreak = false;
784 }
785
786 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
787 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
788}
789
790
791static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
792{
793 for (qsizetype i = 0; i != len; ++i) {
794 uint ucs4 = string[i];
795 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
796 ushort low = string[i + 1];
797 if (QChar::isLowSurrogate(ucs4: low)) {
798 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
799 ++i;
800 }
801 }
802
803 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
804 attributes[i].whiteSpace = true;
805 }
806}
807
808namespace Tailored {
809
// Signature of a per-script attribute function. In the implementations visible
// in this file (indicAttributes(), thaiAttributes(), tibetanAttributes()) the
// run to process is text[from .. from + len) and attributes is indexed in
// parallel with text.
using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
811
812
// Classification of a code unit for the Indic syllable scanner. The
// enumerators are stored as unsigned char entries in indicForms and cast back
// to Form by form().
enum Form {
    Invalid          = 0x0,
    UnknownForm      = Invalid,   // alias of Invalid
    Consonant        = 1,
    Nukta            = 2,
    Halant           = 3,
    Matra            = 4,
    VowelMark        = 5,
    StressMark       = 6,
    IndependentVowel = 7,
    LengthMark       = 8,
    Control          = 9,
    Other            = 10
};
827
// Form of every code point in U+0900..U+0DFF, indexed by (code point - 0x900);
// see form(). The table consists of ten consecutive script sections of 128
// entries each, in code point order.
static const unsigned char indicForms[0xe00-0x900] = {
    // Devanagari
    Invalid, VowelMark, VowelMark, VowelMark,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,

    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, StressMark, StressMark, StressMark,
    StressMark, UnknownForm, UnknownForm, UnknownForm,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Consonant,
    Consonant, Consonant /* ??? */, Consonant, Consonant,

    // Bengali
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Invalid,
    Invalid, Invalid, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, Consonant, UnknownForm,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, VowelMark,
    Invalid, Invalid, Invalid, Invalid,
    Consonant, Consonant, Invalid, Consonant,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Consonant, Consonant, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Gurmukhi
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Invalid,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Invalid,
    Invalid, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, UnknownForm, UnknownForm, UnknownForm,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Invalid,

    Other, Other, Invalid, Invalid,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    StressMark, StressMark, Consonant, Consonant,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Gujarati
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, Invalid, IndependentVowel,

    IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Matra, Invalid, Matra,
    Matra, Matra, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Oriya
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Invalid, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, Invalid, Invalid, Invalid,
    Invalid, UnknownForm, LengthMark, LengthMark,
    Invalid, Invalid, Invalid, Invalid,
    Consonant, Consonant, Invalid, Consonant,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Consonant, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Tamil
    Invalid, Invalid, VowelMark, Other,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Invalid, Invalid,
    Invalid, Consonant, Consonant, Invalid,
    Consonant, Invalid, Consonant, Consonant,

    Invalid, Invalid, Invalid, Consonant,
    Consonant, Invalid, Invalid, Invalid,
    Consonant, Consonant, Consonant, Invalid,
    Invalid, Invalid, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Invalid,
    Invalid, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, LengthMark,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Telugu
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, LengthMark, Matra, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Kannada
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, LengthMark, LengthMark, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Consonant, Invalid,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Malayalam
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Invalid, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Matra,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Sinhala
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,

    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Invalid, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Invalid, Invalid,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Invalid,
    Invalid, Invalid, Halant, Invalid,
    Invalid, Invalid, Invalid, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Invalid,
    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    Invalid, Invalid, Matra, Matra,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
};
1239
1240static inline Form form(unsigned short uc) {
1241 if (uc < 0x900 || uc > 0xdff) {
1242 if (uc == 0x25cc)
1243 return Consonant;
1244 if (uc == 0x200c || uc == 0x200d)
1245 return Control;
1246 return Other;
1247 }
1248 return (Form)indicForms[uc-0x900];
1249}
1250
// Uncomment the next line to route the IDEBUG() traces in
// indic_nextSyllableBoundary() to qDebug.
// #define INDIC_DEBUG
#ifdef INDIC_DEBUG
#define IDEBUG qDebug
#else
// With tracing disabled the qDebug call ends up in the else branch of an
// always-true `if constexpr`, so it is still parsed but never executed.
#define IDEBUG if constexpr (1) ; else qDebug
#endif
1257
1258/* syllables are of the form:
1259
1260 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1261 (Consonant Nukta? Halant)* Consonant Halant
1262 IndependentVowel VowelMark? StressMark?
1263
1264 We return syllable boundaries on invalid combinations as well
1265*/
1266static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1267{
1268 *invalid = false;
1269 IDEBUG(msg: "indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1270 const char16_t *uc = s+start;
1271
1272 qsizetype pos = 0;
1273 Form state = form(uc: uc[pos]);
1274 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1275 pos++;
1276
1277 if (state != Consonant && state != IndependentVowel) {
1278 if (state != Other)
1279 *invalid = true;
1280 goto finish;
1281 }
1282
1283 while (pos < end - start) {
1284 Form newState = form(uc: uc[pos]);
1285 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1286 switch (newState) {
1287 case Control:
1288 newState = state;
1289 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1290 break;
1291 // the control character should be the last char in the item
1292 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1293 break;
1294 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1295 break;
1296 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1297 ++pos;
1298 goto finish;
1299 case Consonant:
1300 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1301 break;
1302 goto finish;
1303 case Halant:
1304 if (state == Nukta || state == Consonant)
1305 break;
1306 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1307 if (script == QChar::Script_Bengali && pos == 1 &&
1308 (uc[0] == 0x0985 || uc[0] == 0x098f))
1309 break;
1310 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1311 if (script == QChar::Script_Sinhala && state == Matra) {
1312 ++pos;
1313 continue;
1314 }
1315 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1316 ++pos;
1317 continue;
1318 }
1319 goto finish;
1320 case Nukta:
1321 if (state == Consonant)
1322 break;
1323 goto finish;
1324 case StressMark:
1325 if (state == VowelMark)
1326 break;
1327 Q_FALLTHROUGH();
1328 case VowelMark:
1329 if (state == Matra || state == LengthMark || state == IndependentVowel)
1330 break;
1331 Q_FALLTHROUGH();
1332 case Matra:
1333 if (state == Consonant || state == Nukta)
1334 break;
1335 if (state == Matra) {
1336 // ### needs proper testing for correct two/three part matras
1337 break;
1338 }
1339 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1340 // it work for all Indic languages?
1341 // the combination Independent_A + Vowel Sign AA is allowed.
1342 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1343 break;
1344 if (script == QChar::Script_Tamil && state == Matra) {
1345 if (uc[pos-1] == 0x0bc6 &&
1346 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1347 break;
1348 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1349 break;
1350 }
1351 goto finish;
1352
1353 case LengthMark:
1354 if (state == Matra) {
1355 // ### needs proper testing for correct two/three part matras
1356 break;
1357 }
1358 case IndependentVowel:
1359 case Invalid:
1360 case Other:
1361 goto finish;
1362 }
1363 state = newState;
1364 pos++;
1365 }
1366 finish:
1367 return pos+start;
1368}
1369
1370static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1371{
1372 qsizetype end = from + len;
1373 attributes += from;
1374 qsizetype i = 0;
1375 while (i < len) {
1376 bool invalid;
1377 qsizetype boundary = indic_nextSyllableBoundary(script, s: text, start: from+i, end, invalid: &invalid) - from;
1378 attributes[i].graphemeBoundary = true;
1379
1380 if (boundary > len-1) boundary = len;
1381 i++;
1382 while (i < boundary) {
1383 attributes[i].graphemeBoundary = false;
1384 ++i;
1385 }
1386 assert(i == boundary);
1387 }
1388
1389
1390}
1391
1392#if QT_CONFIG(library)
1393
// Version number passed to QLibrary when LibThai loads the "thai" library.
#define LIBTHAI_MAJOR 0

/*
 * Cell structure handed to th_next_cell(). It is declared locally, so if
 * libthai changes its definition this copy must be updated too.
 */
struct thcell_t {
    unsigned char base;      /**< base character */
    unsigned char hilo;      /**< upper/lower vowel/diacritic */
    unsigned char top;       /**< top-level mark */
};

// Break-state type of libthai; only ever used through pointers in this file.
using ThBrk = struct _ThBrk;
1406
1407namespace {
1408
1409class LibThai final
1410{
1411 Q_DISABLE_COPY_MOVE(LibThai)
1412
1413 using th_brk_new_def = ThBrk *(*)(const char *);
1414 using th_brk_delete_def = void (*)(ThBrk *);
1415 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1416 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1417
1418public:
1419 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1420 {
1421 m_th_brk_find_breaks =
1422 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve(symbol: "th_brk_find_breaks"));
1423 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve(symbol: "th_next_cell"));
1424
1425 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve(symbol: "th_brk_new"));
1426 if (th_brk_new) {
1427 m_state = th_brk_new(nullptr);
1428 m_th_brk_delete =
1429 reinterpret_cast<th_brk_delete_def>(m_library.resolve(symbol: "th_brk_delete"));
1430 }
1431 }
1432
1433 ~LibThai()
1434 {
1435 if (m_state && m_th_brk_delete)
1436 m_th_brk_delete(m_state);
1437 m_library.unload();
1438 }
1439
1440 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1441
1442 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1443 {
1444 Q_ASSERT(m_state);
1445 Q_ASSERT(m_th_brk_find_breaks);
1446 return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1447 }
1448
1449 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1450 {
1451 Q_ASSERT(m_th_next_cell);
1452 return m_th_next_cell(s, len, cell, is_decomp_am);
1453 }
1454
1455private:
1456 QLibrary m_library;
1457
1458 // Global state for th_brk_find_breaks().
1459 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1460 // state is read-only, and so it is safe to use it from multiple threads after
1461 // initialization. This is also stated in the libthai documentation.
1462 ThBrk *m_state = nullptr;
1463
1464 th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1465 th_next_cell_def m_th_next_cell = nullptr;
1466 th_brk_delete_def m_th_brk_delete = nullptr;
1467};
1468
1469} // unnamed namespace
1470
// Shared LibThai instance. thaiAssignAttributes() checks both the pointer and
// isInitialized() before using it.
Q_GLOBAL_STATIC(LibThai, g_libThai)
1472
1473static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1474{
1475 qsizetype i;
1476 unsigned char *result = reinterpret_cast<unsigned char *>(cstr);
1477
1478 for (i = 0; i < len; ++i) {
1479 if (string[i] <= 0xa0)
1480 result[i] = static_cast<unsigned char>(string[i]);
1481 else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1482 result[i] = static_cast<unsigned char>(string[i] - 0xe00 + 0xa0);
1483 else
1484 result[i] = static_cast<unsigned char>(~0); // Same encoding as libthai uses for invalid chars
1485 }
1486
1487 result[len] = 0;
1488}
1489
1490/*
1491 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1492 */
1493static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1494{
1495 constexpr qsizetype Prealloc = 128;
1496 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1497 QVarLengthArray<int, Prealloc> break_positions(len);
1498 qsizetype numbreaks, i;
1499 struct thcell_t tis_cell;
1500
1501 LibThai *libThai = g_libThai;
1502 if (!libThai || !libThai->isInitialized())
1503 return;
1504
1505 to_tis620(string, len, cstr: s.data());
1506
1507 for (i = 0; i < len; ++i) {
1508 attributes[i].wordBreak = false;
1509 attributes[i].wordStart = false;
1510 attributes[i].wordEnd = false;
1511 attributes[i].lineBreak = false;
1512 }
1513
1514 attributes[0].wordBreak = true;
1515 attributes[0].wordStart = true;
1516 attributes[0].wordEnd = false;
1517 numbreaks = libThai->brk_find_breaks(s: reinterpret_cast<const unsigned char *>(s.data()),
1518 pos: break_positions.data(),
1519 pos_sz: static_cast<size_t>(break_positions.size()));
1520 for (i = 0; i < numbreaks; ++i) {
1521 attributes[break_positions[i]].wordBreak = true;
1522 attributes[break_positions[i]].wordStart = true;
1523 attributes[break_positions[i]].wordEnd = true;
1524 attributes[break_positions[i]].lineBreak = true;
1525 }
1526 if (numbreaks > 0)
1527 attributes[break_positions[numbreaks - 1]].wordStart = false;
1528
1529 /* manage grapheme boundaries */
1530 i = 0;
1531 while (i < len) {
1532 size_t cell_length =
1533 libThai->next_cell(s: reinterpret_cast<const unsigned char *>(s.data()) + i,
1534 len: size_t(len - i), cell: &tis_cell, is_decomp_am: true);
1535
1536 attributes[i].graphemeBoundary = true;
1537 for (size_t j = 1; j < cell_length; ++j)
1538 attributes[i + j].graphemeBoundary = false;
1539
1540 i += cell_length;
1541 }
1542}
1543
1544#endif // QT_CONFIG(library)
1545
1546static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1547{
1548 assert(script == QChar::Script_Thai);
1549#if QT_CONFIG(library)
1550 const char16_t *uc = text + from;
1551 attributes += from;
1552 Q_UNUSED(script);
1553 thaiAssignAttributes(string: uc, len, attributes);
1554#else
1555 Q_UNUSED(script);
1556 Q_UNUSED(text);
1557 Q_UNUSED(from);
1558 Q_UNUSED(len);
1559 Q_UNUSED(attributes);
1560#endif
1561}
1562
1563/*
1564 tibetan syllables are of the form:
1565 head position consonant
1566 first sub-joined consonant
1567 ....intermediate sub-joined consonants (if any)
1568 last sub-joined consonant
1569 sub-joined vowel (a-chung U+0F71)
1570 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1571*/
1572
// Classification of a code unit for the Tibetan syllable scanner; the
// enumerators are stored as unsigned char entries in tibetanForm.
enum TibetanForm {
    TibetanOther              = 0,
    TibetanHeadConsonant      = 1,
    TibetanSubjoinedConsonant = 2,
    TibetanSubjoinedVowel     = 3,
    TibetanVowel              = 4
};
1580
/* TibetanForm of each code point in U+0F40..U+0FBF; this table starts at
   U+0f40 and is indexed by (code point - 0x0f40), see tibetan_form(). */
static const unsigned char tibetanForm[0x80] = {
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,

    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
};
1623
1624#define tibetan_form(c) \
1625 ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1626
1627static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1628{
1629 const char16_t *uc = s + start;
1630
1631 qsizetype pos = 0;
1632 TibetanForm state = tibetan_form(*uc);
1633
1634/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1635 pos++;
1636
1637 if (state != TibetanHeadConsonant) {
1638 if (state != TibetanOther)
1639 *invalid = true;
1640 goto finish;
1641 }
1642
1643 while (pos < end - start) {
1644 TibetanForm newState = tibetan_form(uc[pos]);
1645 switch (newState) {
1646 case TibetanSubjoinedConsonant:
1647 case TibetanSubjoinedVowel:
1648 if (state != TibetanHeadConsonant &&
1649 state != TibetanSubjoinedConsonant)
1650 goto finish;
1651 state = newState;
1652 break;
1653 case TibetanVowel:
1654 if (state != TibetanHeadConsonant &&
1655 state != TibetanSubjoinedConsonant &&
1656 state != TibetanSubjoinedVowel)
1657 goto finish;
1658 break;
1659 case TibetanOther:
1660 case TibetanHeadConsonant:
1661 goto finish;
1662 }
1663 pos++;
1664 }
1665
1666finish:
1667 *invalid = false;
1668 return start+pos;
1669}
1670
1671static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1672{
1673 qsizetype end = from + len;
1674 qsizetype i = 0;
1675 Q_UNUSED(script);
1676 attributes += from;
1677 while (i < len) {
1678 bool invalid;
1679 qsizetype boundary = tibetan_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
1680
1681 attributes[i].graphemeBoundary = true;
1682
1683 if (boundary > len-1) boundary = len;
1684 i++;
1685 while (i < boundary) {
1686 attributes[i].graphemeBoundary = false;
1687 ++i;
1688 }
1689 assert(i == boundary);
1690 }
1691}
1692
/*
// Character classes of the Myanmar syllable state machine.
// The numeric value of a class selects the column of mymrStateTable, so the
// values have to stay in the column order of that table.
*/
enum MymrCharClassValues {
    Mymr_CC_RESERVED = 0,
    Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
    Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
    Mymr_CC_NGA = 3, /* Consonant NGA */
    Mymr_CC_YA = 4, /* Consonant YA */
    Mymr_CC_RA = 5, /* Consonant RA */
    Mymr_CC_WA = 6, /* Consonant WA */
    Mymr_CC_HA = 7, /* Consonant HA */
    Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
    Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
    Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
    Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, before the base (Vowel e) */
    Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, below the base (Vowel u, uu) */
    Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, above the base (Vowel i, ii, ai) */
    Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, after the base (Vowel aa) */
    Mymr_CC_SIGN_ABOVE = 15, /* Sign placed above the base */
    Mymr_CC_SIGN_BELOW = 16, /* Sign placed below the base */
    Mymr_CC_SIGN_AFTER = 17, /* Sign placed after the syllable */
    Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
    Mymr_CC_COUNT = 19 /* This is the number of character classes */
};
1715
/*
// Flag bits that are ORed onto a MymrCharClassValues value.
// The class value lives in the low 16 bits (Mymr_CF_CLASS_MASK), the position
// flags in bits 16-19, Mymr_CF_AFTER_KINZI in bit 20 and the remaining flags
// in bits 24-29.
*/
enum MymrCharClassFlags {
    Mymr_CF_CLASS_MASK = 0xFFFF,

    /* position flags */
    Mymr_CF_POS_AFTER = 1 << 16,
    Mymr_CF_POS_ABOVE = 1 << 17,
    Mymr_CF_POS_BELOW = 1 << 18,
    Mymr_CF_POS_BEFORE = 1 << 19,
    Mymr_CF_POS_MASK = Mymr_CF_POS_AFTER | Mymr_CF_POS_ABOVE | Mymr_CF_POS_BELOW | Mymr_CF_POS_BEFORE,

    Mymr_CF_AFTER_KINZI = 1 << 20,

    /* flags to speed up comparing */
    Mymr_CF_CONSONANT = 1 << 24,
    Mymr_CF_MEDIAL = 1 << 25,
    Mymr_CF_IND_VOWEL = 1 << 26,
    Mymr_CF_DEP_VOWEL = 1 << 27,
    Mymr_CF_VIRAMA = 1 << 29,

    /* add a dotted circle if a character with this flag is the first in a syllable */
    Mymr_CF_DOTTED_CIRCLE = 1 << 28
};
1736
// The Mymr_xx ... Mymr_sp constants below OR MymrCharClassValues and
// MymrCharClassFlags enumerators together.
// NOTE(review): presumably this macro declares the int-returning operators
// needed for mixing the two enum types - confirm against its definition.
Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
1738
/* Characters that get referred to by name */
enum MymrChar
{
    Mymr_C_SIGN_ZWNJ = 0x200C, /* zero width non-joiner */
    Mymr_C_SIGN_ZWJ = 0x200D, /* zero width joiner */
    Mymr_C_DOTTED_CIRCLE = 0x25CC, /* dotted circle */
    Mymr_C_RA = 0x101B, /* Myanmar letter RA */
    Mymr_C_YA = 0x101A, /* Myanmar letter YA */
    Mymr_C_NGA = 0x1004, /* Myanmar letter NGA */
    Mymr_C_VOWEL_E = 0x1031, /* Myanmar vowel sign E */
    Mymr_C_VIRAMA = 0x1039 /* Myanmar sign VIRAMA */
};
1751
/*
// Shorthands used to fill mymrCharClasses: each one is a MymrCharClassValues
// class ORed with the MymrCharClassFlags that apply to it. The two-letter
// suffixes are the same abbreviations that label the columns of mymrStateTable.
*/
enum
{
    Mymr_xx = Mymr_CC_RESERVED,
    Mymr_c1 = Mymr_CC_CONSONANT | Mymr_CF_CONSONANT | Mymr_CF_POS_BELOW,
    Mymr_c2 = Mymr_CC_CONSONANT2 | Mymr_CF_CONSONANT,
    Mymr_ng = Mymr_CC_NGA | Mymr_CF_CONSONANT | Mymr_CF_POS_ABOVE,
    Mymr_ya = Mymr_CC_YA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_AFTER | Mymr_CF_AFTER_KINZI,
    Mymr_ra = Mymr_CC_RA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BEFORE,
    Mymr_wa = Mymr_CC_WA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
    Mymr_ha = Mymr_CC_HA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
    Mymr_id = Mymr_CC_IND_VOWEL | Mymr_CF_IND_VOWEL,
    Mymr_vi = Mymr_CC_VIRAMA | Mymr_CF_VIRAMA | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE,
    Mymr_dl = Mymr_CC_PRE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BEFORE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_db = Mymr_CC_BELOW_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_da = Mymr_CC_ABOVE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_dr = Mymr_CC_POST_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_sa = Mymr_CC_SIGN_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_ABOVE | Mymr_CF_AFTER_KINZI,
    Mymr_sb = Mymr_CC_SIGN_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_BELOW | Mymr_CF_AFTER_KINZI,
    Mymr_sp = Mymr_CC_SIGN_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI
};
1772
1773
/* A Myanmar character class value ORed with character class flags. */
using MymrCharClass = int;
1775
1776
/*
// Character class of every code point in 0x1000 - 0x105F, indexed by
// (code point - 0x1000). Each pair of lines covers 16 code points; the range
// is given at the end of the second line.
*/
static const MymrCharClass mymrCharClasses[] =
{
    Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
    Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
    Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
    Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
    Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
    Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
    Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
    Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
};
1792
1793static MymrCharClass
1794getMyanmarCharClass (ushort ch)
1795{
1796 if (ch == Mymr_C_SIGN_ZWJ)
1797 return Mymr_CC_ZERO_WIDTH_J_MARK;
1798
1799 if (ch == Mymr_C_SIGN_ZWNJ)
1800 return Mymr_CC_ZERO_WIDTH_NJ_MARK;
1801
1802 if (ch < 0x1000 || ch > 0x105f)
1803 return Mymr_CC_RESERVED;
1804
1805 return mymrCharClasses[ch - 0x1000];
1806}
1807
/*
// State machine that delimits a Myanmar syllable.
// Rows are states, columns are character classes (the MymrCharClassValues
// value of the character, in the order of the header line below). A cell holds
// the state to go to. A negative cell stops the scan:
//   -1  the syllable ends in front of the current character
//   -2  as -1, but the scanner additionally steps back by one character
*/
static const signed char mymrStateTable[][Mymr_CC_COUNT] =
{
/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
    { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
    {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
    {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
    {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
    {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
    {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
    {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
    {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
/* exit state -2 is for an invalid order of medials and for combinations of
   invalid characters with a virama, where the virama should be treated as the
   start of the next syllable
 */
};
1843
/*#define MYANMAR_DEBUG */
/*
// MMDEBUG(...) traces the Myanmar state machine. With MYANMAR_DEBUG defined it
// is qDebug; otherwise it expands to "if (0) printf", so the statement is
// still compiled but never executed.
*/
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
# define MMDEBUG \
    if (0) \
        printf
#endif
1852
1853/*
1854// Given an input string of characters and a location in which to start looking
1855// calculate, using the state table, which one is the last character of the syllable
1856// that starts in the starting position.
1857*/
1858static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1859{
1860 const char16_t *uc = s + start;
1861 int state = 0;
1862 qsizetype pos = start;
1863 *invalid = false;
1864
1865 while (pos < end) {
1866 MymrCharClass charClass = getMyanmarCharClass(ch: *uc);
1867 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
1868 if (pos == start)
1869 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
1870
1871 MMDEBUG(format: "state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
1872
1873 if (state < 0) {
1874 if (state < -1)
1875 --pos;
1876 break;
1877 }
1878 ++uc;
1879 ++pos;
1880 }
1881 return pos;
1882}
1883
1884static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1885{
1886 qsizetype end = from + len;
1887 qsizetype i = 0;
1888 Q_UNUSED(script);
1889 attributes += from;
1890 while (i < len) {
1891 bool invalid;
1892 qsizetype boundary = myanmar_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
1893
1894 attributes[i].graphemeBoundary = true;
1895 attributes[i].lineBreak = true;
1896
1897 if (boundary > len-1)
1898 boundary = len;
1899 i++;
1900 while (i < boundary) {
1901 attributes[i].graphemeBoundary = false;
1902 ++i;
1903 }
1904 assert(i == boundary);
1905 }
1906}
1907
1908/*
1909// Vocabulary
1910// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1911// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1912// split vowels, signs... but there is only one base in a syllable, it has to be coded as
1913// the first character of the syllable.
1914// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1915// Khmer language has five of them. Khmer split vowels either have one part before the
1916// base and one after the base or they have a part before the base and a part above the base.
1917// The first part of all Khmer split vowels is the same character, identical to
1918// the glyph of Khmer dependent vowel SRA EI
1919// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1920// Differently than indian languages, the coeng modifies the consonant that follows it,
1921// not the one preceding it Each consonant has two forms, the base form and the subscript form
1922// the base form is the normal one (using the consonants code-point), the subscript form is
1923// displayed when the combination coeng + consonant is encountered.
// Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant
1925// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
1926// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
//     if it is attached to a consonant of the first series or a consonant of the second series.
//     Most consonants have an equivalent in the other series, but some of them exist only in
//     one series (for example SA). If we want to use the consonant SA with a vowel sound that
//     can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
//     of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
//     (x17C9 and x17CA). TRIISAP changes a first series consonant to second series sound and
//     MUSIKATOAN a second series consonant to have a first series vowel sound.
//     Consonant shifters are both normally superscript marks, but, when they are followed by a
//     superscript, they change shape and take the form of subscript dependent vowel SRA U.
//     If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
//     should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
//     be placed after the coeng consonant.
1940// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
1941// Each vowel has its own position. Only one vowel per syllable is allowed.
1942// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
1943// Allowed in a syllable.
1944//
1945//
1946// order is important here! This order must be the same that is found in each horizontal
1947// line in the statetable for Khmer (see khmerStateTable) .
1948*/
/* The numeric value of a class selects the column of khmerStateTable. */
enum KhmerCharClassValues {
    CC_RESERVED = 0, /* Character that does not combine into a syllable */
    CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
    CC_CONSONANT2 = 2, /* Consonant of type 2 */
    CC_CONSONANT3 = 3, /* Consonant of type 3 */
    CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
    CC_CONSONANT_SHIFTER = 5, /* Consonant shifter */
    CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
    CC_COENG = 7, /* Subscript consonant combining character */
    CC_DEPENDENT_VOWEL = 8, /* Any dependent vowel, split vowels included */
    CC_SIGN_ABOVE = 9, /* Sign placed above the base */
    CC_SIGN_AFTER = 10, /* Sign placed after the base */
    CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
    CC_COUNT = 12 /* This is the number of character classes */
};
1964
1965
/*
// Flag bits that are ORed onto a KhmerCharClassValues value.
// The class value lives in the low 16 bits (CF_CLASS_MASK), the position flags
// in bits 16-19 and the remaining flags in bits 24-29.
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK = 0xFFFF,

    /* position flags */
    CF_POS_AFTER = 1 << 16,
    CF_POS_ABOVE = 1 << 17,
    CF_POS_BELOW = 1 << 18,
    CF_POS_BEFORE = 1 << 19,
    CF_POS_MASK = CF_POS_AFTER | CF_POS_ABOVE | CF_POS_BELOW | CF_POS_BEFORE,

    /* flags to speed up comparing */
    CF_CONSONANT = 1 << 24,
    CF_COENG = 1 << 27,
    CF_SHIFTER = 1 << 28,
    CF_ABOVE_VOWEL = 1 << 29,

    /* flag for a split vowel -> the first part is added in front of the syllable */
    CF_SPLIT_VOWEL = 1 << 25,

    /* add a dotted circle if a character with this flag is the first in a syllable */
    CF_DOTTED_CIRCLE = 1 << 26
};
1983
// The _xx ... _vr constants below OR KhmerCharClassValues and
// KhmerCharClassFlags enumerators together.
// NOTE(review): presumably this macro declares the int-returning operators
// needed for mixing the two enum types - confirm against its definition.
Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
1985
/* Characters that get referred to by name */
enum KhmerChar {
    C_SIGN_ZWNJ = 0x200C, /* zero width non-joiner */
    C_SIGN_ZWJ = 0x200D, /* zero width joiner */
    C_RO = 0x179A, /* Khmer letter RO */
    C_VOWEL_AA = 0x17B6, /* Khmer vowel sign AA */
    C_SIGN_NIKAHIT = 0x17C6, /* Khmer sign NIKAHIT */
    C_VOWEL_E = 0x17C1, /* Khmer vowel sign E */
    C_COENG = 0x17D2 /* Khmer sign COENG */
};
1996
1997
1998/*
1999// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2000// they are also used to know where a character should be placed (location in reference to the base character)
2001// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2002// indicate error in syllable construction
2003*/
/*
// Each shorthand is a KhmerCharClassValues class ORed with the
// KhmerCharClassFlags that apply to it. The four dependent vowel shorthands
// (_dl, _db, _da, _dr) share CC_DEPENDENT_VOWEL and differ in their position
// flag; _da also carries CF_ABOVE_VOWEL.
*/
enum {
    _xx = CC_RESERVED,
    _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE,
    _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER,
    _c1 = CC_CONSONANT | CF_CONSONANT,
    _c2 = CC_CONSONANT2 | CF_CONSONANT,
    _c3 = CC_CONSONANT3 | CF_CONSONANT,
    _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE,
    _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER,
    _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE,
    _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE,
    _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL,
    _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE,
    _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE,

    /* split vowel: same as the plain vowel plus CF_SPLIT_VOWEL */
    _va = _da | CF_SPLIT_VOWEL,
    _vr = _dr | CF_SPLIT_VOWEL
};
2023
2024
/*
// A Khmer character class: a character class value ORed with any number of
// character class flags.
*/
using KhmerCharClass = unsigned long;
2030
2031
/*
// Character class table, one entry per code point of the range given at the
// end of each line.
// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
// _sa Sign placed above the base
// _sp Sign placed after the base
// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
// _c2 Consonant of type 2 (only RO)
// _c3 Consonant of type 3
// _rb Khmer sign robat u17CC. combining mark for subscript consonants
// _cs Consonant-shifter
// _dl Dependent vowel placed before the base (left of the base)
// _db Dependent vowel placed below the base
// _da Dependent vowel placed above the base
// _dr Dependent vowel placed behind the base (right of the base)
// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
//     it to create a subscript consonant or independent vowel
// _va Khmer split vowel in which the first part is before the base and the second one above the base
// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
*/
static const KhmerCharClass khmerCharClasses[] = {
    _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
    _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
    _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
    _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
    _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
    _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
};
2059
/*
// this enum must reflect the range of khmerCharClasses: the table is indexed
// with (code point - KhmerFirstChar) for code points up to and including
// KhmerLastChar
*/
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780,
    KhmerLastChar = 0x17df
};
2065
2066/*
2067// Below we define how a character in the input string is either in the khmerCharClasses table
2068// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2069// within the syllable, but are not in the table) we also get their type back, or an unknown object
2070// in which case we get _xx (CC_RESERVED) back
2071*/
2072static KhmerCharClass getKhmerCharClass(ushort uc)
2073{
2074 if (uc == C_SIGN_ZWJ) {
2075 return CC_ZERO_WIDTH_J_MARK;
2076 }
2077
2078 if (uc == C_SIGN_ZWNJ) {
2079 return CC_ZERO_WIDTH_NJ_MARK;
2080 }
2081
2082 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2083 return CC_RESERVED;
2084 }
2085
2086 return khmerCharClasses[uc - KhmerFirstChar];
2087}
2088
2089
2090/*
2091// The stateTable is used to calculate the end (the length) of a well
2092// formed Khmer Syllable.
2093//
2094// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2095// CharClassValues. This coincidence of values allows the follow up of the table.
2096//
2097// Each line corresponds to a state, which does not necessarily need to be a type
2098// of component... for example, state 2 is a base, with is always a first character
2099// in the syllable, but the state could be produced a consonant of any type when
2100// it is the first character that is analysed (in ground state).
2101//
2102// Differentiating 3 types of consonants is necessary in order to
2103// forbid the use of certain combinations, such as having a second
2104// coeng after a coeng RO,
2105// The inexistent possibility of having a type 3 after another type 3 is permitted,
2106// eliminating it would very much complicate the table, and it does not create typing
2107// problems, as the case above.
2108//
2109// The table is quite complex, in order to limit the number of coeng consonants
2110// to 2 (by means of the table).
2111//
2112// There a peculiarity, as far as Unicode is concerned:
2113// - The consonant-shifter is considered in two possible different
2114// locations, the one considered in Unicode 3.0 and the one considered in
2115// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2116//
2117//
2118// xx independent character, such as a number, punctuation sign or non-khmer char
2119//
2120// c1 Khmer consonant of type 1 or an independent vowel
2121// that is, a letter in which the subscript for is only under the
2122// base, not taking any space to the right or to the left
2123//
2124// c2 Khmer consonant of type 2, the coeng form takes space under
2125// and to the left of the base (only RO is of this type)
2126//
2127// c3 Khmer consonant of type 3. Its subscript form takes space under
2128// and to the right of the base.
2129//
2130// cs Khmer consonant shifter
2131//
2132// rb Khmer robat
2133//
2134// co coeng character (u17D2)
2135//
2136// dv dependent vowel (including split vowels, they are treated in the same way).
2137// even if dv is not defined above, the component that is really tested for is
2138// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2139//
2140// zwj Zero Width joiner
2141//
2142// zwnj Zero width non joiner
2143//
2144// sa above sign
2145//
2146// sp post sign
2147//
2148// there are lines with equal content but for an easier understanding
2149// (and maybe change in the future) we did not join them
2150*/
/*
// Rows are states, columns are indexed by the character class value of the
// current character. A cell holds the state to go to; a negative cell stops
// the scan in front of the current character.
*/
static const signed char khmerStateTable[][CC_COUNT] =
{
    /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
    { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
    {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
    {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */
    {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
    {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
    {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
    {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
    {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
    {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
    {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
    {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
    {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
    {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
    {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
    {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
    {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
};
2176
2177
/* #define KHMER_DEBUG */
/*
// KHDEBUG(...) traces the Khmer state machine. With KHMER_DEBUG defined it is
// qDebug; otherwise it expands to "if (0) printf", so the statement is still
// compiled but never executed.
*/
#ifdef KHMER_DEBUG
#define KHDEBUG qDebug
#else
# define KHDEBUG \
    if (0) \
        printf
#endif
2186
2187/*
2188// Given an input string of characters and a location in which to start looking
2189// calculate, using the state table, which one is the last character of the syllable
2190// that starts in the starting position.
2191*/
2192static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2193{
2194 const char16_t *uc = s + start;
2195 int state = 0;
2196 qsizetype pos = start;
2197 *invalid = false;
2198
2199 while (pos < end) {
2200 KhmerCharClass charClass = getKhmerCharClass(uc: *uc);
2201 if (pos == start) {
2202 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2203 }
2204 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2205
2206 KHDEBUG(format: "state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2207 charClass, *uc );
2208
2209 if (state < 0) {
2210 break;
2211 }
2212 ++uc;
2213 ++pos;
2214 }
2215 return pos;
2216}
2217
2218static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2219{
2220 qsizetype end = from + len;
2221 qsizetype i = 0;
2222 Q_UNUSED(script);
2223 attributes += from;
2224 while ( i < len ) {
2225 bool invalid;
2226 qsizetype boundary = khmer_nextSyllableBoundary( s: text, start: from+i, end, invalid: &invalid ) - from;
2227
2228 attributes[i].graphemeBoundary = true;
2229
2230 if ( boundary > len-1 ) boundary = len;
2231 i++;
2232 while ( i < boundary ) {
2233 attributes[i].graphemeBoundary = false;
2234 ++i;
2235 }
2236 assert( i == boundary );
2237 }
2238}
2239
2240
// Tailoring function per script, indexed by the QChar::Script value; the
// comment in front of each entry names the script the slot is meant for.
// nullptr means that the script gets no tailoring. The table ends at
// Script_Khmer, so it must not be indexed with a larger script value.
const CharAttributeFunction charAttributeFunction[] = {
// Script_Unknown,
    nullptr,
// Script_Inherited,
    nullptr,
// Script_Common,
    nullptr,
// Script_Latin,
    nullptr,
// Script_Greek,
    nullptr,
// Script_Cyrillic,
    nullptr,
// Script_Armenian,
    nullptr,
// Script_Hebrew,
    nullptr,
// Script_Arabic,
    nullptr,
// Script_Syriac,
    nullptr,
// Script_Thaana,
    nullptr,
// Script_Devanagari,
    indicAttributes,
// Script_Bengali,
    indicAttributes,
// Script_Gurmukhi,
    indicAttributes,
// Script_Gujarati,
    indicAttributes,
// Script_Oriya,
    indicAttributes,
// Script_Tamil,
    indicAttributes,
// Script_Telugu,
    indicAttributes,
// Script_Kannada,
    indicAttributes,
// Script_Malayalam,
    indicAttributes,
// Script_Sinhala,
    indicAttributes,
// Script_Thai,
    thaiAttributes,
// Script_Lao,
    nullptr,
// Script_Tibetan,
    tibetanAttributes,
// Script_Myanmar,
    myanmarAttributes,
// Script_Georgian,
    nullptr,
// Script_Hangul,
    nullptr,
// Script_Ethiopic,
    nullptr,
// Script_Cherokee,
    nullptr,
// Script_CanadianAboriginal,
    nullptr,
// Script_Ogham,
    nullptr,
// Script_Runic,
    nullptr,
// Script_Khmer,
    khmerAttributes
};
2309
2310static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2311 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2312 QCharAttributes *attributes)
2313{
2314 if (stringLength == 0)
2315 return;
2316 for (qsizetype i = 0; i < numItems; ++i) {
2317 QChar::Script script = items[i].script;
2318 if (script > QChar::Script_Khmer)
2319 script = QChar::Script_Common;
2320 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2321 if (!attributeFunction)
2322 continue;
2323 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2324 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2325 }
2326}
2327
2328}
2329
2330Q_CORE_EXPORT void initCharAttributes(QStringView string,
2331 const ScriptItem *items, qsizetype numItems,
2332 QCharAttributes *attributes, CharAttributeOptions options)
2333{
2334 if (string.size() <= 0)
2335 return;
2336
2337 if (!(options & DontClearAttributes))
2338 ::memset(s: attributes, c: 0, n: (string.size() + 1) * sizeof(QCharAttributes));
2339
2340 if (options & GraphemeBreaks)
2341 getGraphemeBreaks(string: string.utf16(), len: string.size(), attributes);
2342 if (options & WordBreaks)
2343 getWordBreaks(string: string.utf16(), len: string.size(), attributes);
2344 if (options & SentenceBreaks)
2345 getSentenceBreaks(string: string.utf16(), len: string.size(), attributes);
2346 if (options & LineBreaks)
2347 getLineBreaks(string: string.utf16(), len: string.size(), attributes, options);
2348 if (options & WhiteSpaces)
2349 getWhiteSpaces(string: string.utf16(), len: string.size(), attributes);
2350
2351 if (!qt_initcharattributes_default_algorithm_only) {
2352 if (!items || numItems <= 0)
2353 return;
2354
2355 Tailored::getCharAttributes(string: string.utf16(), stringLength: string.size(), items, numItems, attributes);
2356 }
2357}
2358
2359
2360// ----------------------------------------------------------------------------
2361//
2362// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2363//
2364// ----------------------------------------------------------------------------
2365
2366Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2367{
2368 qsizetype sor = 0;
2369 qsizetype eor = 0;
2370 QChar::Script script = QChar::Script_Common;
2371
2372 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2373 char32_t ucs4 = string[i].unicode();
2374 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2375 ushort low = string[i + 1].unicode();
2376 if (QChar::isLowSurrogate(ucs4: low)) {
2377 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
2378 ++i;
2379 }
2380 }
2381
2382 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2383
2384 QChar::Script nscript = QChar::Script(prop->script);
2385
2386 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2387 continue;
2388
2389 // inherit preceding Common-s
2390 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2391 // also covers a case where the base character of Common script followed
2392 // by one or more combining marks of non-Inherited, non-Common script
2393 script = nscript;
2394 continue;
2395 }
2396
2397 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2398 // Thus, a combining mark - whatever its script property value is - should inherit
2399 // the script property value of its base character.
2400 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2401 if (Q_UNLIKELY(FLAG(prop->category) & test))
2402 continue;
2403
2404 Q_ASSERT(script > QChar::Script_Common);
2405 Q_ASSERT(sor < eor);
2406 scripts->append(t: ScriptItem{.position: sor, .script: script});
2407 sor = eor;
2408
2409 script = nscript;
2410 }
2411
2412 Q_ASSERT(script >= QChar::Script_Common);
2413 Q_ASSERT(eor == string.size());
2414 scripts->append(t: ScriptItem{.position: sor, .script: script});
2415}
2416
2417} // namespace QUnicodeTools
2418
2419QT_END_NAMESPACE
2420

source code of qtbase/src/corelib/text/qunicodetools.cpp