qunicodetools.cpp source code [qtbase/src/corelib/text/qunicodetools.cpp]

1	// Copyright (C) 2020 The Qt Company Ltd.
2	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4	#include "qunicodetools_p.h"
5
6	#include "qunicodetables_p.h"
7	#include "qvarlengtharray.h"
8	#if QT_CONFIG(library)
9	#include "qlibrary.h"
10	#endif
11
12	#include <limits.h>
13
14	#define FLAG(x) (1 << (x))
15
16	QT_BEGIN_NAMESPACE
17
18	using namespace Qt::StringLiterals;
19
// Switch read by getWordBreaks(): when non-zero, FULL STOP (U+002E) and
// COLON (U+003A) are forced to the MidNumLet / MidLetter word break classes.
// In internal builds the variable is a mutable, exported symbol; in all
// other builds it is a compile-time constant 0.
#ifdef QT_BUILD_INTERNAL
Q_CONSTINIT Q_AUTOTEST_EXPORT
#else
constexpr
#endif
int qt_initcharattributes_default_algorithm_only = 0;
26
27	namespace QUnicodeTools {
28
29	// -----------------------------------------------------------------------------------------------------
30	//
31	// The text boundaries determination algorithm.
32	// See https://www.unicode.org/reports/tr29/tr29-37.html
33	//
34	// -----------------------------------------------------------------------------------------------------
35
36	namespace GB {
37
38	// This table is indexed by the grapheme break classes of two
39	// (adjacent) code points.
40	// The class of the first code point selects an entry.
41	// If the entry's bit at position second_cp_class is set
42	// (in other words: if entry & (1u << second_cp_class) is non-zero)
43	// then there is NO grapheme break between the two code points.
44
45	using GBTableEntryType = quint16;
46
47	// Check that we have enough bits in the table (in case
48	// NumGraphemeBreakClasses grows too much).
49	static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50	"Internal error: increase the size in bits of GBTableEntryType");
51
52	// GB9, GB9a
53	static const GBTableEntryType Extend_SpacingMark_ZWJ =
54	FLAG(QUnicodeTables::GraphemeBreak_Extend)
55	\| FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
56	\| FLAG(QUnicodeTables::GraphemeBreak_ZWJ);
57
58	static const GBTableEntryType HardBreak = `0u`;
59
60	static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
61	Extend_SpacingMark_ZWJ, // Any
62	FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
63	HardBreak, // LF
64	HardBreak, // Control
65	Extend_SpacingMark_ZWJ, // Extend
66	Extend_SpacingMark_ZWJ, // ZWJ
67	Extend_SpacingMark_ZWJ, // RegionalIndicator
68	(Extend_SpacingMark_ZWJ
69	\| FLAG(QUnicodeTables::GraphemeBreak_Any)
70	\| FLAG(QUnicodeTables::GraphemeBreak_Prepend)
71	\| FLAG(QUnicodeTables::GraphemeBreak_L)
72	\| FLAG(QUnicodeTables::GraphemeBreak_V)
73	\| FLAG(QUnicodeTables::GraphemeBreak_T)
74	\| FLAG(QUnicodeTables::GraphemeBreak_LV)
75	\| FLAG(QUnicodeTables::GraphemeBreak_LVT)
76	\| FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
77	\| FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
78	), // Prepend
79	Extend_SpacingMark_ZWJ, // SpacingMark
80	(Extend_SpacingMark_ZWJ
81	\| FLAG(QUnicodeTables::GraphemeBreak_L)
82	\| FLAG(QUnicodeTables::GraphemeBreak_V)
83	\| FLAG(QUnicodeTables::GraphemeBreak_LV)
84	\| FLAG(QUnicodeTables::GraphemeBreak_LVT)
85	), // L
86	(Extend_SpacingMark_ZWJ
87	\| FLAG(QUnicodeTables::GraphemeBreak_V)
88	\| FLAG(QUnicodeTables::GraphemeBreak_T)
89	), // V
90	(Extend_SpacingMark_ZWJ
91	\| FLAG(QUnicodeTables::GraphemeBreak_T)
92	), // T
93	(Extend_SpacingMark_ZWJ
94	\| FLAG(QUnicodeTables::GraphemeBreak_V)
95	\| FLAG(QUnicodeTables::GraphemeBreak_T)
96	), // LV
97	(Extend_SpacingMark_ZWJ
98	\| FLAG(QUnicodeTables::GraphemeBreak_T)
99	), // LVT
100	Extend_SpacingMark_ZWJ // Extended_Pictographic
101	};
102
103	static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
104	QUnicodeTables::GraphemeBreakClass second)
105	{
106	return (breakTable[first] & FLAG(second)) == `0`;
107	}
108
109	// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110	// so we need to store some local state.
111	enum class State : uchar {
112	Normal,
113	GB11_ExtPicExt, // saw a Extend after a Extended_Pictographic
114	GB11_ExtPicExtZWJ, // saw a ZWG after a Extended_Pictographic and zero or more Extend
115	GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116	};
117
118	} // namespace GB
119
120	static void getGraphemeBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
121	{
122	QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
123	GB::State state = GB::State::Normal;
124	for (qsizetype i = `0`; i != len; ++i) {
125	qsizetype pos = i;
126	char32_t ucs4 = string[i];
127	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
128	ushort low = string[i + `1`];
129	if (QChar::isLowSurrogate(ucs4: low)) {
130	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
131	++i;
132	}
133	}
134
135	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136	QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
138	bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
139	bool handled = false;
140
141	switch (state) {
142	case GB::State::Normal:
143	break; // will deal with it below
144
145	case GB::State::GB11_ExtPicExt:
146	Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
147	if (cls == QUnicodeTables::GraphemeBreak_Extend) {
148	// keep going in the current state
149	Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150	handled = true;
151	} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
152	state = GB::State::GB11_ExtPicExtZWJ;
153	Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154	handled = true;
155	} else {
156	state = GB::State::Normal;
157	}
158	break;
159
160	case GB::State::GB11_ExtPicExtZWJ:
161	Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
162	if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
163	shouldBreak = false;
164	handled = true;
165	}
166
167	state = GB::State::Normal;
168	break;
169
170	case GB::State::GB12_13_RI:
171	Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
172	if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
173	shouldBreak = false;
174	handled = true;
175	}
176
177	state = GB::State::Normal;
178	break;
179	}
180
181	if (!handled) {
182	Q_ASSERT(state == GB::State::Normal);
183	if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
184	if (cls == QUnicodeTables::GraphemeBreak_Extend) {
185	state = GB::State::GB11_ExtPicExt;
186	Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187	} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
188	state = GB::State::GB11_ExtPicExtZWJ;
189	Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190	}
191	} else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192	state = GB::State::GB12_13_RI;
193	}
194	}
195
196	if (shouldBreak)
197	attributes[pos].graphemeBoundary = true;
198
199	lcls = cls;
200	}
201
202	attributes[len].graphemeBoundary = true; // GB2
203	}
204
205
206	namespace WB {
207
208	enum Action {
209	NoBreak,
210	Break,
211	Lookup,
212	LookupW
213	};
214
215	static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
216	// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218	{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
226	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // HebrewLetter
227	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // ALetter
228	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
233	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break }, // Numeric
234	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet
235	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236	};
237
238	} // namespace WB
239
240	static void getWordBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
241	{
242	enum WordType {
243	WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244	} currentWordType = WordTypeNone;
245
246	QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
247	auto real_cls = cls; // Unaffected by WB4
248
249	for (qsizetype i = `0`; i != len; ++i) {
250	qsizetype pos = i;
251	char32_t ucs4 = string[i];
252	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
253	ushort low = string[i + `1`];
254	if (QChar::isLowSurrogate(ucs4: low)) {
255	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
256	++i;
257	}
258	}
259
260	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261	QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
262	if (qt_initcharattributes_default_algorithm_only) {
263	// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264	// which caused "hi.there" to be treated like if it were just a single word;
265	// we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266	// and this code is needed to pass the coverage tests; remove once the issue is fixed.
267	if (ucs4 == `0x002E`) // FULL STOP
268	ncls = QUnicodeTables::WordBreak_MidNumLet;
269	else if (ucs4 == `0x003A`) // COLON
270	ncls = QUnicodeTables::WordBreak_MidLetter;
271	}
272
273	uchar action = WB::breakTable[cls][ncls];
274	switch (action) {
275	case WB::Break:
276	if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277	&& prop->graphemeBreakClass
278	== QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
279	// WB3c: ZWJ × \p{Extended_Pictographic}
280	action = WB::NoBreak;
281	}
282	break;
283	case WB::NoBreak:
284	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend \|\| ncls == QUnicodeTables::WordBreak_ZWJ \|\| ncls == QUnicodeTables::WordBreak_Format)) {
285	// WB4: X(Extend\|Format) -> X*
286	real_cls = ncls;
287	continue;
288	}
289	if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290	// WB15/WB16: break between pairs of Regional indicator
291	ncls = QUnicodeTables::WordBreak_Any;
292	}
293	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294	&& real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295	// WB3d should not be affected by WB4
296	action = WB::Break;
297	}
298	break;
299	case WB::Lookup:
300	case WB::LookupW:
301	for (qsizetype lookahead = i + `1`; lookahead < len; ++lookahead) {
302	ucs4 = string[lookahead];
303	if (QChar::isHighSurrogate(ucs4) && lookahead + `1` != len) {
304	ushort low = string[lookahead + `1`];
305	if (QChar::isLowSurrogate(ucs4: low)) {
306	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
307	++lookahead;
308	}
309	}
310
311	prop = QUnicodeTables::properties(ucs4);
312	QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
314	if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend \|\| tcls == QUnicodeTables::WordBreak_ZWJ \|\| tcls == QUnicodeTables::WordBreak_Format)) {
315	// WB4: X(Extend\|Format) -> X*
316	continue;
317	}
318
319	if (Q_LIKELY(tcls == cls \|\| (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320	\|\| tcls == QUnicodeTables::WordBreak_ALetter)))) {
321	i = lookahead;
322	ncls = tcls;
323	action = WB::NoBreak;
324	}
325	break;
326	}
327	if (action != WB::NoBreak) {
328	action = WB::Break;
329	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
330	action = WB::NoBreak; // WB7a
331	}
332	break;
333	}
334
335	cls = ncls;
336	real_cls = ncls;
337
338	if (action == WB::Break) {
339	attributes[pos].wordBreak = true;
340	if (currentWordType != WordTypeNone)
341	attributes[pos].wordEnd = true;
342	switch (cls) {
343	case QUnicodeTables::WordBreak_Katakana:
344	currentWordType = WordTypeHiraganaKatakana;
345	attributes[pos].wordStart = true;
346	break;
347	case QUnicodeTables::WordBreak_HebrewLetter:
348	case QUnicodeTables::WordBreak_ALetter:
349	case QUnicodeTables::WordBreak_Numeric:
350	currentWordType = WordTypeAlphaNumeric;
351	attributes[pos].wordStart = true;
352	break;
353	default:
354	currentWordType = WordTypeNone;
355	break;
356	}
357	}
358	}
359
360	if (currentWordType != WordTypeNone)
361	attributes[len].wordEnd = true;
362	attributes[len].wordBreak = true; // WB2
363	}
364
365
366	namespace SB {
367
368	enum State {
369	Initial,
370	Lower,
371	Upper,
372	LUATerm,
373	ATerm,
374	ATermC,
375	ACS,
376	STerm,
377	STermC,
378	SCS,
379	BAfterC,
380	BAfter,
381	Break,
382	Lookup
383	};
384
385	static const uchar breakTable[BAfter + `1`][QUnicodeTables::NumSentenceBreakClasses] = {
386	// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
387	{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
388	{ Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
389	{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
390
391	{ Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392	{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393	{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394	{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396	{ Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397	{ Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398	{ Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399	{ Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401	};
402
403	} // namespace SB
404
405	static void getSentenceBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
406	{
407	uchar state = SB::BAfter; // to meet SB1
408	for (qsizetype i = `0`; i != len; ++i) {
409	qsizetype pos = i;
410	char32_t ucs4 = string[i];
411	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
412	ushort low = string[i + `1`];
413	if (QChar::isLowSurrogate(ucs4: low)) {
414	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
415	++i;
416	}
417	}
418
419	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420	QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
422	Q_ASSERT(state <= SB::BAfter);
423	state = SB::breakTable[state][ncls];
424	if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
425	state = SB::Break;
426	for (qsizetype lookahead = i + `1`; lookahead < len; ++lookahead) {
427	ucs4 = string[lookahead];
428	if (QChar::isHighSurrogate(ucs4) && lookahead + `1` != len) {
429	ushort low = string[lookahead + `1`];
430	if (QChar::isLowSurrogate(ucs4: low)) {
431	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
432	++lookahead;
433	}
434	}
435
436	prop = QUnicodeTables::properties(ucs4);
437	QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
438	switch (tcls) {
439	case QUnicodeTables::SentenceBreak_Any:
440	case QUnicodeTables::SentenceBreak_Extend:
441	case QUnicodeTables::SentenceBreak_Sp:
442	case QUnicodeTables::SentenceBreak_Numeric:
443	case QUnicodeTables::SentenceBreak_SContinue:
444	case QUnicodeTables::SentenceBreak_Close:
445	continue;
446	case QUnicodeTables::SentenceBreak_Lower:
447	i = lookahead;
448	state = SB::Initial;
449	break;
450	default:
451	break;
452	}
453	break;
454	}
455	}
456	if (Q_UNLIKELY(state == SB::Break)) {
457	attributes[pos].sentenceBoundary = true;
458	state = SB::breakTable[SB::Initial][ncls];
459	}
460	}
461
462	attributes[len].sentenceBoundary = true; // SB2
463	}
464
465
466	// -----------------------------------------------------------------------------------------------------
467	//
468	// The line breaking algorithm.
469	// See http://www.unicode.org/reports/tr14/tr14-39.html
470	//
471	// -----------------------------------------------------------------------------------------------------
472
473	namespace LB {
474
475	namespace NS { // Number Sequence
476
477	// LB25 recommends to not break lines inside numbers of the form
478	// described by the following regular expression:
479	// (PR\|PO)?(OP\|HY)?NU(NU\|SY\|IS)(CL\|CP)?(PR\|PO)?*
480
481	enum Action {
482	None,
483	Start,
484	Continue,
485	Break
486	};
487
488	enum Class {
489	XX,
490	PRPO,
491	OPHY,
492	NU,
493	SYIS,
494	CLCP
495	};
496
497	static const uchar actionTable[CLCP + `1`][CLCP + `1`] = {
498	// XX PRPO OPHY NU SYIS CLCP
499	{ None , Start , Start , Start , None , None }, // XX
500	{ None , Start , Continue, Continue, None , None }, // PRPO
501	{ None , Start , Start , Continue, None , None }, // OPHY
502	{ Break , Break , Break , Continue, Continue, Continue }, // NU
503	{ Break , Break , Break , Continue, Continue, Continue }, // SYIS
504	{ Break , Continue, Break , Break , Break , Break }, // CLCP
505	};
506
507	inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
508	{
509	switch (lbc) {
510	case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
511	// resolve AI math symbols in numerical context to IS
512	if (category == QChar::Symbol_Math)
513	return SYIS;
514	break;
515	case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
516	return PRPO;
517	case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
518	return OPHY;
519	case QUnicodeTables::LineBreak_NU:
520	return NU;
521	case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
522	return SYIS;
523	case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
524	return CLCP;
525	default:
526	break;
527	}
528	return XX;
529	}
530
531	} // namespace NS
532
533	/ In order to support the tailored implementation of LB25 properly*
534	the following changes were made in the pair table to allow breaks
535	where the numeric expression doesn't match the template (i.e. [^NU](IS\|SY)NU):
536	(CL)(PO) from IB to DB
537	(CP)(PO) from IB to DB
538	(CL)(PR) from IB to DB
539	(CP)(PR) from IB to DB
540	(PO)(OP) from IB to DB
541	(PR)(OP) from IB to DB
542	(IS)(NU) from IB to DB
543	(SY)(NU) from IB to DB
544	*/
545
546	/ In order to implementat LB21a properly a special rule HH has been introduced and*
547	the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
548	(HL)(HY\|BA) from IB to CI
549	(HY\|BA)(!CB) from DB to HH
550	*/
551
552	enum Action {
553	ProhibitedBreak, PB = ProhibitedBreak,
554	DirectBreak, DB = DirectBreak,
555	IndirectBreak, IB = IndirectBreak,
556	CombiningIndirectBreak, CI = CombiningIndirectBreak,
557	CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
558	ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
559	IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
560	};
561
562	static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
563	/ OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM/
564	/ OP / { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
565	/ CL / { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
566	/ CP / { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
567	/ QU / { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
568	/ GL / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
569	/ NS / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
570	/ EX / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
571	/ SY / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
572	/ IS / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
573	/ PR / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
574	/ PO / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
575	/ NU / { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
576	/ AL / { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
577	/ HL / { IN, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
578	/ ID / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
579	/ IN / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
580	/ HY / { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
581	/ BA / { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
582	/ BB / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
583	/ B2 / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
584	/ ZW / { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
585	/ CM / { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
586	/ WJ / { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
587	/ H2 / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
588	/ H3 / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
589	/ JL / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
590	/ JV / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
591	/ JT / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
592	/ RI / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
593	/ CB / { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
594	/ EB / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
595	/ EM / { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
596	};
597
598	// The following line break classes are not treated by the pair table
599	// and must be resolved outside:
600	// AI, BK, CB, CJ, CR, LF, NL, ZWJ, SA, SG, SP, XX
601
602	} // namespace LB
603
604	static void getLineBreaks(const char16_t string, qsizetype len, QCharAttributes attributes, QUnicodeTools::CharAttributeOptions options)
605	{
606	qsizetype nestart = `0`;
607	LB::NS::Class nelast = LB::NS::XX;
608
609	QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
610	QUnicodeTables::LineBreakClass cls = lcls;
611	const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(ucs4: U`'\n'`);
612
613	for (qsizetype i = `0`; i != len; ++i) {
614	qsizetype pos = i;
615	char32_t ucs4 = string[i];
616	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
617	ushort low = string[i + `1`];
618	if (QChar::isLowSurrogate(ucs4: low)) {
619	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
620	++i;
621	}
622	}
623
624	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
625	QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
626	QUnicodeTables::LineBreakClass tcls;
627
628	if (options & QUnicodeTools::HangulLineBreakTailoring) {
629	if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
630	&& ncls <= QUnicodeTables::LineBreak_JT)
631	\|\| (ucs4 >= `0x3130` && ucs4 <= `0x318F` && ncls == QUnicodeTables::LineBreak_ID))
632	) {
633	// LB27: use SPACE for line breaking
634	// "When Korean uses SPACE for line breaking, the classes in rule LB26,
635	// as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
636	// In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
637	ncls = QUnicodeTables::LineBreak_AL;
638	} else {
639	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
640	// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
641	static const int test = FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining);
642	if (FLAG(prop->category) & test)
643	ncls = QUnicodeTables::LineBreak_CM;
644	}
645	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
646	// LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
647	if (lcls == QUnicodeTables::LineBreak_ZW \|\| lcls >= QUnicodeTables::LineBreak_SP)
648	ncls = QUnicodeTables::LineBreak_AL;
649	}
650	}
651	}
652
653	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
654	// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
655	static const int test = FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining);
656	if (FLAG(prop->category) & test)
657	ncls = QUnicodeTables::LineBreak_CM;
658	}
659
660	if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
661	// LB4: BK!, LB5: (CRxLF\|CR\|LF\|NL)!
662	if (lcls > QUnicodeTables::LineBreak_CR \|\| ncls != QUnicodeTables::LineBreak_LF)
663	attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
664	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM \|\| ncls == QUnicodeTables::LineBreak_ZWJ)) {
665	cls = QUnicodeTables::LineBreak_AL;
666	goto next_no_cls_update;
667	}
668	goto next;
669	}
670
671	if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
672	if (ncls > QUnicodeTables::LineBreak_SP)
673	goto next; // LB6: x(BK\|CR\|LF\|NL)
674	goto next_no_cls_update; // LB7: xSP
675	}
676
677	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM \|\| ncls == QUnicodeTables::LineBreak_ZWJ)) {
678	// LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
679	if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
680	// don't update anything
681	goto next_no_cls_update;
682	}
683
684	if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
685	// LB8a: ZWJ x
686	goto next;
687	}
688
689	// LB25: do not break lines inside numbers
690	{
691	LB::NS::Class necur = LB::NS::toClass(lbc: ncls, category: (QChar::Category)prop->category);
692	switch (LB::NS::actionTable[nelast][necur]) {
693	case LB::NS::Break:
694	// do not change breaks before and after the expression
695	for (qsizetype j = nestart + `1`; j < pos; ++j)
696	attributes[j].lineBreak = false;
697	Q_FALLTHROUGH();
698	case LB::NS::None:
699	nelast = LB::NS::XX; // reset state
700	break;
701	case LB::NS::Start:
702	nestart = i;
703	Q_FALLTHROUGH();
704	default:
705	nelast = necur;
706	break;
707	}
708	}
709
710	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
711	// LB30a
712	ncls = QUnicodeTables::LineBreak_SP;
713	goto next;
714	}
715
716	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
717	&& lastProp->category == QChar::Other_NotAssigned
718	&& lastProp->graphemeBreakClass
719	== QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
720	// LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
721	goto next;
722	}
723
724	// for South East Asian chars that require a complex analysis, the Unicode
725	// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
726	if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
727	cls = QUnicodeTables::LineBreak_AL;
728
729	tcls = cls;
730	if (tcls == QUnicodeTables::LineBreak_CM \|\| tcls == QUnicodeTables::LineBreak_ZWJ)
731	// LB10
732	tcls = QUnicodeTables::LineBreak_AL;
733	switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
734	case LB::DirectBreak:
735	attributes[pos].lineBreak = true;
736	break;
737	case LB::IndirectBreak:
738	if (lcls == QUnicodeTables::LineBreak_SP)
739	attributes[pos].lineBreak = true;
740	break;
741	case LB::CombiningIndirectBreak:
742	if (lcls != QUnicodeTables::LineBreak_SP)
743	goto next_no_cls_update;
744	attributes[pos].lineBreak = true;
745	break;
746	case LB::CombiningProhibitedBreak:
747	if (lcls != QUnicodeTables::LineBreak_SP)
748	goto next_no_cls_update;
749	break;
750	case LB::ProhibitedBreakAfterHebrewPlusHyphen:
751	if (lcls != QUnicodeTables::LineBreak_HL)
752	attributes[pos].lineBreak = true;
753	break;
754	case LB::IndirectBreakIfNarrow:
755	switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
756	default:
757	if (lcls != QUnicodeTables::LineBreak_SP)
758	break;
759	Q_FALLTHROUGH();
760	case QUnicodeTables::EastAsianWidth::F:
761	case QUnicodeTables::EastAsianWidth::W:
762	case QUnicodeTables::EastAsianWidth::H:
763	attributes[pos].lineBreak = true;
764	break;
765	}
766	break;
767	case LB::ProhibitedBreak:
768	// nothing to do
769	default:
770	break;
771	}
772
773	next:
774	cls = ncls;
775	lastProp = prop;
776	next_no_cls_update:
777	lcls = ncls;
778	}
779
780	if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
781	// LB25: do not break lines inside numbers
782	for (qsizetype j = nestart + `1`; j < len; ++j)
783	attributes[j].lineBreak = false;
784	}
785
786	attributes[`0`].lineBreak = attributes[`0`].mandatoryBreak = false; // LB2
787	attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
788	}
789
790
791	static void getWhiteSpaces(const char16_t string, qsizetype len, QCharAttributes attributes)
792	{
793	for (qsizetype i = `0`; i != len; ++i) {
794	uint ucs4 = string[i];
795	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
796	ushort low = string[i + `1`];
797	if (QChar::isLowSurrogate(ucs4: low)) {
798	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
799	++i;
800	}
801	}
802
803	if (Q_UNLIKELY(QChar::isSpace(ucs4)))
804	attributes[i].whiteSpace = true;
805	}
806	}
807
808	namespace Tailored {
809
810	using CharAttributeFunction = void ()(QChar::Script script, const* char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes);
811
812
// Classification of a code unit for the Indic syllable state machine.
// UnknownForm is an alias of Invalid (both are 0).
enum Form {
    Invalid = 0x0,
    UnknownForm = Invalid,
    Consonant,
    Nukta,
    Halant,
    Matra,
    VowelMark,
    StressMark,
    IndependentVowel,
    LengthMark,
    Control,
    Other
};

// Form of every code point in U+0900..U+0DFF, indexed by (code point - 0x900).
// Each script below occupies one 0x80-entry block.
static const unsigned char indicForms[0xe00-0x900] = {
    // Devanagari
    Invalid, VowelMark, VowelMark, VowelMark,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,

    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, StressMark, StressMark, StressMark,
    StressMark, UnknownForm, UnknownForm, UnknownForm,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Consonant,
    Consonant, Consonant /* ??? */, Consonant, Consonant,

    // Bengali
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Invalid,
    Invalid, Invalid, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, Consonant, UnknownForm,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, VowelMark,
    Invalid, Invalid, Invalid, Invalid,
    Consonant, Consonant, Invalid, Consonant,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Consonant, Consonant, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Gurmukhi
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Invalid,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Invalid,
    Invalid, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, UnknownForm, UnknownForm, UnknownForm,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Invalid,

    Other, Other, Invalid, Invalid,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    StressMark, StressMark, Consonant, Consonant,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Gujarati
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, Invalid, IndependentVowel,

    IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Matra, Invalid, Matra,
    Matra, Matra, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Oriya
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Invalid, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, Invalid, Invalid, Invalid,
    Invalid, UnknownForm, LengthMark, LengthMark,
    Invalid, Invalid, Invalid, Invalid,
    Consonant, Consonant, Invalid, Consonant,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Consonant, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Tamil
    Invalid, Invalid, VowelMark, Other,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Invalid, Invalid,
    Invalid, Consonant, Consonant, Invalid,
    Consonant, Invalid, Consonant, Consonant,

    Invalid, Invalid, Invalid, Consonant,
    Consonant, Invalid, Invalid, Invalid,
    Consonant, Consonant, Consonant, Invalid,
    Invalid, Invalid, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Invalid,
    Invalid, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, LengthMark,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Telugu
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, LengthMark, Matra, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Kannada
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, LengthMark, LengthMark, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Consonant, Invalid,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Malayalam
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Invalid, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Matra,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Sinhala
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,

    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Invalid, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Invalid, Invalid,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Invalid,
    Invalid, Invalid, Halant, Invalid,
    Invalid, Invalid, Invalid, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Invalid,
    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    Invalid, Invalid, Matra, Matra,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
};

// Returns the Form of the UTF-16 code unit uc.
//
// Code units inside U+0900..U+0DFF are looked up in indicForms. Outside that
// range U+25CC is reported as Consonant, U+200C and U+200D as Control, and
// everything else as Other.
static inline Form form(unsigned short uc) {
    if (uc < 0x900 || uc > 0xdff) {
        if (uc == 0x25cc)
            return Consonant;
        if (uc == 0x200c || uc == 0x200d)
            return Control;
        return Other;
    }
    return static_cast<Form>(indicForms[uc - 0x900]);
}
1250
// Uncomment the next line to enable the tracing done through IDEBUG().
// #define INDIC_DEBUG
#ifdef INDIC_DEBUG
#define IDEBUG qDebug
#else
// The qDebug call sits in the never-taken else branch, so its arguments are
// still compiled but never evaluated.
#define IDEBUG if constexpr (true) ; else qDebug
#endif
1257
1258	/ syllables are of the form:*
1259
1260	(Consonant Nukta? Halant) Consonant Matra? VowelMark? StressMark?*
1261	(Consonant Nukta? Halant) Consonant Halant*
1262	IndependentVowel VowelMark? StressMark?
1263
1264	We return syllable boundaries on invalid combinations as well
1265	*/
1266	static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1267	{
1268	invalid = false*;
1269	IDEBUG(msg: "indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1270	const char16_t *uc = s+start;
1271
1272	qsizetype pos = `0`;
1273	Form state = form(uc: uc[pos]);
1274	IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1275	pos++;
1276
1277	if (state != Consonant && state != IndependentVowel) {
1278	if (state != Other)
1279	invalid = true*;
1280	goto finish;
1281	}
1282
1283	while (pos < end - start) {
1284	Form newState = form(uc: uc[pos]);
1285	IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1286	switch (newState) {
1287	case Control:
1288	newState = state;
1289	if (state == Halant && uc[pos] == `0x200d` / ZWJ /)
1290	break;
1291	// the control character should be the last char in the item
1292	if (state == Consonant && script == QChar::Script_Bengali && uc[pos-`1`] == `0x09B0` && uc[pos] == `0x200d` / ZWJ /)
1293	break;
1294	if (state == Consonant && script == QChar::Script_Kannada && uc[pos-`1`] == `0x0CB0` && uc[pos] == `0x200d` / ZWJ /)
1295	break;
1296	// Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1297	++pos;
1298	goto finish;
1299	case Consonant:
1300	if (state == Halant && (script != QChar::Script_Sinhala \|\| uc[pos-`1`] == `0x200d` / ZWJ /))
1301	break;
1302	goto finish;
1303	case Halant:
1304	if (state == Nukta \|\| state == Consonant)
1305	break;
1306	// Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1307	if (script == QChar::Script_Bengali && pos == `1` &&
1308	(uc[`0`] == `0x0985` \|\| uc[`0`] == `0x098f`))
1309	break;
1310	// Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1311	if (script == QChar::Script_Sinhala && state == Matra) {
1312	++pos;
1313	continue;
1314	}
1315	if (script == QChar::Script_Malayalam && state == Matra && uc[pos-`1`] == `0x0d41`) {
1316	++pos;
1317	continue;
1318	}
1319	goto finish;
1320	case Nukta:
1321	if (state == Consonant)
1322	break;
1323	goto finish;
1324	case StressMark:
1325	if (state == VowelMark)
1326	break;
1327	Q_FALLTHROUGH();
1328	case VowelMark:
1329	if (state == Matra \|\| state == LengthMark \|\| state == IndependentVowel)
1330	break;
1331	Q_FALLTHROUGH();
1332	case Matra:
1333	if (state == Consonant \|\| state == Nukta)
1334	break;
1335	if (state == Matra) {
1336	// ### needs proper testing for correct two/three part matras
1337	break;
1338	}
1339	// ### not sure if this is correct. If it is, does it apply only to Bengali or should
1340	// it work for all Indic languages?
1341	// the combination Independent_A + Vowel Sign AA is allowed.
1342	if (script == QChar::Script_Bengali && uc[pos] == `0x9be` && uc[pos-`1`] == `0x985`)
1343	break;
1344	if (script == QChar::Script_Tamil && state == Matra) {
1345	if (uc[pos-`1`] == `0x0bc6` &&
1346	(uc[pos] == `0xbbe` \|\| uc[pos] == `0xbd7`))
1347	break;
1348	if (uc[pos-`1`] == `0x0bc7` && uc[pos] == `0xbbe`)
1349	break;
1350	}
1351	goto finish;
1352
1353	case LengthMark:
1354	if (state == Matra) {
1355	// ### needs proper testing for correct two/three part matras
1356	break;
1357	}
1358	case IndependentVowel:
1359	case Invalid:
1360	case Other:
1361	goto finish;
1362	}
1363	state = newState;
1364	pos++;
1365	}
1366	finish:
1367	return pos+start;
1368	}
1369
1370	static void indicAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1371	{
1372	qsizetype end = from + len;
1373	attributes += from;
1374	qsizetype i = `0`;
1375	while (i < len) {
1376	bool invalid;
1377	qsizetype boundary = indic_nextSyllableBoundary(script, s: text, start: from+i, end, invalid: &invalid) - from;
1378	attributes[i].graphemeBoundary = true;
1379
1380	if (boundary > len-`1`) boundary = len;
1381	i++;
1382	while (i < boundary) {
1383	attributes[i].graphemeBoundary = false;
1384	++i;
1385	}
1386	assert(i == boundary);
1387	}
1388
1389
1390	}
1391
1392	#if QT_CONFIG(library)
1393
// Major version passed to QLibrary when the "thai" library is opened.
#define LIBTHAI_MAJOR 0

/*
 * if libthai changed please update these codes too.
 */
struct thcell_t {
    unsigned char base;      /**< base character */
    unsigned char hilo;      /**< upper/lower vowel/diacritic */
    unsigned char top;       /**< top-level mark */
};

// Opaque libthai handle; it is only ever used through pointers.
using ThBrk = struct _ThBrk;
1406
1407	namespace {
1408
1409	class LibThai final
1410	{
1411	Q_DISABLE_COPY_MOVE(LibThai)
1412
1413	using th_brk_new_def = ThBrk ()(const char *);
1414	using th_brk_delete_def = void ()(ThBrk );
1415	using th_brk_find_breaks_def = int ()(ThBrk , const unsigned char , int* *, size_t);
1416	using th_next_cell_def = size_t ()(const* unsigned char , size_t, struct* thcell_t , int*);
1417
1418	public:
1419	LibThai() : m_library ("thai"_L1, LIBTHAI_MAJOR)
1420	{
1421	m_th_brk_find_breaks =
1422	reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve(symbol: "th_brk_find_breaks"));
1423	m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve(symbol: "th_next_cell"));
1424
1425	auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve(symbol: "th_brk_new"));
1426	if (th_brk_new) {
1427	m_state = th_brk_new(nullptr);
1428	m_th_brk_delete =
1429	reinterpret_cast<th_brk_delete_def>(m_library.resolve(symbol: "th_brk_delete"));
1430	}
1431	}
1432
1433	~LibThai()
1434	{
1435	if (m_state && m_th_brk_delete)
1436	m_th_brk_delete(m_state);
1437	m_library.unload();
1438	}
1439
1440	bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1441
1442	int brk_find_breaks(const unsigned char s, int* pos, size_t pos_sz) const*
1443	{
1444	Q_ASSERT(m_state);
1445	Q_ASSERT(m_th_brk_find_breaks);
1446	return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1447	}
1448
1449	size_t next_cell(const unsigned char s, size_t len, struct* thcell_t cell, int* is_decomp_am)
1450	{
1451	Q_ASSERT(m_th_next_cell);
1452	return m_th_next_cell(s, len, cell, is_decomp_am);
1453	}
1454
1455	private:
1456	QLibrary m_library;
1457
1458	// Global state for th_brk_find_breaks().
1459	// Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1460	// state is read-only, and so it is safe to use it from multiple threads after
1461	// initialization. This is also stated in the libthai documentation.
1462	ThBrk m_state = nullptr*;
1463
1464	th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1465	th_next_cell_def m_th_next_cell = nullptr;
1466	th_brk_delete_def m_th_brk_delete = nullptr;
1467	};
1468
1469	} // unnamed namespace
1470
1471	Q_GLOBAL_STATIC(LibThai, g_libThai)
1472
1473	static void to_tis620(const char16_t string, qsizetype len, char* *cstr)
1474	{
1475	qsizetype i;
1476	unsigned char result = reinterpret_cast<unsigned* char *>(cstr);
1477
1478	for (i = `0`; i < len; ++i) {
1479	if (string[i] <= `0xa0`)
1480	result[i] = static_cast<unsigned char>(string[i]);
1481	else if (string[i] >= `0xe01` && string[i] <= `0xe5b`)
1482	result[i] = static_cast<unsigned char>(string[i] - `0xe00` + `0xa0`);
1483	else
1484	result[i] = static_cast<unsigned char>(~`0`); // Same encoding as libthai uses for invalid chars
1485	}
1486
1487	result[len] = `0`;
1488	}
1489
1490	/*
1491	* Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1492	*/
1493	static void thaiAssignAttributes(const char16_t string, qsizetype len, QCharAttributes attributes)
1494	{
1495	constexpr qsizetype Prealloc = `128`;
1496	QVarLengthArray<char, Prealloc + `1`> s(len + `1`);
1497	QVarLengthArray<int, Prealloc> break_positions(len);
1498	qsizetype numbreaks, i;
1499	struct thcell_t tis_cell;
1500
1501	LibThai *libThai = g_libThai;
1502	if (!libThai \|\| !libThai->isInitialized())
1503	return;
1504
1505	to_tis620(string, len, cstr: s.data());
1506
1507	for (i = `0`; i < len; ++i) {
1508	attributes[i].wordBreak = false;
1509	attributes[i].wordStart = false;
1510	attributes[i].wordEnd = false;
1511	attributes[i].lineBreak = false;
1512	}
1513
1514	attributes[`0`].wordBreak = true;
1515	attributes[`0`].wordStart = true;
1516	attributes[`0`].wordEnd = false;
1517	numbreaks = libThai->brk_find_breaks(s: reinterpret_cast<const unsigned char *>(s.data()),
1518	pos: break_positions.data(),
1519	pos_sz: static_cast<size_t>(break_positions.size()));
1520	for (i = `0`; i < numbreaks; ++i) {
1521	attributes[break_positions [i]].wordBreak = true;
1522	attributes[break_positions [i]].wordStart = true;
1523	attributes[break_positions [i]].wordEnd = true;
1524	attributes[break_positions [i]].lineBreak = true;
1525	}
1526	if (numbreaks > `0`)
1527	attributes[break_positions [numbreaks - `1`]].wordStart = false;
1528
1529	/ manage grapheme boundaries /
1530	i = `0`;
1531	while (i < len) {
1532	size_t cell_length =
1533	libThai->next_cell(s: reinterpret_cast<const unsigned char *>(s.data()) + i,
1534	len: size_t(len - i), cell: &tis_cell, is_decomp_am: true);
1535
1536	attributes[i].graphemeBoundary = true;
1537	for (size_t j = `1`; j < cell_length; ++j)
1538	attributes[i + j].graphemeBoundary = false;
1539
1540	i += cell_length;
1541	}
1542	}
1543
1544	#endif // QT_CONFIG(library)
1545
1546	static void thaiAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1547	{
1548	assert(script == QChar::Script_Thai);
1549	#if QT_CONFIG(library)
1550	const char16_t *uc = text + from;
1551	attributes += from;
1552	Q_UNUSED(script);
1553	thaiAssignAttributes(string: uc, len, attributes);
1554	#else
1555	Q_UNUSED(script);
1556	Q_UNUSED(text);
1557	Q_UNUSED(from);
1558	Q_UNUSED(len);
1559	Q_UNUSED(attributes);
1560	#endif
1561	}
1562
1563	/*
1564	tibetan syllables are of the form:
1565	head position consonant
1566	first sub-joined consonant
1567	....intermediate sub-joined consonants (if any)
1568	last sub-joined consonant
1569	sub-joined vowel (a-chung U+0F71)
1570	standard or compound vowel sign (or 'virama' for devanagari transliteration)
1571	*/
1572
1573	typedef enum {
1574	TibetanOther,
1575	TibetanHeadConsonant,
1576	TibetanSubjoinedConsonant,
1577	TibetanSubjoinedVowel,
1578	TibetanVowel
1579	} TibetanForm;
1580
1581	/ this table starts at U+0f40 /
1582	static const unsigned char tibetanForm[`0x80`] = {
1583	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1584	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1585	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1586	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1587
1588	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1589	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1590	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1591	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1592
1593	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1594	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1595	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1596	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1597
1598	TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
1599	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1600	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1601	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1602
1603	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1604	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1605	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1606	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1607
1608	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1609	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1610	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1611	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1612
1613	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1614	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1615	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1616	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1617
1618	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1619	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1620	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1621	TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
1622	};
1623
// Classifies a UTF-16 code unit through the tibetanForm table, which is
// indexed from 0x0f40 and consulted for [0x0f40, 0x0fc0); every code unit
// outside that window is TibetanOther.
// Function-like macro: the argument is evaluated more than once, so it must
// not have side effects.
#define tibetan_form(c) \
    (((c) < 0x0f40 || (c) >= 0x0fc0) \
        ? TibetanOther \
        : static_cast<TibetanForm>(tibetanForm[(c) - 0x0f40]))
1626
1627	static qsizetype tibetan_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1628	{
1629	const char16_t *uc = s + start;
1630
1631	qsizetype pos = `0`;
1632	TibetanForm state = tibetan_form(*uc);
1633
1634	/ qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);/
1635	pos++;
1636
1637	if (state != TibetanHeadConsonant) {
1638	if (state != TibetanOther)
1639	invalid = true*;
1640	goto finish;
1641	}
1642
1643	while (pos < end - start) {
1644	TibetanForm newState = tibetan_form(uc[pos]);
1645	switch (newState) {
1646	case TibetanSubjoinedConsonant:
1647	case TibetanSubjoinedVowel:
1648	if (state != TibetanHeadConsonant &&
1649	state != TibetanSubjoinedConsonant)
1650	goto finish;
1651	state = newState;
1652	break;
1653	case TibetanVowel:
1654	if (state != TibetanHeadConsonant &&
1655	state != TibetanSubjoinedConsonant &&
1656	state != TibetanSubjoinedVowel)
1657	goto finish;
1658	break;
1659	case TibetanOther:
1660	case TibetanHeadConsonant:
1661	goto finish;
1662	}
1663	pos++;
1664	}
1665
1666	finish:
1667	invalid = false*;
1668	return start+pos;
1669	}
1670
1671	static void tibetanAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1672	{
1673	qsizetype end = from + len;
1674	qsizetype i = `0`;
1675	Q_UNUSED(script);
1676	attributes += from;
1677	while (i < len) {
1678	bool invalid;
1679	qsizetype boundary = tibetan_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
1680
1681	attributes[i].graphemeBoundary = true;
1682
1683	if (boundary > len-`1`) boundary = len;
1684	i++;
1685	while (i < boundary) {
1686	attributes[i].graphemeBoundary = false;
1687	++i;
1688	}
1689	assert(i == boundary);
1690	}
1691	}
1692
// Myanmar character classes. The class value (the low bits selected by
// Mymr_CF_CLASS_MASK) is used as the column index into mymrStateTable, so the
// numbering here has to stay in step with the columns of that table.
enum MymrCharClassValues {
    Mymr_CC_RESERVED           =  0,
    Mymr_CC_CONSONANT          =  1, // consonant of type 1 (has a subscript form)
    Mymr_CC_CONSONANT2         =  2, // consonant of type 2 (has no subscript form)
    Mymr_CC_NGA                =  3, // consonant NGA
    Mymr_CC_YA                 =  4, // consonant YA
    Mymr_CC_RA                 =  5, // consonant RA
    Mymr_CC_WA                 =  6, // consonant WA
    Mymr_CC_HA                 =  7, // consonant HA
    Mymr_CC_IND_VOWEL          =  8, // independent vowel
    Mymr_CC_ZERO_WIDTH_NJ_MARK =  9, // zero width non-joiner (0x200C)
    Mymr_CC_VIRAMA             = 10, // subscript consonant combining character
    Mymr_CC_PRE_VOWEL          = 11, // dependent vowel placed before the base (vowel e)
    Mymr_CC_BELOW_VOWEL        = 12, // dependent vowel placed below the base (vowels u, uu)
    Mymr_CC_ABOVE_VOWEL        = 13, // dependent vowel placed above the base (vowels i, ii, ai)
    Mymr_CC_POST_VOWEL         = 14, // dependent vowel placed after the base (vowel aa)
    Mymr_CC_SIGN_ABOVE         = 15,
    Mymr_CC_SIGN_BELOW         = 16,
    Mymr_CC_SIGN_AFTER         = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK  = 18, // zero width joiner
    Mymr_CC_COUNT              = 19  // number of character classes
};
1715
// Flag bits that are OR-ed on top of a MymrCharClassValues value.
enum MymrCharClassFlags {
    // Selects the class value out of a combined class-plus-flags word.
    Mymr_CF_CLASS_MASK    = 0x0000FFFF,

    // Position flags.
    Mymr_CF_POS_AFTER     = 0x00010000,
    Mymr_CF_POS_ABOVE     = 0x00020000,
    Mymr_CF_POS_BELOW     = 0x00040000,
    Mymr_CF_POS_BEFORE    = 0x00080000,
    Mymr_CF_POS_MASK      = 0x000f0000,

    Mymr_CF_AFTER_KINZI   = 0x00100000,

    // Category flags; they exist to speed up comparisons.
    Mymr_CF_CONSONANT     = 0x01000000,
    Mymr_CF_MEDIAL        = 0x02000000,
    Mymr_CF_IND_VOWEL     = 0x04000000,
    Mymr_CF_DEP_VOWEL     = 0x08000000,
    // Add a dotted circle if a character with this flag is the first in a
    // syllable.
    Mymr_CF_DOTTED_CIRCLE = 0x10000000,
    Mymr_CF_VIRAMA        = 0x20000000
};
1736
// Needed by the Mymr_xx ... Mymr_sp definitions below, which combine a
// MymrCharClassValues value with MymrCharClassFlags bits using operator|.
// The first macro argument (int) is presumably the result type of those
// operators -- confirm against the macro's definition.
Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
1738
// Code points that the Myanmar code refers to by name.
enum MymrChar
{
    Mymr_C_NGA           = 0x1004,
    Mymr_C_YA            = 0x101A,
    Mymr_C_RA            = 0x101B,
    Mymr_C_VOWEL_E       = 0x1031,
    Mymr_C_VIRAMA        = 0x1039,
    Mymr_C_SIGN_ZWNJ     = 0x200C,
    Mymr_C_SIGN_ZWJ      = 0x200D,
    Mymr_C_DOTTED_CIRCLE = 0x25CC
};
1751
1752	enum
1753	{
1754	Mymr_xx = Mymr_CC_RESERVED,
1755	Mymr_c1 = Mymr_CC_CONSONANT \| Mymr_CF_CONSONANT \| Mymr_CF_POS_BELOW,
1756	Mymr_c2 = Mymr_CC_CONSONANT2 \| Mymr_CF_CONSONANT,
1757	Mymr_ng = Mymr_CC_NGA \| Mymr_CF_CONSONANT \| Mymr_CF_POS_ABOVE,
1758	Mymr_ya = Mymr_CC_YA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_AFTER \| Mymr_CF_AFTER_KINZI,
1759	Mymr_ra = Mymr_CC_RA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BEFORE,
1760	Mymr_wa = Mymr_CC_WA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BELOW,
1761	Mymr_ha = Mymr_CC_HA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BELOW,
1762	Mymr_id = Mymr_CC_IND_VOWEL \| Mymr_CF_IND_VOWEL,
1763	Mymr_vi = Mymr_CC_VIRAMA \| Mymr_CF_VIRAMA \| Mymr_CF_POS_ABOVE \| Mymr_CF_DOTTED_CIRCLE,
1764	Mymr_dl = Mymr_CC_PRE_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_BEFORE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1765	Mymr_db = Mymr_CC_BELOW_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_BELOW \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1766	Mymr_da = Mymr_CC_ABOVE_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_ABOVE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1767	Mymr_dr = Mymr_CC_POST_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_AFTER \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
1768	Mymr_sa = Mymr_CC_SIGN_ABOVE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_POS_ABOVE \| Mymr_CF_AFTER_KINZI,
1769	Mymr_sb = Mymr_CC_SIGN_BELOW \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_POS_BELOW \| Mymr_CF_AFTER_KINZI,
1770	Mymr_sp = Mymr_CC_SIGN_AFTER \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI
1771	};
1772
1773
// A Myanmar character class: a MymrCharClassValues value OR-ed with
// MymrCharClassFlags bits.
using MymrCharClass = int;
1775
1776
1777	static const MymrCharClass mymrCharClasses[] =
1778	{
1779	Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
1780	Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, / 1000 - 100F /
1781	Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
1782	Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, / 1010 - 101F /
1783	Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
1784	Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, / 1020 - 102F /
1785	Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
1786	Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1030 - 103F /
1787	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
1788	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1040 - 104F /
1789	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
1790	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1050 - 105F /
1791	};
1792
1793	static MymrCharClass
1794	getMyanmarCharClass (ushort ch)
1795	{
1796	if (ch == Mymr_C_SIGN_ZWJ)
1797	return Mymr_CC_ZERO_WIDTH_J_MARK;
1798
1799	if (ch == Mymr_C_SIGN_ZWNJ)
1800	return Mymr_CC_ZERO_WIDTH_NJ_MARK;
1801
1802	if (ch < `0x1000` \|\| ch > `0x105f`)
1803	return Mymr_CC_RESERVED;
1804
1805	return mymrCharClasses[ch - `0x1000`];
1806	}
1807
1808	static const signed char mymrStateTable[][Mymr_CC_COUNT] =
1809	{
1810	/ xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj /
1811	{ `1`, `4`, `4`, `2`, `4`, `4`, `4`, `4`, `24`, `1`, `27`, `17`, `18`, `19`, `20`, `21`, `1`, `1`, `4`}, / 0 - ground state /
1812	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 1 - exit state (or sp to the right of the syllable) /
1813	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `3`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, `4`}, / 2 - NGA /
1814	{-`1`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 3 - Virama after NGA /
1815	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `5`, `17`, `18`, `19`, `20`, `21`, `1`, `1`, -`1`}, / 4 - Base consonant /
1816	{-`2`, `6`, -`2`, -`2`, `7`, `8`, `9`, `10`, -`2`, `23`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 5 - First virama /
1817	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `25`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 6 - c1 after virama /
1818	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 7 - ya after virama /
1819	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 8 - ra after virama /
1820	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 9 - wa after virama /
1821	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 10 - ha after virama /
1822	{-`1`, -`1`, -`1`, -`1`, `7`, `8`, `9`, `10`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 11 - Virama after NGA+zwj /
1823	{-`2`, -`2`, -`2`, -`2`, -`2`, -`2`, `13`, `14`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 12 - Second virama /
1824	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `15`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 13 - wa after virama /
1825	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 14 - ha after virama /
1826	{-`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, `16`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 15 - Third virama /
1827	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 16 - ha after virama /
1828	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `20`, `21`, `1`, `1`, -`1`}, / 17 - dl, Dependent vowel e /
1829	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `19`, -`1`, `21`, `1`, `1`, -`1`}, / 18 - db, Dependent vowel u,uu /
1830	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, `1`, -`1`}, / 19 - da, Dependent vowel i,ii,ai /
1831	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `22`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 20 - dr, Dependent vowel aa /
1832	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 21 - sa, Sign anusvara /
1833	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 22 - atha /
1834	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 23 - zwnj for atha /
1835	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, -`1`}, / 24 - Independent vowel /
1836	{-`2`, -`2`, -`2`, -`2`, `26`, `26`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 25 - Virama after subscript consonant /
1837	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, `1`, -`1`}, / 26 - ra/ya after subscript consonant + virama /
1838	{-`1`, `6`, -`1`, -`1`, `7`, `8`, `9`, `10`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 27 - Virama after ground state /
1839	/ exit state -2 is for invalid order of medials and combination of invalids*
1840	with virama where virama should treat as start of next syllable
1841	*/
1842	};
1843
// Define MYANMAR_DEBUG to trace the Myanmar state machine through qDebug.
// Otherwise MMDEBUG(...) becomes a never-taken "if (0) printf(...)", so its
// arguments are still compiled but never evaluated.
/* #define MYANMAR_DEBUG */
#ifdef MYANMAR_DEBUG
#  define MMDEBUG qDebug
#else
#  define MMDEBUG if (0) printf
#endif
1852
1853	/*
1854	// Given an input string of characters and a location in which to start looking
1855	// calculate, using the state table, which one is the last character of the syllable
1856	// that starts in the starting position.
1857	*/
1858	static qsizetype myanmar_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1859	{
1860	const char16_t *uc = s + start;
1861	int state = `0`;
1862	qsizetype pos = start;
1863	invalid = false*;
1864
1865	while (pos < end) {
1866	MymrCharClass charClass = getMyanmarCharClass(ch: *uc);
1867	state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
1868	if (pos == start)
1869	invalid = (bool*)(charClass & Mymr_CF_DOTTED_CIRCLE);
1870
1871	MMDEBUG(format: "state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
1872
1873	if (state < `0`) {
1874	if (state < -`1`)
1875	--pos;
1876	break;
1877	}
1878	++uc;
1879	++pos;
1880	}
1881	return pos;
1882	}
1883
1884	static void myanmarAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1885	{
1886	qsizetype end = from + len;
1887	qsizetype i = `0`;
1888	Q_UNUSED(script);
1889	attributes += from;
1890	while (i < len) {
1891	bool invalid;
1892	qsizetype boundary = myanmar_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
1893
1894	attributes[i].graphemeBoundary = true;
1895	attributes[i].lineBreak = true;
1896
1897	if (boundary > len-`1`)
1898	boundary = len;
1899	i++;
1900	while (i < boundary) {
1901	attributes[i].graphemeBoundary = false;
1902	++i;
1903	}
1904	assert(i == boundary);
1905	}
1906	}
1907
1908	/*
1909	// Vocabulary
1910	// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1911	// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1912	// split vowels, signs... but there is only one base in a syllable, it has to be coded as
1913	// the first character of the syllable.
1914	// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1915	// Khmer language has five of them. Khmer split vowels either have one part before the
1916	// base and one after the base or they have a part before the base and a part above the base.
1917	// The first part of all Khmer split vowels is the same character, identical to
1918	// the glyph of Khmer dependent vowel SRA EI
1919	// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1920	// Differently than indian languages, the coeng modifies the consonant that follows it,
1921	// not the one preceding it Each consonant has two forms, the base form and the subscript form
1922	// the base form is the normal one (using the consonants code-point), the subscript form is
1923	// displayed when the combination coeng + consonant is encountered.
//    Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant
//    Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO)
//    Consonant of type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
//    Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
//                         if it is attached to a consonant of the first series or a consonant of the second series.
//                         Most consonants have an equivalent in the other series, but some of them exist only in
//                         one series (for example SA). If we want to use the consonant SA with a vowel sound that
//                         is only available with a consonant of the other series, then we need to use a
//                         consonant shifter: TRIISAP or MUSIKATOAN
//                         (x17C9 and x17CA). TRIISAP changes a first series consonant to second series sound and
//                         MUSIKATOAN a second series consonant to have a first series vowel sound.
//                         Consonant shifters are both normally superscript marks, but, when they are followed by a
//                         superscript, they change shape and take the form of subscript dependent vowel SRA U.
//                         If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
//                         should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
//                         be placed after the coeng consonant.
1940	// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
1941	// Each vowel has its own position. Only one vowel per syllable is allowed.
1942	// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
1943	// Allowed in a syllable.
1944	//
1945	//
1946	// order is important here! This order must be the same that is found in each horizontal
1947	// line in the statetable for Khmer (see khmerStateTable) .
1948	*/
// Khmer character classes. The class value (the low bits selected by
// CF_CLASS_MASK) is used as the column index into khmerStateTable, so the
// numbering here has to stay in step with the columns of that table.
enum KhmerCharClassValues {
    CC_RESERVED           =  0,
    CC_CONSONANT          =  1, // consonant of type 1 or independent vowel
    CC_CONSONANT2         =  2, // consonant of type 2
    CC_CONSONANT3         =  3, // consonant of type 3
    CC_ZERO_WIDTH_NJ_MARK =  4, // zero width non-joiner (0x200C)
    CC_CONSONANT_SHIFTER  =  5,
    CC_ROBAT              =  6, // Khmer special diacritic accent, treated differently in the state table
    CC_COENG              =  7, // subscript consonant combining character
    CC_DEPENDENT_VOWEL    =  8,
    CC_SIGN_ABOVE         =  9,
    CC_SIGN_AFTER         = 10,
    CC_ZERO_WIDTH_J_MARK  = 11, // zero width joiner
    CC_COUNT              = 12  // number of character classes
};
1964
1965
// Flag bits that are OR-ed on top of a KhmerCharClassValues value.
enum KhmerCharClassFlags {
    // Selects the class value out of a combined class-plus-flags word.
    CF_CLASS_MASK    = 0x0000FFFF,

    // Position flags.
    CF_POS_AFTER     = 0x00010000,
    CF_POS_ABOVE     = 0x00020000,
    CF_POS_BELOW     = 0x00040000,
    CF_POS_BEFORE    = 0x00080000,
    CF_POS_MASK      = 0x000f0000,

    // Category flags; unless noted, they exist to speed up comparisons.
    CF_CONSONANT     = 0x01000000,
    CF_SPLIT_VOWEL   = 0x02000000, // split vowel: the first part is added in front of the syllable
    CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable
    CF_COENG         = 0x08000000,
    CF_SHIFTER       = 0x10000000,
    CF_ABOVE_VOWEL   = 0x20000000
};
1983
// Needed by the _xx ... _vr definitions below, which combine a
// KhmerCharClassValues value with KhmerCharClassFlags bits using operator|.
// The first macro argument (int) is presumably the result type of those
// operators -- confirm against the macro's definition.
Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
1985
// Code points that the Khmer code refers to by name.
enum KhmerChar {
    C_RO           = 0x179A,
    C_VOWEL_AA     = 0x17B6,
    C_VOWEL_E      = 0x17C1,
    C_SIGN_NIKAHIT = 0x17C6,
    C_COENG        = 0x17D2,
    C_SIGN_ZWNJ    = 0x200C,
    C_SIGN_ZWJ     = 0x200D
};
1996
1997
1998	/*
1999	// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2000	// they are also used to know where a character should be placed (location in reference to the base character)
2001	// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2002	// indicate error in syllable construction
2003	*/
2004	enum {
2005	_xx = CC_RESERVED,
2006	_sa = CC_SIGN_ABOVE \| CF_DOTTED_CIRCLE \| CF_POS_ABOVE,
2007	_sp = CC_SIGN_AFTER \| CF_DOTTED_CIRCLE\| CF_POS_AFTER,
2008	_c1 = CC_CONSONANT \| CF_CONSONANT,
2009	_c2 = CC_CONSONANT2 \| CF_CONSONANT,
2010	_c3 = CC_CONSONANT3 \| CF_CONSONANT,
2011	_rb = CC_ROBAT \| CF_POS_ABOVE \| CF_DOTTED_CIRCLE,
2012	_cs = CC_CONSONANT_SHIFTER \| CF_DOTTED_CIRCLE \| CF_SHIFTER,
2013	_dl = CC_DEPENDENT_VOWEL \| CF_POS_BEFORE \| CF_DOTTED_CIRCLE,
2014	_db = CC_DEPENDENT_VOWEL \| CF_POS_BELOW \| CF_DOTTED_CIRCLE,
2015	_da = CC_DEPENDENT_VOWEL \| CF_POS_ABOVE \| CF_DOTTED_CIRCLE \| CF_ABOVE_VOWEL,
2016	_dr = CC_DEPENDENT_VOWEL \| CF_POS_AFTER \| CF_DOTTED_CIRCLE,
2017	_co = CC_COENG \| CF_COENG \| CF_DOTTED_CIRCLE,
2018
2019	/ split vowel /
2020	_va = _da \| CF_SPLIT_VOWEL,
2021	_vr = _dr \| CF_SPLIT_VOWEL
2022	};
2023
2024
// A Khmer character class: a KhmerCharClassValues value OR-ed with
// KhmerCharClassFlags bits.
using KhmerCharClass = unsigned long;
2030
2031
2032	/*
2033	// Character class tables
2034	// _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs...
2035	// _sa Sign placed above the base
2036	// _sp Sign placed after the base
2037	// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2038	// _c2 Consonant of type 2 (only RO)
2039	// _c3 Consonant of type 3
2040	// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2041	// _cd Consonant-shifter
2042	// _dl Dependent vowel placed before the base (left of the base)
2043	// _db Dependent vowel placed below the base
2044	// _da Dependent vowel placed above the base
2045	// _dr Dependent vowel placed behind the base (right of the base)
2046	// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2047	// it to create a subscript consonant or independent vowel
2048	// _va Khmer split vowel in which the first part is before the base and the second one above the base
2049	// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2050	*/
2051	static const KhmerCharClass khmerCharClasses[] = {
2052	_c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, / 1780 - 178F /
2053	_c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, / 1790 - 179F /
2054	_c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, / 17A0 - 17AF /
2055	_c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, / 17B0 - 17BF /
2056	_vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, / 17C0 - 17CF /
2057	_sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx / 17D0 - 17DF /
2058	};
2059
// First and last code unit covered by khmerCharClasses; must be kept in sync
// with the extent of that table.
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780,
    KhmerLastChar  = 0x17df
};
2065
2066	/*
2067	// Below we define how a character in the input string is either in the khmerCharClasses table
2068	// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2069	// within the syllable, but are not in the table) we also get their type back, or an unknown object
2070	// in which case we get _xx (CC_RESERVED) back
2071	*/
2072	static KhmerCharClass getKhmerCharClass(ushort uc)
2073	{
2074	if (uc == C_SIGN_ZWJ) {
2075	return CC_ZERO_WIDTH_J_MARK;
2076	}
2077
2078	if (uc == C_SIGN_ZWNJ) {
2079	return CC_ZERO_WIDTH_NJ_MARK;
2080	}
2081
2082	if (uc < KhmerFirstChar \|\| uc > KhmerLastChar) {
2083	return CC_RESERVED;
2084	}
2085
2086	return khmerCharClasses[uc - KhmerFirstChar];
2087	}
2088
2089
2090	/*
2091	// The stateTable is used to calculate the end (the length) of a well
2092	// formed Khmer Syllable.
2093	//
2094	// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2095	// CharClassValues. This coincidence of values allows the follow up of the table.
2096	//
// Each line corresponds to a state, which does not necessarily need to be a type
// of component... for example, state 2 is a base, which is always the first character
// in the syllable, but the state could be produced by a consonant of any type when
// it is the first character that is analysed (in ground state).
2101	//
2102	// Differentiating 3 types of consonants is necessary in order to
2103	// forbid the use of certain combinations, such as having a second
2104	// coeng after a coeng RO,
2105	// The inexistent possibility of having a type 3 after another type 3 is permitted,
2106	// eliminating it would very much complicate the table, and it does not create typing
2107	// problems, as the case above.
2108	//
2109	// The table is quite complex, in order to limit the number of coeng consonants
2110	// to 2 (by means of the table).
2111	//
2112	// There a peculiarity, as far as Unicode is concerned:
2113	// - The consonant-shifter is considered in two possible different
2114	// locations, the one considered in Unicode 3.0 and the one considered in
2115	// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2116	//
2117	//
2118	// xx independent character, such as a number, punctuation sign or non-khmer char
2119	//
2120	// c1 Khmer consonant of type 1 or an independent vowel
2121	// that is, a letter in which the subscript for is only under the
2122	// base, not taking any space to the right or to the left
2123	//
2124	// c2 Khmer consonant of type 2, the coeng form takes space under
2125	// and to the left of the base (only RO is of this type)
2126	//
2127	// c3 Khmer consonant of type 3. Its subscript form takes space under
2128	// and to the right of the base.
2129	//
2130	// cs Khmer consonant shifter
2131	//
2132	// rb Khmer robat
2133	//
2134	// co coeng character (u17D2)
2135	//
2136	// dv dependent vowel (including split vowels, they are treated in the same way).
2137	// even if dv is not defined above, the component that is really tested for is
2138	// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2139	//
2140	// zwj Zero Width joiner
2141	//
2142	// zwnj Zero width non joiner
2143	//
2144	// sa above sign
2145	//
2146	// sp post sign
2147	//
2148	// there are lines with equal content but for an easier understanding
2149	// (and maybe change in the future) we did not join them
2150	*/
2151	static const signed char khmerStateTable[][CC_COUNT] =
2152	{
2153	/ xx c1 c2 c3 zwnj cs rb co dv sa sp zwj /
2154	{ `1`, `2`, `2`, `2`, `1`, `1`, `1`, `6`, `1`, `1`, `1`, `2`}, / 0 - ground state /
2155	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 1 - exit state (or sign to the right of the syllable) /
2156	{-`1`, -`1`, -`1`, -`1`, `3`, `4`, `5`, `6`, `16`, `17`, `1`, -`1`}, / 2 - Base consonant /
2157	{-`1`, -`1`, -`1`, -`1`, -`1`, `4`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel /
2158	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, `6`, `16`, `17`, `1`, `14`}, / 4 - First register shifter /
2159	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `20`, -`1`, `1`, -`1`}, / 5 - Robat /
2160	{-`1`, `7`, `8`, `9`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 6 - First Coeng /
2161	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, `10`, `16`, `17`, `1`, `14`}, / 7 - First consonant of type 1 after coeng /
2162	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 8 - First consonant of type 2 after coeng /
2163	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, `10`, `16`, `17`, `1`, `14`}, / 9 - First consonant or type 3 after ceong /
2164	{-`1`, `11`, `11`, `11`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 10 - Second Coeng (no register shifter before) /
2165	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 11 - Second coeng consonant (or ind. vowel) no register shifter before /
2166	{-`1`, -`1`, -`1`, -`1`, -`1`, `13`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 12 - Second ZWNJ before a register shifter /
2167	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 13 - Second register shifter /
2168	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 14 - ZWJ before vowel /
2169	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 15 - ZWNJ before vowel /
2170	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `1`, `18`}, / 16 - dependent vowel /
2171	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `18`}, / 17 - sign above /
2172	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `19`, -`1`, -`1`, -`1`, -`1`}, / 18 - ZWJ after vowel /
2173	{-`1`, `1`, -`1`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 19 - Third coeng /
2174	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, -`1`}, / 20 - dependent vowel after a Robat /
2175	};
2176
2177
// Define KHMER_DEBUG to trace the Khmer state machine through qDebug.
// Otherwise KHDEBUG(...) becomes a never-taken "if (0) printf(...)", so its
// arguments are still compiled but never evaluated.
/* #define KHMER_DEBUG */
#ifdef KHMER_DEBUG
#  define KHDEBUG qDebug
#else
#  define KHDEBUG if (0) printf
#endif
2186
2187	/*
2188	// Given an input string of characters and a location in which to start looking
2189	// calculate, using the state table, which one is the last character of the syllable
2190	// that starts in the starting position.
2191	*/
2192	static qsizetype khmer_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
2193	{
2194	const char16_t *uc = s + start;
2195	int state = `0`;
2196	qsizetype pos = start;
2197	invalid = false*;
2198
2199	while (pos < end) {
2200	KhmerCharClass charClass = getKhmerCharClass(uc: *uc);
2201	if (pos == start) {
2202	*invalid = (charClass > `0`) && ! (charClass & CF_CONSONANT);
2203	}
2204	state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2205
2206	KHDEBUG(format: "state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2207	charClass, *uc );
2208
2209	if (state < `0`) {
2210	break;
2211	}
2212	++uc;
2213	++pos;
2214	}
2215	return pos;
2216	}
2217
2218	static void khmerAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
2219	{
2220	qsizetype end = from + len;
2221	qsizetype i = `0`;
2222	Q_UNUSED(script);
2223	attributes += from;
2224	while ( i < len ) {
2225	bool invalid;
2226	qsizetype boundary = khmer_nextSyllableBoundary( s: text, start: from+i, end, invalid: &invalid ) - from;
2227
2228	attributes[i].graphemeBoundary = true;
2229
2230	if ( boundary > len-`1` ) boundary = len;
2231	i++;
2232	while ( i < boundary ) {
2233	attributes[i].graphemeBoundary = false;
2234	++i;
2235	}
2236	assert( i == boundary );
2237	}
2238	}
2239
2240
2241	const CharAttributeFunction charAttributeFunction[] = {
2242	// Script_Unknown,
2243	nullptr,
2244	// Script_Inherited,
2245	nullptr,
2246	// Script_Common,
2247	nullptr,
2248	// Script_Latin,
2249	nullptr,
2250	// Script_Greek,
2251	nullptr,
2252	// Script_Cyrillic,
2253	nullptr,
2254	// Script_Armenian,
2255	nullptr,
2256	// Script_Hebrew,
2257	nullptr,
2258	// Script_Arabic,
2259	nullptr,
2260	// Script_Syriac,
2261	nullptr,
2262	// Script_Thaana,
2263	nullptr,
2264	// Script_Devanagari,
2265	indicAttributes,
2266	// Script_Bengali,
2267	indicAttributes,
2268	// Script_Gurmukhi,
2269	indicAttributes,
2270	// Script_Gujarati,
2271	indicAttributes,
2272	// Script_Oriya,
2273	indicAttributes,
2274	// Script_Tamil,
2275	indicAttributes,
2276	// Script_Telugu,
2277	indicAttributes,
2278	// Script_Kannada,
2279	indicAttributes,
2280	// Script_Malayalam,
2281	indicAttributes,
2282	// Script_Sinhala,
2283	indicAttributes,
2284	// Script_Thai,
2285	thaiAttributes,
2286	// Script_Lao,
2287	nullptr,
2288	// Script_Tibetan,
2289	tibetanAttributes,
2290	// Script_Myanmar,
2291	myanmarAttributes,
2292	// Script_Georgian,
2293	nullptr,
2294	// Script_Hangul,
2295	nullptr,
2296	// Script_Ethiopic,
2297	nullptr,
2298	// Script_Cherokee,
2299	nullptr,
2300	// Script_CanadianAboriginal,
2301	nullptr,
2302	// Script_Ogham,
2303	nullptr,
2304	// Script_Runic,
2305	nullptr,
2306	// Script_Khmer,
2307	khmerAttributes
2308	};
2309
2310	static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2311	const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2312	QCharAttributes *attributes)
2313	{
2314	if (stringLength == `0`)
2315	return;
2316	for (qsizetype i = `0`; i < numItems; ++i) {
2317	QChar::Script script = items[i].script;
2318	if (script > QChar::Script_Khmer)
2319	script = QChar::Script_Common;
2320	CharAttributeFunction attributeFunction = charAttributeFunction[script];
2321	if (!attributeFunction)
2322	continue;
2323	qsizetype end = i < numItems - `1` ? items[i + `1`].position : stringLength;
2324	attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2325	}
2326	}
2327
2328	}
2329
2330	Q_CORE_EXPORT void initCharAttributes(QStringView string,
2331	const ScriptItem *items, qsizetype numItems,
2332	QCharAttributes *attributes, CharAttributeOptions options)
2333	{
2334	if (string.size() <= `0`)
2335	return;
2336
2337	if (!(options & DontClearAttributes))
2338	::memset(s: attributes, c: `0`, n: (string.size() + `1`) * sizeof(QCharAttributes));
2339
2340	if (options & GraphemeBreaks)
2341	getGraphemeBreaks(string: string.utf16(), len: string.size(), attributes);
2342	if (options & WordBreaks)
2343	getWordBreaks(string: string.utf16(), len: string.size(), attributes);
2344	if (options & SentenceBreaks)
2345	getSentenceBreaks(string: string.utf16(), len: string.size(), attributes);
2346	if (options & LineBreaks)
2347	getLineBreaks(string: string.utf16(), len: string.size(), attributes, options);
2348	if (options & WhiteSpaces)
2349	getWhiteSpaces(string: string.utf16(), len: string.size(), attributes);
2350
2351	if (!qt_initcharattributes_default_algorithm_only) {
2352	if (!items \|\| numItems <= `0`)
2353	return;
2354
2355	Tailored::getCharAttributes(string: string.utf16(), stringLength: string.size(), items, numItems, attributes);
2356	}
2357	}
2358
2359
2360	// ----------------------------------------------------------------------------
2361	//
2362	// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2363	//
2364	// ----------------------------------------------------------------------------
2365
2366	Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2367	{
2368	qsizetype sor = `0`;
2369	qsizetype eor = `0`;
2370	QChar::Script script = QChar::Script_Common;
2371
2372	for (qsizetype i = `0`; i < string.size(); ++i, eor = i) {
2373	char32_t ucs4 = string [i].unicode();
2374	if (QChar::isHighSurrogate(ucs4) && i + `1` < string.size()) {
2375	ushort low = string [i + `1`].unicode();
2376	if (QChar::isLowSurrogate(ucs4: low)) {
2377	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
2378	++i;
2379	}
2380	}
2381
2382	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2383
2384	QChar::Script nscript = QChar::Script(prop->script);
2385
2386	if (Q_LIKELY(nscript == script \|\| nscript <= QChar::Script_Common))
2387	continue;
2388
2389	// inherit preceding Common-s
2390	if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2391	// also covers a case where the base character of Common script followed
2392	// by one or more combining marks of non-Inherited, non-Common script
2393	script = nscript;
2394	continue;
2395	}
2396
2397	// Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2398	// Thus, a combining mark - whatever its script property value is - should inherit
2399	// the script property value of its base character.
2400	static const int test = (FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining) \| FLAG(QChar::Mark_Enclosing));
2401	if (Q_UNLIKELY(FLAG(prop->category) & test))
2402	continue;
2403
2404	Q_ASSERT(script > QChar::Script_Common);
2405	Q_ASSERT(sor < eor);
2406	scripts->append(t: ScriptItem{.position: sor, .script: script});
2407	sor = eor;
2408
2409	script = nscript;
2410	}
2411
2412	Q_ASSERT(script >= QChar::Script_Common);
2413	Q_ASSERT(eor == string.size());
2414	scripts->append(t: ScriptItem{.position: sor, .script: script});
2415	}
2416
2417	} // namespace QUnicodeTools
2418
2419	QT_END_NAMESPACE
2420

source code of qtbase/src/corelib/text/qunicodetools.cpp