1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3#include <QtCore/qtextboundaryfinder.h>
4#include <QtCore/qvarlengtharray.h>
5
6#include <private/qunicodetools_p.h>
7
8QT_BEGIN_NAMESPACE
9
10static void init(QTextBoundaryFinder::BoundaryType type, QStringView str, QCharAttributes *attributes)
11{
12 QUnicodeTools::ScriptItemArray scriptItems;
13 QUnicodeTools::initScripts(str, scripts: &scriptItems);
14
15 QUnicodeTools::CharAttributeOptions options;
16 switch (type) {
17 case QTextBoundaryFinder::Grapheme: options |= QUnicodeTools::GraphemeBreaks; break;
18 case QTextBoundaryFinder::Word: options |= QUnicodeTools::WordBreaks; break;
19 case QTextBoundaryFinder::Sentence: options |= QUnicodeTools::SentenceBreaks; break;
20 case QTextBoundaryFinder::Line: options |= QUnicodeTools::LineBreaks; break;
21 default: break;
22 }
23 QUnicodeTools::initCharAttributes(str, items: scriptItems.data(), numItems: scriptItems.size(), attributes, options);
24}
25
26/*!
27 \class QTextBoundaryFinder
28 \inmodule QtCore
29
30 \brief The QTextBoundaryFinder class provides a way of finding Unicode text boundaries in a string.
31
32 \since 4.4
33 \ingroup tools
34 \ingroup shared
35 \ingroup string-processing
36 \reentrant
37
    QTextBoundaryFinder finds Unicode text boundaries in a
    string, according to the Unicode text boundary specification (see
    \l{https://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
    \l{https://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
42
43 QTextBoundaryFinder can operate on a QString in four possible
44 modes depending on the value of \a BoundaryType.
45
    Units of Unicode characters that make up what the user thinks of
    as a character or basic unit of the language are here called
    grapheme clusters. For example, the two Unicode characters 'A' +
    diaeresis form one grapheme cluster: the user thinks of them
    as one character, yet it is represented by two
    Unicode code points
    (see \l{https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
53
54 Word boundaries are there to locate the start and end of what a
55 language considers to be a word
56 (see \l{https://www.unicode.org/reports/tr29/#Word_Boundaries}).
57
58 Line break boundaries give possible places where a line break
59 might happen and sentence boundaries will show the beginning and
60 end of whole sentences
61 (see \l{https://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
62 \l{https://www.unicode.org/reports/tr14/}).
63
64 The first position in a string is always a valid boundary and
65 refers to the position before the first character. The last
66 position at the length of the string is also valid and refers
67 to the position after the last character.
68*/
69
70/*!
71 \enum QTextBoundaryFinder::BoundaryType
72
    \value Grapheme Finds a grapheme which is the smallest boundary. It
    includes letters, punctuation marks, numerals and more.
75 \value Word Finds a word.
76 \value Line Finds possible positions for breaking the text into multiple
77 lines.
78 \value Sentence Finds sentence boundaries. These include periods, question
79 marks etc.
80*/
81
82/*!
83 \enum QTextBoundaryFinder::BoundaryReason
84
85 \value NotAtBoundary The boundary finder is not at a boundary position.
86 \value BreakOpportunity The boundary finder is at a break opportunity position.
87 Such a break opportunity might also be an item boundary
88 (either StartOfItem, EndOfItem, or combination of both),
89 a mandatory line break, or a soft hyphen.
90 \value [since 5.0] StartOfItem The boundary finder is at the start of
91 a grapheme, a word, a sentence, or a line.
92 \value [since 5.0] EndOfItem The boundary finder is at the end of
93 a grapheme, a word, a sentence, or a line.
94 \value [since 5.0] MandatoryBreak The boundary finder is at the end of line
95 (can occur for a Line boundary type only).
96 \value SoftHyphen The boundary finder is at the soft hyphen
97 (can occur for a Line boundary type only).
98*/
99
100/*!
101 Constructs an invalid QTextBoundaryFinder object.
102*/
103QTextBoundaryFinder::QTextBoundaryFinder()
104 : freeBuffer(true)
105{
106}
107
108/*!
109 Copies the QTextBoundaryFinder object, \a other.
110*/
111QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other)
112 : t(other.t)
113 , s(other.s)
114 , sv(other.sv)
115 , pos(other.pos)
116 , freeBuffer(true)
117{
118 if (other.attributes) {
119 Q_ASSERT(sv.size() > 0);
120 attributes = (QCharAttributes *) malloc(size: (sv.size() + 1) * sizeof(QCharAttributes));
121 Q_CHECK_PTR(attributes);
122 memcpy(dest: attributes, src: other.attributes, n: (sv.size() + 1) * sizeof(QCharAttributes));
123 }
124}
125
126/*!
    Assigns \a other to this QTextBoundaryFinder object and returns a reference to it.
128*/
129QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &other)
130{
131 if (&other == this)
132 return *this;
133
134 if (other.attributes) {
135 Q_ASSERT(other.sv.size() > 0);
136 size_t newCapacity = (size_t(other.sv.size()) + 1) * sizeof(QCharAttributes);
137 QCharAttributes *newD = (QCharAttributes *) realloc(ptr: freeBuffer ? attributes : nullptr, size: newCapacity);
138 Q_CHECK_PTR(newD);
139 freeBuffer = true;
140 attributes = newD;
141 }
142
143 t = other.t;
144 s = other.s;
145 sv = other.sv;
146 pos = other.pos;
147
148 if (other.attributes) {
149 memcpy(dest: attributes, src: other.attributes, n: (sv.size() + 1) * sizeof(QCharAttributes));
150 } else {
151 if (freeBuffer)
152 free(ptr: attributes);
153 attributes = nullptr;
154 }
155
156 return *this;
157}
158
159/*!
160 Destructs the QTextBoundaryFinder object.
161*/
162QTextBoundaryFinder::~QTextBoundaryFinder()
163{
164 Q_UNUSED(unused);
165 if (freeBuffer)
166 free(ptr: attributes);
167}
168
169/*!
170 Creates a QTextBoundaryFinder object of \a type operating on \a string.
171*/
172QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &string)
173 : t(type)
174 , s(string)
175 , sv(s)
176 , pos(0)
177 , freeBuffer(true)
178 , attributes(nullptr)
179{
180 if (sv.size() > 0) {
181 attributes = (QCharAttributes *) malloc(size: (sv.size() + 1) * sizeof(QCharAttributes));
182 Q_CHECK_PTR(attributes);
183 init(type: t, str: sv, attributes);
184 }
185}
186
187/*!
188 \fn QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars, qsizetype length, unsigned char *buffer, qsizetype bufferSize)
189 \overload
190
191 The same as QTextBoundaryFinder(type, QStringView(chars, length), buffer, bufferSize).
192*/
193
194/*!
195 Creates a QTextBoundaryFinder object of \a type operating on \a string.
196 \since 6.0
197
    \a buffer is an optional working buffer of size \a bufferSize you can pass to
    the QTextBoundaryFinder. If the buffer is large enough to hold the working
    data required (\a bufferSize / sizeof(QCharAttributes) >= string.size() + 1),
    it will be used instead of allocating a new buffer.
202
203 \warning QTextBoundaryFinder does not create a copy of \a string. It is the
204 application programmer's responsibility to ensure the array is allocated for
205 as long as the QTextBoundaryFinder object stays alive. The same applies to
206 \a buffer.
207*/
208QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, QStringView string, unsigned char *buffer, qsizetype bufferSize)
209 : t(type)
210 , sv(string)
211 , pos(0)
212 , freeBuffer(true)
213 , attributes(nullptr)
214{
215 if (!sv.isEmpty()) {
216 if (buffer && bufferSize / int(sizeof(QCharAttributes)) >= sv.size() + 1) {
217 attributes = reinterpret_cast<QCharAttributes *>(buffer);
218 freeBuffer = false;
219 } else {
220 attributes = (QCharAttributes *) malloc(size: (sv.size() + 1) * sizeof(QCharAttributes));
221 Q_CHECK_PTR(attributes);
222 }
223 init(type: t, str: sv, attributes);
224 }
225}
226
227/*!
228 Moves the finder to the start of the string. This is equivalent to setPosition(0).
229
230 \sa setPosition(), position()
231*/
232void QTextBoundaryFinder::toStart()
233{
234 pos = 0;
235}
236
237/*!
238 Moves the finder to the end of the string. This is equivalent to setPosition(string.length()).
239
240 \sa setPosition(), position()
241*/
242void QTextBoundaryFinder::toEnd()
243{
244 pos = sv.size();
245}
246
247/*!
248 Returns the current position of the QTextBoundaryFinder.
249
250 The range is from 0 (the beginning of the string) to the length of
251 the string inclusive.
252
253 \sa setPosition()
254*/
255qsizetype QTextBoundaryFinder::position() const
256{
257 return pos;
258}
259
260/*!
261 Sets the current position of the QTextBoundaryFinder to \a position.
262
263 If \a position is out of bounds, it will be bound to only valid
264 positions. In this case, valid positions are from 0 to the length of
265 the string inclusive.
266
267 \sa position()
268*/
269void QTextBoundaryFinder::setPosition(qsizetype position)
270{
271 pos = qBound(min: 0, val: position, max: sv.size());
272}
273
274/*! \fn QTextBoundaryFinder::BoundaryType QTextBoundaryFinder::type() const
275
276 Returns the type of the QTextBoundaryFinder.
277*/
278
279/*! \fn bool QTextBoundaryFinder::isValid() const
280
281 Returns \c true if the text boundary finder is valid; otherwise returns \c false.
282 A default QTextBoundaryFinder is invalid.
283*/
284
285/*!
286 Returns the string the QTextBoundaryFinder object operates on.
287*/
288QString QTextBoundaryFinder::string() const
289{
290 if (sv.data() == s.unicode() && sv.size() == s.size())
291 return s;
292 return sv.toString();
293}
294
295
296/*!
297 Moves the QTextBoundaryFinder to the next boundary position and returns that position.
298
299 Returns -1 if there is no next boundary.
300*/
301qsizetype QTextBoundaryFinder::toNextBoundary()
302{
303 if (!attributes || pos < 0 || pos >= sv.size()) {
304 pos = -1;
305 return pos;
306 }
307
308 ++pos;
309 switch(t) {
310 case Grapheme:
311 while (pos < sv.size() && !attributes[pos].graphemeBoundary)
312 ++pos;
313 break;
314 case Word:
315 while (pos < sv.size() && !attributes[pos].wordBreak)
316 ++pos;
317 break;
318 case Sentence:
319 while (pos < sv.size() && !attributes[pos].sentenceBoundary)
320 ++pos;
321 break;
322 case Line:
323 while (pos < sv.size() && !attributes[pos].lineBreak)
324 ++pos;
325 break;
326 }
327
328 return pos;
329}
330
331/*!
332 Moves the QTextBoundaryFinder to the previous boundary position and returns that position.
333
334 Returns -1 if there is no previous boundary.
335*/
336qsizetype QTextBoundaryFinder::toPreviousBoundary()
337{
338 if (!attributes || pos <= 0 || pos > sv.size()) {
339 pos = -1;
340 return pos;
341 }
342
343 --pos;
344 switch(t) {
345 case Grapheme:
346 while (pos > 0 && !attributes[pos].graphemeBoundary)
347 --pos;
348 break;
349 case Word:
350 while (pos > 0 && !attributes[pos].wordBreak)
351 --pos;
352 break;
353 case Sentence:
354 while (pos > 0 && !attributes[pos].sentenceBoundary)
355 --pos;
356 break;
357 case Line:
358 while (pos > 0 && !attributes[pos].lineBreak)
359 --pos;
360 break;
361 }
362
363 return pos;
364}
365
366/*!
367 Returns \c true if the object's position() is currently at a valid text boundary.
368*/
369bool QTextBoundaryFinder::isAtBoundary() const
370{
371 if (!attributes || pos < 0 || pos > sv.size())
372 return false;
373
374 switch(t) {
375 case Grapheme:
376 return attributes[pos].graphemeBoundary;
377 case Word:
378 return attributes[pos].wordBreak;
379 case Sentence:
380 return attributes[pos].sentenceBoundary;
381 case Line:
382 // ### TR#14 LB2 prohibits break at sot
383 return attributes[pos].lineBreak || pos == 0;
384 }
385 return false;
386}
387
388/*!
389 Returns the reasons for the boundary finder to have chosen the current position as a boundary.
390*/
391QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const
392{
393 BoundaryReasons reasons = NotAtBoundary;
394 if (!attributes || pos < 0 || pos > sv.size())
395 return reasons;
396
397 const QCharAttributes attr = attributes[pos];
398 switch (t) {
399 case Grapheme:
400 if (attr.graphemeBoundary) {
401 reasons |= BreakOpportunity | StartOfItem | EndOfItem;
402 if (pos == 0)
403 reasons &= (~EndOfItem);
404 else if (pos == sv.size())
405 reasons &= (~StartOfItem);
406 }
407 break;
408 case Word:
409 if (attr.wordBreak) {
410 reasons |= BreakOpportunity;
411 if (attr.wordStart)
412 reasons |= StartOfItem;
413 if (attr.wordEnd)
414 reasons |= EndOfItem;
415 }
416 break;
417 case Sentence:
418 if (attr.sentenceBoundary) {
419 reasons |= BreakOpportunity | StartOfItem | EndOfItem;
420 if (pos == 0)
421 reasons &= (~EndOfItem);
422 else if (pos == sv.size())
423 reasons &= (~StartOfItem);
424 }
425 break;
426 case Line:
427 // ### TR#14 LB2 prohibits break at sot
428 if (attr.lineBreak || pos == 0) {
429 reasons |= BreakOpportunity;
430 if (attr.mandatoryBreak || pos == 0) {
431 reasons |= MandatoryBreak | StartOfItem | EndOfItem;
432 if (pos == 0)
433 reasons &= (~EndOfItem);
434 else if (pos == sv.size())
435 reasons &= (~StartOfItem);
436 } else if (pos > 0 && sv[pos - 1].unicode() == QChar::SoftHyphen) {
437 reasons |= SoftHyphen;
438 }
439 }
440 break;
441 default:
442 break;
443 }
444
445 return reasons;
446}
447
448QT_END_NAMESPACE
449

// source code of qtbase/src/corelib/text/qtextboundaryfinder.cpp