1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 ********************************************************************
5 * COPYRIGHT:
6 * Copyright (c) 1996-2015, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 ********************************************************************
9 */
10
11#ifndef NORMLZR_H
12#define NORMLZR_H
13
14#include "unicode/utypes.h"
15
16#if U_SHOW_CPLUSPLUS_API
17
18/**
19 * \file
20 * \brief C++ API: Unicode Normalization
21 */
22
23#if !UCONFIG_NO_NORMALIZATION
24
25#include "unicode/chariter.h"
26#include "unicode/normalizer2.h"
27#include "unicode/unistr.h"
28#include "unicode/unorm.h"
29#include "unicode/uobject.h"
30
31U_NAMESPACE_BEGIN
32/**
33 * Old Unicode normalization API.
34 *
35 * This API has been replaced by the Normalizer2 class and is only available
36 * for backward compatibility. This class simply delegates to the Normalizer2 class.
37 * There is one exception: The new API does not provide a replacement for Normalizer::compare().
38 *
39 * The Normalizer class supports the standard normalization forms described in
40 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
41 * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
42 *
43 * The Normalizer class consists of two parts:
44 * - static functions that normalize strings or test if strings are normalized
45 * - a Normalizer object is an iterator that takes any kind of text and
46 * provides iteration over its normalized form
47 *
48 * The Normalizer class is not suitable for subclassing.
49 *
50 * For basic information about normalization forms and details about the C API
51 * please see the documentation in unorm.h.
52 *
53 * The iterator API with the Normalizer constructors and the non-static functions
54 * use a CharacterIterator as input. It is possible to pass a string which
55 * is then internally wrapped in a CharacterIterator.
56 * The input text is not normalized all at once, but incrementally where needed
57 * (providing efficient random access).
58 * This allows to pass in a large text but spend only a small amount of time
59 * normalizing a small part of that text.
60 * However, if the entire text is normalized, then the iterator will be
61 * slower than normalizing the entire text at once and iterating over the result.
62 * A possible use of the Normalizer iterator is also to report an index into the
63 * original text that is close to where the normalized characters come from.
64 *
65 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
66 * The earlier implementation reported the getIndex() inconsistently,
67 * and previous() could not be used after setIndex(), next(), first(), and current().
68 *
69 * Normalizer allows to start normalizing from anywhere in the input text by
70 * calling setIndexOnly(), first(), or last().
71 * Without calling any of these, the iterator will start at the beginning of the text.
72 *
73 * At any time, next() returns the next normalized code point (UChar32),
74 * with post-increment semantics (like CharacterIterator::next32PostInc()).
75 * previous() returns the previous normalized code point (UChar32),
76 * with pre-decrement semantics (like CharacterIterator::previous32()).
77 *
78 * current() returns the current code point
79 * (respectively the one at the newly set index) without moving
80 * the getIndex(). Note that if the text at the current position
81 * needs to be normalized, then these functions will do that.
82 * (This is why current() is not const.)
83 * It is more efficient to call setIndexOnly() instead, which does not
84 * normalize.
85 *
86 * getIndex() always refers to the position in the input text where the normalized
87 * code points are returned from. It does not always change with each returned
88 * code point.
89 * The code point that is returned from any of the functions
90 * corresponds to text at or after getIndex(), according to the
91 * function's iteration semantics (post-increment or pre-decrement).
92 *
93 * next() returns a code point from at or after the getIndex()
94 * from before the next() call. After the next() call, the getIndex()
95 * might have moved to where the next code point will be returned from
96 * (from a next() or current() call).
97 * This is semantically equivalent to array access with array[index++]
98 * (post-increment semantics).
99 *
100 * previous() returns a code point from at or after the getIndex()
101 * from after the previous() call.
102 * This is semantically equivalent to array access with array[--index]
103 * (pre-decrement semantics).
104 *
105 * Internally, the Normalizer iterator normalizes a small piece of text
106 * starting at the getIndex() and ending at a following "safe" index.
 * The normalized result is stored in an internal string buffer, and
108 * the code points are iterated from there.
109 * With multiple iteration calls, this is repeated until the next piece
110 * of text needs to be normalized, and the getIndex() needs to be moved.
111 *
112 * The following "safe" index, the internal buffer, and the secondary
113 * iteration index into that buffer are not exposed on the API.
114 * This also means that it is currently not practical to return to
115 * a particular, arbitrary position in the text because one would need to
116 * know, and be able to set, in addition to the getIndex(), at least also the
117 * current index into the internal buffer.
118 * It is currently only possible to observe when getIndex() changes
119 * (with careful consideration of the iteration semantics),
120 * at which time the internal index will be 0.
121 * For example, if getIndex() is different after next() than before it,
122 * then the internal index is 0 and one can return to this getIndex()
123 * later with setIndexOnly().
124 *
125 * Note: While the setIndex() and getIndex() refer to indices in the
126 * underlying Unicode input text, the next() and previous() methods
127 * iterate through characters in the normalized output.
128 * This means that there is not necessarily a one-to-one correspondence
129 * between characters returned by next() and previous() and the indices
130 * passed to and returned from setIndex() and getIndex().
131 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
132 *
133 * @author Laura Werner, Mark Davis, Markus Scherer
134 * @stable ICU 2.0
135 */
136class U_COMMON_API Normalizer : public UObject {
137public:
138#ifndef U_HIDE_DEPRECATED_API
139 /**
140 * If DONE is returned from an iteration function that returns a code point,
141 * then there are no more normalization results available.
142 * @deprecated ICU 56 Use Normalizer2 instead.
143 */
144 enum {
145 DONE=0xffff
146 };
147
148 // Constructors
149
150 /**
151 * Creates a new <code>Normalizer</code> object for iterating over the
152 * normalized form of a given string.
153 * <p>
154 * @param str The string to be normalized. The normalization
155 * will start at the beginning of the string.
156 *
157 * @param mode The normalization mode.
158 * @deprecated ICU 56 Use Normalizer2 instead.
159 */
160 Normalizer(const UnicodeString& str, UNormalizationMode mode);
161
162 /**
163 * Creates a new <code>Normalizer</code> object for iterating over the
164 * normalized form of a given string.
165 * <p>
166 * @param str The string to be normalized. The normalization
167 * will start at the beginning of the string.
168 *
169 * @param length Length of the string, or -1 if NUL-terminated.
170 * @param mode The normalization mode.
171 * @deprecated ICU 56 Use Normalizer2 instead.
172 */
173 Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode);
174
175 /**
176 * Creates a new <code>Normalizer</code> object for iterating over the
177 * normalized form of the given text.
178 * <p>
179 * @param iter The input text to be normalized. The normalization
180 * will start at the beginning of the string.
181 *
182 * @param mode The normalization mode.
183 * @deprecated ICU 56 Use Normalizer2 instead.
184 */
185 Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
186#endif /* U_HIDE_DEPRECATED_API */
187
188#ifndef U_FORCE_HIDE_DEPRECATED_API
189 /**
190 * Copy constructor.
191 * @param copy The object to be copied.
192 * @deprecated ICU 56 Use Normalizer2 instead.
193 */
194 Normalizer(const Normalizer& copy);
195
196 /**
197 * Destructor
198 * @deprecated ICU 56 Use Normalizer2 instead.
199 */
200 virtual ~Normalizer();
201#endif // U_FORCE_HIDE_DEPRECATED_API
202
203 //-------------------------------------------------------------------------
204 // Static utility methods
205 //-------------------------------------------------------------------------
206
207#ifndef U_HIDE_DEPRECATED_API
208 /**
209 * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
210 * This is a wrapper for unorm_normalize(), using UnicodeString's.
211 *
212 * The <code>options</code> parameter specifies which optional
213 * <code>Normalizer</code> features are to be enabled for this operation.
214 *
215 * @param source the input string to be normalized.
216 * @param mode the normalization mode
217 * @param options the optional features to be enabled (0 for no options)
218 * @param result The normalized string (on output).
219 * @param status The error code.
220 * @deprecated ICU 56 Use Normalizer2 instead.
221 */
222 static void U_EXPORT2 normalize(const UnicodeString& source,
223 UNormalizationMode mode, int32_t options,
224 UnicodeString& result,
225 UErrorCode &status);
226
227 /**
228 * Compose a <code>UnicodeString</code>.
229 * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
230 * This is a wrapper for unorm_normalize(), using UnicodeString's.
231 *
232 * The <code>options</code> parameter specifies which optional
233 * <code>Normalizer</code> features are to be enabled for this operation.
234 *
235 * @param source the string to be composed.
236 * @param compat Perform compatibility decomposition before composition.
237 * If this argument is <code>false</code>, only canonical
238 * decomposition will be performed.
239 * @param options the optional features to be enabled (0 for no options)
240 * @param result The composed string (on output).
241 * @param status The error code.
242 * @deprecated ICU 56 Use Normalizer2 instead.
243 */
244 static void U_EXPORT2 compose(const UnicodeString& source,
245 UBool compat, int32_t options,
246 UnicodeString& result,
247 UErrorCode &status);
248
249 /**
250 * Static method to decompose a <code>UnicodeString</code>.
251 * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
252 * This is a wrapper for unorm_normalize(), using UnicodeString's.
253 *
254 * The <code>options</code> parameter specifies which optional
255 * <code>Normalizer</code> features are to be enabled for this operation.
256 *
257 * @param source the string to be decomposed.
258 * @param compat Perform compatibility decomposition.
259 * If this argument is <code>false</code>, only canonical
260 * decomposition will be performed.
261 * @param options the optional features to be enabled (0 for no options)
262 * @param result The decomposed string (on output).
263 * @param status The error code.
264 * @deprecated ICU 56 Use Normalizer2 instead.
265 */
266 static void U_EXPORT2 decompose(const UnicodeString& source,
267 UBool compat, int32_t options,
268 UnicodeString& result,
269 UErrorCode &status);
270
271 /**
272 * Performing quick check on a string, to quickly determine if the string is
273 * in a particular normalization format.
274 * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
275 *
276 * Three types of result can be returned UNORM_YES, UNORM_NO or
277 * UNORM_MAYBE. Result UNORM_YES indicates that the argument
278 * string is in the desired normalized format, UNORM_NO determines that
279 * argument string is not in the desired normalized format. A
280 * UNORM_MAYBE result indicates that a more thorough check is required,
281 * the user may have to put the string in its normalized form and compare the
282 * results.
283 * @param source string for determining if it is in a normalized format
284 * @param mode normalization format
285 * @param status A reference to a UErrorCode to receive any errors
286 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
287 *
288 * @see isNormalized
289 * @deprecated ICU 56 Use Normalizer2 instead.
290 */
291 static inline UNormalizationCheckResult
292 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
293
294 /**
295 * Performing quick check on a string; same as the other version of quickCheck
296 * but takes an extra options parameter like most normalization functions.
297 *
298 * @param source string for determining if it is in a normalized format
299 * @param mode normalization format
300 * @param options the optional features to be enabled (0 for no options)
301 * @param status A reference to a UErrorCode to receive any errors
302 * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
303 *
304 * @see isNormalized
305 * @deprecated ICU 56 Use Normalizer2 instead.
306 */
307 static UNormalizationCheckResult
308 quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
309
310 /**
311 * Test if a string is in a given normalization form.
312 * This is semantically equivalent to source.equals(normalize(source, mode)) .
313 *
314 * Unlike unorm_quickCheck(), this function returns a definitive result,
315 * never a "maybe".
316 * For NFD, NFKD, and FCD, both functions work exactly the same.
317 * For NFC and NFKC where quickCheck may return "maybe", this function will
318 * perform further tests to arrive at a true/false result.
319 *
320 * @param src String that is to be tested if it is in a normalization format.
321 * @param mode Which normalization form to test for.
322 * @param errorCode ICU error code in/out parameter.
323 * Must fulfill U_SUCCESS before the function call.
324 * @return Boolean value indicating whether the source string is in the
325 * "mode" normalization form.
326 *
327 * @see quickCheck
328 * @deprecated ICU 56 Use Normalizer2 instead.
329 */
330 static inline UBool
331 isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
332
333 /**
334 * Test if a string is in a given normalization form; same as the other version of isNormalized
335 * but takes an extra options parameter like most normalization functions.
336 *
337 * @param src String that is to be tested if it is in a normalization format.
338 * @param mode Which normalization form to test for.
339 * @param options the optional features to be enabled (0 for no options)
340 * @param errorCode ICU error code in/out parameter.
341 * Must fulfill U_SUCCESS before the function call.
342 * @return Boolean value indicating whether the source string is in the
343 * "mode" normalization form.
344 *
345 * @see quickCheck
346 * @deprecated ICU 56 Use Normalizer2 instead.
347 */
348 static UBool
349 isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
350
351 /**
352 * Concatenate normalized strings, making sure that the result is normalized as well.
353 *
354 * If both the left and the right strings are in
355 * the normalization form according to "mode/options",
356 * then the result will be
357 *
358 * \code
359 * dest=normalize(left+right, mode, options)
360 * \endcode
361 *
362 * For details see unorm_concatenate in unorm.h.
363 *
364 * @param left Left source string.
365 * @param right Right source string.
366 * @param result The output string.
367 * @param mode The normalization mode.
368 * @param options A bit set of normalization options.
369 * @param errorCode ICU error code in/out parameter.
370 * Must fulfill U_SUCCESS before the function call.
371 * @return result
372 *
373 * @see unorm_concatenate
374 * @see normalize
375 * @see unorm_next
376 * @see unorm_previous
377 *
378 * @deprecated ICU 56 Use Normalizer2 instead.
379 */
380 static UnicodeString &
381 U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right,
382 UnicodeString &result,
383 UNormalizationMode mode, int32_t options,
384 UErrorCode &errorCode);
385#endif /* U_HIDE_DEPRECATED_API */
386
387 /**
388 * Compare two strings for canonical equivalence.
389 * Further options include case-insensitive comparison and
390 * code point order (as opposed to code unit order).
391 *
392 * Canonical equivalence between two strings is defined as their normalized
393 * forms (NFD or NFC) being identical.
394 * This function compares strings incrementally instead of normalizing
395 * (and optionally case-folding) both strings entirely,
396 * improving performance significantly.
397 *
398 * Bulk normalization is only necessary if the strings do not fulfill the FCD
399 * conditions. Only in this case, and only if the strings are relatively long,
400 * is memory allocated temporarily.
401 * For FCD strings and short non-FCD strings there is no memory allocation.
402 *
403 * Semantically, this is equivalent to
404 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
405 * where code point order and foldCase are all optional.
406 *
407 * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
408 * the case folding must be performed first, then the normalization.
409 *
410 * @param s1 First source string.
411 * @param s2 Second source string.
412 *
413 * @param options A bit set of options:
414 * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
415 * Case-sensitive comparison in code unit order, and the input strings
416 * are quick-checked for FCD.
417 *
418 * - UNORM_INPUT_IS_FCD
419 * Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
420 * If not set, the function will quickCheck for FCD
421 * and normalize if necessary.
422 *
423 * - U_COMPARE_CODE_POINT_ORDER
424 * Set to choose code point order instead of code unit order
425 * (see u_strCompare for details).
426 *
427 * - U_COMPARE_IGNORE_CASE
428 * Set to compare strings case-insensitively using case folding,
429 * instead of case-sensitively.
430 * If set, then the following case folding options are used.
431 *
432 * - Options as used with case-insensitive comparisons, currently:
433 *
434 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
435 * (see u_strCaseCompare for details)
436 *
437 * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
438 *
439 * @param errorCode ICU error code in/out parameter.
440 * Must fulfill U_SUCCESS before the function call.
441 * @return <0 or 0 or >0 as usual for string comparisons
442 *
443 * @see unorm_compare
444 * @see normalize
445 * @see UNORM_FCD
446 * @see u_strCompare
447 * @see u_strCaseCompare
448 *
449 * @stable ICU 2.2
450 */
451 static inline int32_t
452 compare(const UnicodeString &s1, const UnicodeString &s2,
453 uint32_t options,
454 UErrorCode &errorCode);
455
456#ifndef U_HIDE_DEPRECATED_API
457 //-------------------------------------------------------------------------
458 // Iteration API
459 //-------------------------------------------------------------------------
460
461 /**
462 * Return the current character in the normalized text.
463 * current() may need to normalize some text at getIndex().
464 * The getIndex() is not changed.
465 *
466 * @return the current normalized code point
467 * @deprecated ICU 56 Use Normalizer2 instead.
468 */
469 UChar32 current(void);
470
471 /**
472 * Return the first character in the normalized text.
473 * This is equivalent to setIndexOnly(startIndex()) followed by next().
474 * (Post-increment semantics.)
475 *
476 * @return the first normalized code point
477 * @deprecated ICU 56 Use Normalizer2 instead.
478 */
479 UChar32 first(void);
480
481 /**
482 * Return the last character in the normalized text.
483 * This is equivalent to setIndexOnly(endIndex()) followed by previous().
484 * (Pre-decrement semantics.)
485 *
486 * @return the last normalized code point
487 * @deprecated ICU 56 Use Normalizer2 instead.
488 */
489 UChar32 last(void);
490
491 /**
492 * Return the next character in the normalized text.
493 * (Post-increment semantics.)
494 * If the end of the text has already been reached, DONE is returned.
495 * The DONE value could be confused with a U+FFFF non-character code point
496 * in the text. If this is possible, you can test getIndex()<endIndex()
497 * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
498 * after calling next(). (Calling last() will change the iterator state!)
499 *
500 * The C API unorm_next() is more efficient and does not have this ambiguity.
501 *
502 * @return the next normalized code point
503 * @deprecated ICU 56 Use Normalizer2 instead.
504 */
505 UChar32 next(void);
506
507 /**
508 * Return the previous character in the normalized text and decrement.
509 * (Pre-decrement semantics.)
510 * If the beginning of the text has already been reached, DONE is returned.
511 * The DONE value could be confused with a U+FFFF non-character code point
512 * in the text. If this is possible, you can test
513 * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
514 * the iterator state!)
515 *
516 * The C API unorm_previous() is more efficient and does not have this ambiguity.
517 *
518 * @return the previous normalized code point
519 * @deprecated ICU 56 Use Normalizer2 instead.
520 */
521 UChar32 previous(void);
522
523 /**
524 * Set the iteration position in the input text that is being normalized,
525 * without any immediate normalization.
526 * After setIndexOnly(), getIndex() will return the same index that is
527 * specified here.
528 *
529 * @param index the desired index in the input text.
530 * @deprecated ICU 56 Use Normalizer2 instead.
531 */
532 void setIndexOnly(int32_t index);
533
534 /**
535 * Reset the index to the beginning of the text.
536 * This is equivalent to setIndexOnly(startIndex)).
537 * @deprecated ICU 56 Use Normalizer2 instead.
538 */
539 void reset(void);
540
541 /**
542 * Retrieve the current iteration position in the input text that is
543 * being normalized.
544 *
545 * A following call to next() will return a normalized code point from
546 * the input text at or after this index.
547 *
548 * After a call to previous(), getIndex() will point at or before the
549 * position in the input text where the normalized code point
550 * was returned from with previous().
551 *
552 * @return the current index in the input text
553 * @deprecated ICU 56 Use Normalizer2 instead.
554 */
555 int32_t getIndex(void) const;
556
557 /**
558 * Retrieve the index of the start of the input text. This is the begin index
559 * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
560 * over which this <code>Normalizer</code> is iterating.
561 *
562 * @return the smallest index in the input text where the Normalizer operates
563 * @deprecated ICU 56 Use Normalizer2 instead.
564 */
565 int32_t startIndex(void) const;
566
567 /**
568 * Retrieve the index of the end of the input text. This is the end index
569 * of the <code>CharacterIterator</code> or the length of the string
570 * over which this <code>Normalizer</code> is iterating.
571 * This end index is exclusive, i.e., the Normalizer operates only on characters
572 * before this index.
573 *
574 * @return the first index in the input text where the Normalizer does not operate
575 * @deprecated ICU 56 Use Normalizer2 instead.
576 */
577 int32_t endIndex(void) const;
578
579 /**
580 * Returns true when both iterators refer to the same character in the same
581 * input text.
582 *
583 * @param that a Normalizer object to compare this one to
584 * @return comparison result
585 * @deprecated ICU 56 Use Normalizer2 instead.
586 */
587 bool operator==(const Normalizer& that) const;
588
589 /**
590 * Returns false when both iterators refer to the same character in the same
591 * input text.
592 *
593 * @param that a Normalizer object to compare this one to
594 * @return comparison result
595 * @deprecated ICU 56 Use Normalizer2 instead.
596 */
597 inline bool operator!=(const Normalizer& that) const;
598
599 /**
600 * Returns a pointer to a new Normalizer that is a clone of this one.
601 * The caller is responsible for deleting the new clone.
602 * @return a pointer to a new Normalizer
603 * @deprecated ICU 56 Use Normalizer2 instead.
604 */
605 Normalizer* clone() const;
606
607 /**
608 * Generates a hash code for this iterator.
609 *
610 * @return the hash code
611 * @deprecated ICU 56 Use Normalizer2 instead.
612 */
613 int32_t hashCode(void) const;
614
615 //-------------------------------------------------------------------------
616 // Property access methods
617 //-------------------------------------------------------------------------
618
619 /**
620 * Set the normalization mode for this object.
621 * <p>
622 * <b>Note:</b>If the normalization mode is changed while iterating
623 * over a string, calls to {@link #next() } and {@link #previous() } may
624 * return previously buffers characters in the old normalization mode
625 * until the iteration is able to re-sync at the next base character.
626 * It is safest to call {@link #setIndexOnly }, {@link #reset() },
627 * {@link #setText }, {@link #first() },
628 * {@link #last() }, etc. after calling <code>setMode</code>.
629 * <p>
630 * @param newMode the new mode for this <code>Normalizer</code>.
631 * @see #getUMode
632 * @deprecated ICU 56 Use Normalizer2 instead.
633 */
634 void setMode(UNormalizationMode newMode);
635
636 /**
637 * Return the normalization mode for this object.
638 *
639 * This is an unusual name because there used to be a getMode() that
640 * returned a different type.
641 *
642 * @return the mode for this <code>Normalizer</code>
643 * @see #setMode
644 * @deprecated ICU 56 Use Normalizer2 instead.
645 */
646 UNormalizationMode getUMode(void) const;
647
648 /**
649 * Set options that affect this <code>Normalizer</code>'s operation.
650 * Options do not change the basic composition or decomposition operation
651 * that is being performed, but they control whether
652 * certain optional portions of the operation are done.
653 * Currently the only available option is obsolete.
654 *
655 * It is possible to specify multiple options that are all turned on or off.
656 *
657 * @param option the option(s) whose value is/are to be set.
658 * @param value the new setting for the option. Use <code>true</code> to
659 * turn the option(s) on and <code>false</code> to turn it/them off.
660 *
661 * @see #getOption
662 * @deprecated ICU 56 Use Normalizer2 instead.
663 */
664 void setOption(int32_t option,
665 UBool value);
666
667 /**
668 * Determine whether an option is turned on or off.
669 * If multiple options are specified, then the result is true if any
670 * of them are set.
671 * <p>
672 * @param option the option(s) that are to be checked
673 * @return true if any of the option(s) are set
674 * @see #setOption
675 * @deprecated ICU 56 Use Normalizer2 instead.
676 */
677 UBool getOption(int32_t option) const;
678
679 /**
680 * Set the input text over which this <code>Normalizer</code> will iterate.
681 * The iteration position is set to the beginning.
682 *
683 * @param newText a string that replaces the current input text
684 * @param status a UErrorCode
685 * @deprecated ICU 56 Use Normalizer2 instead.
686 */
687 void setText(const UnicodeString& newText,
688 UErrorCode &status);
689
690 /**
691 * Set the input text over which this <code>Normalizer</code> will iterate.
692 * The iteration position is set to the beginning.
693 *
694 * @param newText a CharacterIterator object that replaces the current input text
695 * @param status a UErrorCode
696 * @deprecated ICU 56 Use Normalizer2 instead.
697 */
698 void setText(const CharacterIterator& newText,
699 UErrorCode &status);
700
701 /**
702 * Set the input text over which this <code>Normalizer</code> will iterate.
703 * The iteration position is set to the beginning.
704 *
705 * @param newText a string that replaces the current input text
706 * @param length the length of the string, or -1 if NUL-terminated
707 * @param status a UErrorCode
708 * @deprecated ICU 56 Use Normalizer2 instead.
709 */
710 void setText(ConstChar16Ptr newText,
711 int32_t length,
712 UErrorCode &status);
713 /**
714 * Copies the input text into the UnicodeString argument.
715 *
716 * @param result Receives a copy of the text under iteration.
717 * @deprecated ICU 56 Use Normalizer2 instead.
718 */
719 void getText(UnicodeString& result);
720
721 /**
722 * ICU "poor man's RTTI", returns a UClassID for this class.
723 * @returns a UClassID for this class.
724 * @deprecated ICU 56 Use Normalizer2 instead.
725 */
726 static UClassID U_EXPORT2 getStaticClassID();
727#endif /* U_HIDE_DEPRECATED_API */
728
729#ifndef U_FORCE_HIDE_DEPRECATED_API
730 /**
731 * ICU "poor man's RTTI", returns a UClassID for the actual class.
732 * @return a UClassID for the actual class.
733 * @deprecated ICU 56 Use Normalizer2 instead.
734 */
735 virtual UClassID getDynamicClassID() const override;
736#endif // U_FORCE_HIDE_DEPRECATED_API
737
738private:
739 //-------------------------------------------------------------------------
740 // Private functions
741 //-------------------------------------------------------------------------
742
743 Normalizer(); // default constructor not implemented
744 Normalizer &operator=(const Normalizer &that); // assignment operator not implemented
745
746 // Private utility methods for iteration
747 // For documentation, see the source code
748 UBool nextNormalize();
749 UBool previousNormalize();
750
751 void init();
752 void clearBuffer(void);
753
754 //-------------------------------------------------------------------------
755 // Private data
756 //-------------------------------------------------------------------------
757
758 FilteredNormalizer2*fFilteredNorm2; // owned if not NULL
759 const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2
760 UNormalizationMode fUMode; // deprecated
761 int32_t fOptions;
762
763 // The input text and our position in it
764 CharacterIterator *text;
765
766 // The normalization buffer is the result of normalization
767 // of the source in [currentIndex..nextIndex[ .
768 int32_t currentIndex, nextIndex;
769
770 // A buffer for holding intermediate results
771 UnicodeString buffer;
772 int32_t bufferPos;
773};
774
775//-------------------------------------------------------------------------
776// Inline implementations
777//-------------------------------------------------------------------------
778
779#ifndef U_HIDE_DEPRECATED_API
780inline bool
781Normalizer::operator!= (const Normalizer& other) const
782{ return ! operator==(that: other); }
783
784inline UNormalizationCheckResult
785Normalizer::quickCheck(const UnicodeString& source,
786 UNormalizationMode mode,
787 UErrorCode &status) {
788 return quickCheck(source, mode, options: 0, status);
789}
790
791inline UBool
792Normalizer::isNormalized(const UnicodeString& source,
793 UNormalizationMode mode,
794 UErrorCode &status) {
795 return isNormalized(src: source, mode, options: 0, errorCode&: status);
796}
797#endif /* U_HIDE_DEPRECATED_API */
798
799inline int32_t
800Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
801 uint32_t options,
802 UErrorCode &errorCode) {
803 // all argument checking is done in unorm_compare
804 return unorm_compare(s1: toUCharPtr(p: s1.getBuffer()), length1: s1.length(),
805 s2: toUCharPtr(p: s2.getBuffer()), length2: s2.length(),
806 options,
807 pErrorCode: &errorCode);
808}
809
810U_NAMESPACE_END
811
812#endif /* #if !UCONFIG_NO_NORMALIZATION */
813
#endif /* U_SHOW_CPLUSPLUS_API */

#endif // NORMLZR_H
817

// Source: include/unicode/normlzr.h