unorm2.h [include/unicode/unorm2.h]

1	/*
2	*******************************************************************************
3	*
4	* Copyright (C) 2009-2013, International Business Machines
5	* Corporation and others. All Rights Reserved.
6	*
7	*******************************************************************************
8	* file name: unorm2.h
9	* encoding: US-ASCII
10	* tab size: 8 (not used)
11	* indentation:4
12	*
13	* created on: 2009dec15
14	* created by: Markus W. Scherer
15	*/
16
17	#ifndef __UNORM2_H__
18	#define __UNORM2_H__
19
20	/**
21	* \file
22	* \brief C API: New API for Unicode Normalization.
23	*
24	* Unicode normalization functionality for standard Unicode normalization or
25	* for using custom mapping tables.
26	* All instances of UNormalizer2 are unmodifiable/immutable.
27	* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
28	* For more details see the Normalizer2 C++ class.
29	*/
30
31	#include "unicode/utypes.h"
32	#include "unicode/localpointer.h"
33	#include "unicode/uset.h"
34
/**
 * Constants for normalization modes.
 * For details about standard Unicode normalization forms
 * and about the algorithms which are also used with custom mapping tables
 * see http://www.unicode.org/unicode/reports/tr15/
 * @stable ICU 4.4
 */
typedef enum {
    /**
     * Decomposition followed by composition.
     * Same as standard NFC when using an "nfc" instance.
     * Same as standard NFKC when using an "nfkc" instance.
     * For details about standard Unicode normalization forms
     * see http://www.unicode.org/unicode/reports/tr15/
     * @stable ICU 4.4
     */
    UNORM2_COMPOSE,
    /**
     * Map, and reorder canonically.
     * Same as standard NFD when using an "nfc" instance.
     * Same as standard NFKD when using an "nfkc" instance.
     * For details about standard Unicode normalization forms
     * see http://www.unicode.org/unicode/reports/tr15/
     * @stable ICU 4.4
     */
    UNORM2_DECOMPOSE,
    /**
     * "Fast C or D" form.
     * If a string is in this form, then further decomposition <i>without reordering</i>
     * would yield the same form as DECOMPOSE.
     * Text in "Fast C or D" form can be processed efficiently with data tables
     * that are "canonically closed", that is, that provide equivalent data for
     * equivalent text, without having to be fully normalized.
     * Not a standard Unicode normalization form.
     * Not a unique form: Different FCD strings can be canonically equivalent.
     * For details see http://www.unicode.org/notes/tn5/#FCD
     * @stable ICU 4.4
     */
    UNORM2_FCD,
    /**
     * Compose only contiguously.
     * Also known as "FCC" or "Fast C Contiguous".
     * The result will often but not always be in NFC.
     * The result will conform to FCD which is useful for processing.
     * Not a standard Unicode normalization form.
     * For details see http://www.unicode.org/notes/tn5/#FCC
     * @stable ICU 4.4
     */
    UNORM2_COMPOSE_CONTIGUOUS
} UNormalization2Mode;
85
/**
 * Result values for normalization quick check functions.
 * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
 * @stable ICU 2.0
 */
typedef enum UNormalizationCheckResult {
    /**
     * The input string is not in the normalization form.
     * @stable ICU 2.0
     */
    UNORM_NO,
    /**
     * The input string is in the normalization form.
     * @stable ICU 2.0
     */
    UNORM_YES,
    /**
     * The input string may or may not be in the normalization form.
     * This value is only returned for composition forms like NFC and FCC,
     * when a backward-combining character is found for which the surrounding text
     * would have to be analyzed further.
     * @stable ICU 2.0
     */
    UNORM_MAYBE
} UNormalizationCheckResult;
111
/**
 * Opaque C service object type for the new normalization API.
 * Only pointers to this type are used in this header; the struct is
 * declared here but not defined.
 * @stable ICU 4.4
 */
struct UNormalizer2;
typedef struct UNormalizer2 UNormalizer2;  /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */
118
119	#if !UCONFIG_NO_NORMALIZATION
120
121	/**
122	* Returns a UNormalizer2 instance for Unicode NFC normalization.
123	* Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode).
124	* Returns an unmodifiable singleton instance. Do not delete it.
125	* @param pErrorCode Standard ICU error code. Its input value must
126	* pass the U_SUCCESS() test, or else the function returns
127	* immediately. Check for U_FAILURE() on output or use with
128	* function chaining. (See User Guide for details.)
129	* @return the requested Normalizer2, if successful
130	* @stable ICU 49
131	*/
132	U_STABLE const UNormalizer2 * U_EXPORT2
133	unorm2_getNFCInstance(UErrorCode *pErrorCode);
134
135	/**
136	* Returns a UNormalizer2 instance for Unicode NFD normalization.
137	* Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode).
138	* Returns an unmodifiable singleton instance. Do not delete it.
139	* @param pErrorCode Standard ICU error code. Its input value must
140	* pass the U_SUCCESS() test, or else the function returns
141	* immediately. Check for U_FAILURE() on output or use with
142	* function chaining. (See User Guide for details.)
143	* @return the requested Normalizer2, if successful
144	* @stable ICU 49
145	*/
146	U_STABLE const UNormalizer2 * U_EXPORT2
147	unorm2_getNFDInstance(UErrorCode *pErrorCode);
148
149	/**
150	* Returns a UNormalizer2 instance for Unicode NFKC normalization.
151	* Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode).
152	* Returns an unmodifiable singleton instance. Do not delete it.
153	* @param pErrorCode Standard ICU error code. Its input value must
154	* pass the U_SUCCESS() test, or else the function returns
155	* immediately. Check for U_FAILURE() on output or use with
156	* function chaining. (See User Guide for details.)
157	* @return the requested Normalizer2, if successful
158	* @stable ICU 49
159	*/
160	U_STABLE const UNormalizer2 * U_EXPORT2
161	unorm2_getNFKCInstance(UErrorCode *pErrorCode);
162
163	/**
164	* Returns a UNormalizer2 instance for Unicode NFKD normalization.
165	* Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode).
166	* Returns an unmodifiable singleton instance. Do not delete it.
167	* @param pErrorCode Standard ICU error code. Its input value must
168	* pass the U_SUCCESS() test, or else the function returns
169	* immediately. Check for U_FAILURE() on output or use with
170	* function chaining. (See User Guide for details.)
171	* @return the requested Normalizer2, if successful
172	* @stable ICU 49
173	*/
174	U_STABLE const UNormalizer2 * U_EXPORT2
175	unorm2_getNFKDInstance(UErrorCode *pErrorCode);
176
177	/**
178	* Returns a UNormalizer2 instance for Unicode NFKC_Casefold normalization.
179	* Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
180	* Returns an unmodifiable singleton instance. Do not delete it.
181	* @param pErrorCode Standard ICU error code. Its input value must
182	* pass the U_SUCCESS() test, or else the function returns
183	* immediately. Check for U_FAILURE() on output or use with
184	* function chaining. (See User Guide for details.)
185	* @return the requested Normalizer2, if successful
186	* @stable ICU 49
187	*/
188	U_STABLE const UNormalizer2 * U_EXPORT2
189	unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
190
191	/**
192	* Returns a UNormalizer2 instance which uses the specified data file
193	* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
194	* and which composes or decomposes text according to the specified mode.
195	* Returns an unmodifiable singleton instance. Do not delete it.
196	*
197	* Use packageName=NULL for data files that are part of ICU's own data.
198	* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
199	* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
200	* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
201	*
202	* @param packageName NULL for ICU built-in data, otherwise application data package name
203	* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
204	* @param mode normalization mode (compose or decompose etc.)
205	* @param pErrorCode Standard ICU error code. Its input value must
206	* pass the U_SUCCESS() test, or else the function returns
207	* immediately. Check for U_FAILURE() on output or use with
208	* function chaining. (See User Guide for details.)
209	* @return the requested UNormalizer2, if successful
210	* @stable ICU 4.4
211	*/
212	U_STABLE const UNormalizer2 * U_EXPORT2
213	unorm2_getInstance(const char *packageName,
214	const char *name,
215	UNormalization2Mode mode,
216	UErrorCode *pErrorCode);
217
218	/**
219	* Constructs a filtered normalizer wrapping any UNormalizer2 instance
220	* and a filter set.
221	* Both are aliased and must not be modified or deleted while this object
222	* is used.
223	* The filter set should be frozen; otherwise the performance will suffer greatly.
224	* @param norm2 wrapped UNormalizer2 instance
225	* @param filterSet USet which determines the characters to be normalized
226	* @param pErrorCode Standard ICU error code. Its input value must
227	* pass the U_SUCCESS() test, or else the function returns
228	* immediately. Check for U_FAILURE() on output or use with
229	* function chaining. (See User Guide for details.)
230	* @return the requested UNormalizer2, if successful
231	* @stable ICU 4.4
232	*/
233	U_STABLE UNormalizer2 * U_EXPORT2
234	unorm2_openFiltered(const UNormalizer2 norm2, const* USet filterSet, UErrorCode pErrorCode);
235
236	/**
237	* Closes a UNormalizer2 instance from unorm2_openFiltered().
238	* Do not close instances from unorm2_getInstance()!
239	* @param norm2 UNormalizer2 instance to be closed
240	* @stable ICU 4.4
241	*/
242	U_STABLE void U_EXPORT2
243	unorm2_close(UNormalizer2 *norm2);
244
#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUNormalizer2Pointer
 * "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);

U_NAMESPACE_END

#endif  /* U_SHOW_CPLUSPLUS_API */
263
264	/**
265	* Writes the normalized form of the source string to the destination string
266	* (replacing its contents) and returns the length of the destination string.
267	* The source and destination strings must be different buffers.
268	* @param norm2 UNormalizer2 instance
269	* @param src source string
270	* @param length length of the source string, or -1 if NUL-terminated
271	* @param dest destination string; its contents is replaced with normalized src
272	* @param capacity number of UChars that can be written to dest
273	* @param pErrorCode Standard ICU error code. Its input value must
274	* pass the U_SUCCESS() test, or else the function returns
275	* immediately. Check for U_FAILURE() on output or use with
276	* function chaining. (See User Guide for details.)
277	* @return dest
278	* @stable ICU 4.4
279	*/
280	U_STABLE int32_t U_EXPORT2
281	unorm2_normalize(const UNormalizer2 *norm2,
282	const UChar *src, int32_t length,
283	UChar *dest, int32_t capacity,
284	UErrorCode *pErrorCode);
285	/**
286	* Appends the normalized form of the second string to the first string
287	* (merging them at the boundary) and returns the length of the first string.
288	* The result is normalized if the first string was normalized.
289	* The first and second strings must be different buffers.
290	* @param norm2 UNormalizer2 instance
291	* @param first string, should be normalized
292	* @param firstLength length of the first string, or -1 if NUL-terminated
293	* @param firstCapacity number of UChars that can be written to first
294	* @param second string, will be normalized
295	* @param secondLength length of the source string, or -1 if NUL-terminated
296	* @param pErrorCode Standard ICU error code. Its input value must
297	* pass the U_SUCCESS() test, or else the function returns
298	* immediately. Check for U_FAILURE() on output or use with
299	* function chaining. (See User Guide for details.)
300	* @return first
301	* @stable ICU 4.4
302	*/
303	U_STABLE int32_t U_EXPORT2
304	unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
305	UChar *first, int32_t firstLength, int32_t firstCapacity,
306	const UChar *second, int32_t secondLength,
307	UErrorCode *pErrorCode);
308	/**
309	* Appends the second string to the first string
310	* (merging them at the boundary) and returns the length of the first string.
311	* The result is normalized if both the strings were normalized.
312	* The first and second strings must be different buffers.
313	* @param norm2 UNormalizer2 instance
314	* @param first string, should be normalized
315	* @param firstLength length of the first string, or -1 if NUL-terminated
316	* @param firstCapacity number of UChars that can be written to first
317	* @param second string, should be normalized
318	* @param secondLength length of the source string, or -1 if NUL-terminated
319	* @param pErrorCode Standard ICU error code. Its input value must
320	* pass the U_SUCCESS() test, or else the function returns
321	* immediately. Check for U_FAILURE() on output or use with
322	* function chaining. (See User Guide for details.)
323	* @return first
324	* @stable ICU 4.4
325	*/
326	U_STABLE int32_t U_EXPORT2
327	unorm2_append(const UNormalizer2 *norm2,
328	UChar *first, int32_t firstLength, int32_t firstCapacity,
329	const UChar *second, int32_t secondLength,
330	UErrorCode *pErrorCode);
331
332	/**
333	* Gets the decomposition mapping of c.
334	* Roughly equivalent to normalizing the String form of c
335	* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function
336	* returns a negative value and does not write a string
337	* if c does not have a decomposition mapping in this instance's data.
338	* This function is independent of the mode of the UNormalizer2.
339	* @param norm2 UNormalizer2 instance
340	* @param c code point
341	* @param decomposition String buffer which will be set to c's
342	* decomposition mapping, if there is one.
343	* @param capacity number of UChars that can be written to decomposition
344	* @param pErrorCode Standard ICU error code. Its input value must
345	* pass the U_SUCCESS() test, or else the function returns
346	* immediately. Check for U_FAILURE() on output or use with
347	* function chaining. (See User Guide for details.)
348	* @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
349	* @stable ICU 4.6
350	*/
351	U_STABLE int32_t U_EXPORT2
352	unorm2_getDecomposition(const UNormalizer2 *norm2,
353	UChar32 c, UChar *decomposition, int32_t capacity,
354	UErrorCode *pErrorCode);
355
356	/**
357	* Gets the raw decomposition mapping of c.
358	*
359	* This is similar to the unorm2_getDecomposition() function but returns the
360	* raw decomposition mapping as specified in UnicodeData.txt or
361	* (for custom data) in the mapping files processed by the gennorm2 tool.
362	* By contrast, unorm2_getDecomposition() returns the processed,
363	* recursively-decomposed version of this mapping.
364	*
365	* When used on a standard NFKC Normalizer2 instance,
366	* unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
367	*
368	* When used on a standard NFC Normalizer2 instance,
369	* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
370	* in this case, the result contains either one or two code points (=1..4 UChars).
371	*
372	* This function is independent of the mode of the UNormalizer2.
373	* @param norm2 UNormalizer2 instance
374	* @param c code point
375	* @param decomposition String buffer which will be set to c's
376	* raw decomposition mapping, if there is one.
377	* @param capacity number of UChars that can be written to decomposition
378	* @param pErrorCode Standard ICU error code. Its input value must
379	* pass the U_SUCCESS() test, or else the function returns
380	* immediately. Check for U_FAILURE() on output or use with
381	* function chaining. (See User Guide for details.)
382	* @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value
383	* @stable ICU 49
384	*/
385	U_STABLE int32_t U_EXPORT2
386	unorm2_getRawDecomposition(const UNormalizer2 *norm2,
387	UChar32 c, UChar *decomposition, int32_t capacity,
388	UErrorCode *pErrorCode);
389
390	/**
391	* Performs pairwise composition of a & b and returns the composite if there is one.
392	*
393	* Returns a composite code point c only if c has a two-way mapping to a+b.
394	* In standard Unicode normalization, this means that
395	* c has a canonical decomposition to a+b
396	* and c does not have the Full_Composition_Exclusion property.
397	*
398	* This function is independent of the mode of the UNormalizer2.
399	* @param norm2 UNormalizer2 instance
400	* @param a A (normalization starter) code point.
401	* @param b Another code point.
402	* @return The non-negative composite code point if there is one; otherwise a negative value.
403	* @stable ICU 49
404	*/
405	U_STABLE UChar32 U_EXPORT2
406	unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b);
407
408	/**
409	* Gets the combining class of c.
410	* The default implementation returns 0
411	* but all standard implementations return the Unicode Canonical_Combining_Class value.
412	* @param norm2 UNormalizer2 instance
413	* @param c code point
414	* @return c's combining class
415	* @stable ICU 49
416	*/
417	U_STABLE uint8_t U_EXPORT2
418	unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c);
419
420	/**
421	* Tests if the string is normalized.
422	* Internally, in cases where the quickCheck() method would return "maybe"
423	* (which is only possible for the two COMPOSE modes) this method
424	* resolves to "yes" or "no" to provide a definitive result,
425	* at the cost of doing more work in those cases.
426	* @param norm2 UNormalizer2 instance
427	* @param s input string
428	* @param length length of the string, or -1 if NUL-terminated
429	* @param pErrorCode Standard ICU error code. Its input value must
430	* pass the U_SUCCESS() test, or else the function returns
431	* immediately. Check for U_FAILURE() on output or use with
432	* function chaining. (See User Guide for details.)
433	* @return TRUE if s is normalized
434	* @stable ICU 4.4
435	*/
436	U_STABLE UBool U_EXPORT2
437	unorm2_isNormalized(const UNormalizer2 *norm2,
438	const UChar *s, int32_t length,
439	UErrorCode *pErrorCode);
440
441	/**
442	* Tests if the string is normalized.
443	* For the two COMPOSE modes, the result could be "maybe" in cases that
444	* would take a little more work to resolve definitively.
445	* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
446	* combination of quick check + normalization, to avoid
447	* re-checking the "yes" prefix.
448	* @param norm2 UNormalizer2 instance
449	* @param s input string
450	* @param length length of the string, or -1 if NUL-terminated
451	* @param pErrorCode Standard ICU error code. Its input value must
452	* pass the U_SUCCESS() test, or else the function returns
453	* immediately. Check for U_FAILURE() on output or use with
454	* function chaining. (See User Guide for details.)
455	* @return UNormalizationCheckResult
456	* @stable ICU 4.4
457	*/
458	U_STABLE UNormalizationCheckResult U_EXPORT2
459	unorm2_quickCheck(const UNormalizer2 *norm2,
460	const UChar *s, int32_t length,
461	UErrorCode *pErrorCode);
462
463	/**
464	* Returns the end of the normalized substring of the input string.
465	* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
466	* the substring <code>UnicodeString(s, 0, end)</code>
467	* will pass the quick check with a "yes" result.
468	*
469	* The returned end index is usually one or more characters before the
470	* "no" or "maybe" character: The end index is at a normalization boundary.
471	* (See the class documentation for more about normalization boundaries.)
472	*
473	* When the goal is a normalized string and most input strings are expected
474	* to be normalized already, then call this method,
475	* and if it returns a prefix shorter than the input string,
476	* copy that prefix and use normalizeSecondAndAppend() for the remainder.
477	* @param norm2 UNormalizer2 instance
478	* @param s input string
479	* @param length length of the string, or -1 if NUL-terminated
480	* @param pErrorCode Standard ICU error code. Its input value must
481	* pass the U_SUCCESS() test, or else the function returns
482	* immediately. Check for U_FAILURE() on output or use with
483	* function chaining. (See User Guide for details.)
484	* @return "yes" span end index
485	* @stable ICU 4.4
486	*/
487	U_STABLE int32_t U_EXPORT2
488	unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
489	const UChar *s, int32_t length,
490	UErrorCode *pErrorCode);
491
492	/**
493	* Tests if the character always has a normalization boundary before it,
494	* regardless of context.
495	* For details see the Normalizer2 base class documentation.
496	* @param norm2 UNormalizer2 instance
497	* @param c character to test
498	* @return TRUE if c has a normalization boundary before it
499	* @stable ICU 4.4
500	*/
501	U_STABLE UBool U_EXPORT2
502	unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
503
504	/**
505	* Tests if the character always has a normalization boundary after it,
506	* regardless of context.
507	* For details see the Normalizer2 base class documentation.
508	* @param norm2 UNormalizer2 instance
509	* @param c character to test
510	* @return TRUE if c has a normalization boundary after it
511	* @stable ICU 4.4
512	*/
513	U_STABLE UBool U_EXPORT2
514	unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
515
516	/**
517	* Tests if the character is normalization-inert.
518	* For details see the Normalizer2 base class documentation.
519	* @param norm2 UNormalizer2 instance
520	* @param c character to test
521	* @return TRUE if c is normalization-inert
522	* @stable ICU 4.4
523	*/
524	U_STABLE UBool U_EXPORT2
525	unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
526
527	#endif /* !UCONFIG_NO_NORMALIZATION */
528	#endif /* __UNORM2_H__ */
529