1 | // © 2016 and later: Unicode, Inc. and others. |
---|---|

2 | // License & terms of use: http://www.unicode.org/copyright.html |

3 | /* |

4 | ******************************************************************************* |

5 | * |

6 | * Copyright (C) 2002-2014, International Business Machines |

7 | * Corporation and others. All Rights Reserved. |

8 | * |

9 | ******************************************************************************* |

10 | * file name: uset.h |

11 | * encoding: UTF-8 |

12 | * tab size: 8 (not used) |

13 | * indentation:4 |

14 | * |

15 | * created on: 2002mar07 |

16 | * created by: Markus W. Scherer |

17 | * |

18 | * C version of UnicodeSet. |

19 | */ |

20 | |

21 | |

22 | /** |

23 | * \file |

24 | * \brief C API: Unicode Set |

25 | * |

26 | * <p>This is a C wrapper around the C++ UnicodeSet class.</p> |

27 | */ |

28 | |

29 | #ifndef __USET_H__ |

30 | #define __USET_H__ |

31 | |

32 | #include "unicode/utypes.h" |

33 | #include "unicode/uchar.h" |

34 | #include "unicode/localpointer.h" |

35 | |

36 | #ifndef USET_DEFINED |

37 | |

38 | #ifndef U_IN_DOXYGEN |

39 | #define USET_DEFINED |

40 | #endif |

41 | /** |

42 | * USet is the C API type corresponding to C++ class UnicodeSet. |

43 | * Use the uset_* API to manipulate. Create with |

44 | * uset_open*, and destroy with uset_close. |

45 | * @stable ICU 2.4 |

46 | */ |

47 | typedef struct USet USet; |

48 | #endif |

49 | |

50 | /** |

51 | * Bitmask values to be passed to uset_openPatternOptions() or |

52 | * uset_applyPattern() taking an option parameter. |

53 | * @stable ICU 2.4 |

54 | */ |

55 | enum { |

56 | /** |

57 | * Ignore white space within patterns unless quoted or escaped. |

58 | * @stable ICU 2.4 |

59 | */ |

60 | USET_IGNORE_SPACE = 1, |

61 | |

62 | /** |

63 | * Enable case insensitive matching. E.g., "[ab]" with this flag |

64 | * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will |

65 | * match all except 'a', 'A', 'b', and 'B'. This performs a full |

66 | * closure over case mappings, e.g. U+017F for s. |

67 | * |

68 | * The resulting set is a superset of the input for the code points but |

69 | * not for the strings. |

70 | * It performs a case mapping closure of the code points and adds |

71 | * full case folding strings for the code points, and reduces strings of |

72 | * the original set to their full case folding equivalents. |

73 | * |

74 | * This is designed for case-insensitive matches, for example |

75 | * in regular expressions. The full code point case closure allows checking of |

76 | * an input character directly against the closure set. |

77 | * Strings are matched by comparing the case-folded form from the closure |

78 | * set with an incremental case folding of the string in question. |

79 | * |

80 | * The closure set will also contain single code points if the original |

81 | * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). |

82 | * This is not necessary (that is, redundant) for the above matching method |

83 | * but results in the same closure sets regardless of whether the original |

84 | * set contained the code point or a string. |

85 | * |

86 | * @stable ICU 2.4 |

87 | */ |

88 | USET_CASE_INSENSITIVE = 2, |

89 | |

90 | /** |

91 | * Enable case insensitive matching. E.g., "[ab]" with this flag |

92 | * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will |

93 | * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, |

94 | * title-, and uppercase mappings as well as the case folding |

95 | * of each existing element in the set. |

96 | * @stable ICU 3.2 |

97 | */ |

98 | USET_ADD_CASE_MAPPINGS = 4 |

99 | }; |

100 | |

101 | /** |

102 | * Argument values for whether span() and similar functions continue while |

103 | * the current character is contained vs. not contained in the set. |

104 | * |

105 | * The functionality is straightforward for sets with only single code points, |

106 | * without strings (which is the common case): |

107 | * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. |

108 | * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. |

109 | * - span() and spanBack() partition any string the same way when |

110 | * alternating between span(USET_SPAN_NOT_CONTAINED) and |

111 | * span(either "contained" condition). |

112 | * - Using a complemented (inverted) set and the opposite span conditions |

113 | * yields the same results. |

114 | * |

115 | * When a set contains multi-code point strings, then these statements may not |

116 | * be true, depending on the strings in the set (for example, whether they |

117 | * overlap with each other) and the string that is processed. |

118 | * For a set with strings: |

119 | * - The complement of the set contains the opposite set of code points, |

120 | * but the same set of strings. |

121 | * Therefore, complementing both the set and the span conditions |

122 | * may yield different results. |

123 | * - When starting spans at different positions in a string |

124 | * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different |

125 | * because a set string may start before the later position. |

126 | * - span(USET_SPAN_SIMPLE) may be shorter than |

127 | * span(USET_SPAN_CONTAINED) because it will not recursively try |

128 | * all possible paths. |

129 | * For example, with a set which contains the three strings "xy", "xya" and "ax", |

130 | * span("xyax", USET_SPAN_CONTAINED) will return 4 but |

131 | * span("xyax", USET_SPAN_SIMPLE) will return 3. |

132 | * span(USET_SPAN_SIMPLE) will never be longer than |

133 | * span(USET_SPAN_CONTAINED). |

134 | * - With either "contained" condition, span() and spanBack() may partition |

135 | * a string in different ways. |

136 | * For example, with a set which contains the two strings "ab" and "ba", |

137 | * and when processing the string "aba", |

138 | * span() will yield contained/not-contained boundaries of { 0, 2, 3 } |

139 | * while spanBack() will yield boundaries of { 0, 1, 3 }. |

140 | * |

141 | * Note: If it is important to get the same boundaries whether iterating forward |

142 | * or backward through a string, then either only span() should be used and |

143 | * the boundaries cached for backward operation, or an ICU BreakIterator |

144 | * could be used. |

145 | * |

146 | * Note: Unpaired surrogates are treated like surrogate code points. |

147 | * Similarly, set strings match only on code point boundaries, |

148 | * never in the middle of a surrogate pair. |

149 | * Illegal UTF-8 sequences are treated like U+FFFD. |

150 | * When processing UTF-8 strings, malformed set strings |

151 | * (strings with unpaired surrogates which cannot be converted to UTF-8) |

152 | * are ignored. |

153 | * |

154 | * @stable ICU 3.8 |

155 | */ |

156 | typedef enum USetSpanCondition { |

157 | /** |

158 | * Continues a span() while there is no set element at the current position. |

159 | * Increments by one code point at a time. |

160 | * Stops before the first set element (character or string). |

161 | * (For code points only, this is like while contains(current)==FALSE). |

162 | * |

163 | * When span() returns, the substring between where it started and the position |

164 | * it returned consists only of characters that are not in the set, |

165 | * and none of its strings overlap with the span. |

166 | * |

167 | * @stable ICU 3.8 |

168 | */ |

169 | USET_SPAN_NOT_CONTAINED = 0, |

170 | /** |

171 | * Spans the longest substring that is a concatenation of set elements (characters or strings). |

172 | * (For characters only, this is like while contains(current)==TRUE). |

173 | * |

174 | * When span() returns, the substring between where it started and the position |

175 | * it returned consists only of set elements (characters or strings) that are in the set. |

176 | * |

177 | * If a set contains strings, then the span will be the longest substring for which there |

178 | * exists at least one non-overlapping concatenation of set elements (characters or strings). |

179 | * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. |

180 | * (Java/ICU/Perl regex stops at the first match of an OR.) |

181 | * |

182 | * @stable ICU 3.8 |

183 | */ |

184 | USET_SPAN_CONTAINED = 1, |

185 | /** |

186 | * Continues a span() while there is a set element at the current position. |

187 | * Increments by the longest matching element at each position. |

188 | * (For characters only, this is like while contains(current)==TRUE). |

189 | * |

190 | * When span() returns, the substring between where it started and the position |

191 | * it returned consists only of set elements (characters or strings) that are in the set. |

192 | * |

193 | * If a set only contains single characters, then this is the same |

194 | * as USET_SPAN_CONTAINED. |

195 | * |

196 | * If a set contains strings, then the span will be the longest substring |

197 | * with a match at each position with the longest single set element (character or string). |

198 | * |

199 | * Use this span condition together with other longest-match algorithms, |

200 | * such as ICU converters (ucnv_getUnicodeSet()). |

201 | * |

202 | * @stable ICU 3.8 |

203 | */ |

204 | USET_SPAN_SIMPLE = 2, |

205 | #ifndef U_HIDE_DEPRECATED_API |

206 | /** |

207 | * One more than the last span condition. |

208 | * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. |

209 | */ |

210 | USET_SPAN_CONDITION_COUNT |

211 | #endif // U_HIDE_DEPRECATED_API |

212 | } USetSpanCondition; |

213 | |

214 | enum { |

215 | /** |

216 | * Capacity of USerializedSet::staticArray. |

217 | * Enough for any single-code point set. |

218 | * Also provides padding for nice sizeof(USerializedSet). |

219 | * @stable ICU 2.4 |

220 | */ |

221 | USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 |

222 | }; |

223 | |

224 | /** |

225 | * A serialized form of a Unicode set. Limited manipulations are |

226 | * possible directly on a serialized set. See below. |

227 | * @stable ICU 2.4 |

228 | */ |

229 | typedef struct USerializedSet { |

230 | /** |

231 | * The serialized Unicode Set. |

232 | * @stable ICU 2.4 |

233 | */ |

234 | const uint16_t *array; |

235 | /** |

236 | * The length of the array that contains BMP characters. |

237 | * @stable ICU 2.4 |

238 | */ |

239 | int32_t bmpLength; |

240 | /** |

241 | * The total length of the array. |

242 | * @stable ICU 2.4 |

243 | */ |

244 | int32_t length; |

245 | /** |

246 | * A small buffer for the array to reduce memory allocations. |

247 | * @stable ICU 2.4 |

248 | */ |

249 | uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; |

250 | } USerializedSet; |

251 | |

252 | /********************************************************************* |

253 | * USet API |

254 | *********************************************************************/ |

255 | |

256 | /** |

257 | * Create an empty USet object. |

258 | * Equivalent to uset_open(1, 0). |

259 | * @return a newly created USet. The caller must call uset_close() on |

260 | * it when done. |

261 | * @stable ICU 4.2 |

262 | */ |

263 | U_STABLE USet* U_EXPORT2 |

264 | uset_openEmpty(void); |

265 | |

266 | /** |

267 | * Creates a USet object that contains the range of characters |

268 | * start..end, inclusive. If <code>start > end</code> |

269 | * then an empty set is created (same as using uset_openEmpty()). |

270 | * @param start first character of the range, inclusive |

271 | * @param end last character of the range, inclusive |

272 | * @return a newly created USet. The caller must call uset_close() on |

273 | * it when done. |

274 | * @stable ICU 2.4 |

275 | */ |

276 | U_STABLE USet* U_EXPORT2 |

277 | uset_open(UChar32 start, UChar32 end); |

278 | |

279 | /** |

280 | * Creates a set from the given pattern. See the UnicodeSet class |

281 | * description for the syntax of the pattern language. |

282 | * @param pattern a string specifying what characters are in the set |

283 | * @param patternLength the length of the pattern, or -1 if null |

284 | * terminated |

285 | * @param ec the error code |

286 | * @stable ICU 2.4 |

287 | */ |

288 | U_STABLE USet* U_EXPORT2 |

289 | uset_openPattern(const UChar* pattern, int32_t patternLength, |

290 | UErrorCode* ec); |

291 | |

292 | /** |

293 | * Creates a set from the given pattern. See the UnicodeSet class |

294 | * description for the syntax of the pattern language. |

295 | * @param pattern a string specifying what characters are in the set |

296 | * @param patternLength the length of the pattern, or -1 if null |

297 | * terminated |

298 | * @param options bitmask for options to apply to the pattern. |

299 | * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. |

300 | * @param ec the error code |

301 | * @stable ICU 2.4 |

302 | */ |

303 | U_STABLE USet* U_EXPORT2 |

304 | uset_openPatternOptions(const UChar* pattern, int32_t patternLength, |

305 | uint32_t options, |

306 | UErrorCode* ec); |

307 | |

308 | /** |

309 | * Disposes of the storage used by a USet object. This function should |

310 | * be called exactly once for each object returned by a uset_open*() function. |

311 | * @param set the object to dispose of |

312 | * @stable ICU 2.4 |

313 | */ |

314 | U_STABLE void U_EXPORT2 |

315 | uset_close(USet* set); |

316 | |

317 | #if U_SHOW_CPLUSPLUS_API |

318 | |

319 | U_NAMESPACE_BEGIN |

320 | |

321 | /** |

322 | * \class LocalUSetPointer |

323 | * "Smart pointer" class, closes a USet via uset_close(). |

324 | * For most methods see the LocalPointerBase base class. |

325 | * |

326 | * @see LocalPointerBase |

327 | * @see LocalPointer |

328 | * @stable ICU 4.4 |

329 | */ |

330 | U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); |

331 | |

332 | U_NAMESPACE_END |

333 | |

334 | #endif |

335 | |

336 | /** |

337 | * Returns a copy of this object. |

338 | * If this set is frozen, then the clone will be frozen as well. |

339 | * Use uset_cloneAsThawed() for a mutable clone of a frozen set. |

340 | * @param set the original set |

341 | * @return the newly allocated copy of the set |

342 | * @see uset_cloneAsThawed |

343 | * @stable ICU 3.8 |

344 | */ |

345 | U_STABLE USet * U_EXPORT2 |

346 | uset_clone(const USet *set); |

347 | |

348 | /** |

349 | * Determines whether the set has been frozen (made immutable) or not. |

350 | * See the ICU4J Freezable interface for details. |

351 | * @param set the set |

352 | * @return TRUE/FALSE for whether the set has been frozen |

353 | * @see uset_freeze |

354 | * @see uset_cloneAsThawed |

355 | * @stable ICU 3.8 |

356 | */ |

357 | U_STABLE UBool U_EXPORT2 |

358 | uset_isFrozen(const USet *set); |

359 | |

360 | /** |

361 | * Freeze the set (make it immutable). |

362 | * Once frozen, it cannot be unfrozen and is therefore thread-safe |

363 | * until it is deleted. |

364 | * See the ICU4J Freezable interface for details. |

365 | * Freezing the set may also make some operations faster, for example |

366 | * uset_contains() and uset_span(). |

367 | * A frozen set will not be modified. (It remains frozen.) |

368 | * @param set the set to freeze; the set passed in is itself frozen |

369 | * Note: this function is declared void, so no value is returned to the caller. |

370 | * @see uset_isFrozen |

371 | * @see uset_cloneAsThawed |

372 | * @stable ICU 3.8 |

373 | */ |

374 | U_STABLE void U_EXPORT2 |

375 | uset_freeze(USet *set); |

376 | |

377 | /** |

378 | * Clone the set and make the clone mutable. |

379 | * See the ICU4J Freezable interface for details. |

380 | * @param set the set |

381 | * @return the mutable clone |

382 | * @see uset_freeze |

383 | * @see uset_isFrozen |

384 | * @see uset_clone |

385 | * @stable ICU 3.8 |

386 | */ |

387 | U_STABLE USet * U_EXPORT2 |

388 | uset_cloneAsThawed(const USet *set); |

389 | |

390 | /** |

391 | * Causes the USet object to represent the range <code>start - end</code>. |

392 | * If <code>start > end</code> then this USet is set to an empty range. |

393 | * A frozen set will not be modified. |

394 | * @param set the object to set to the given range |

395 | * @param start first character in the set, inclusive |

396 | * @param end last character in the set, inclusive |

397 | * @stable ICU 3.2 |

398 | */ |

399 | U_STABLE void U_EXPORT2 |

400 | uset_set(USet* set, |

401 | UChar32 start, UChar32 end); |

402 | |

403 | /** |

404 | * Modifies the set to represent the set specified by the given |

405 | * pattern. See the UnicodeSet class description for the syntax of |

406 | * the pattern language. See also the User Guide chapter about UnicodeSet. |

407 | * <em>Empties the set passed before applying the pattern.</em> |

408 | * A frozen set will not be modified. |

409 | * @param set The set to which the pattern is to be applied. |

410 | * @param pattern A pointer to the UChar string specifying what characters are in the set. |

411 | * The character at pattern[0] must be a '['. |

412 | * @param patternLength The length of the UChar string, or -1 if it is NUL-terminated. |

413 | * @param options A bitmask for options to apply to the pattern. |

414 | * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. |

415 | * @param status Returns an error if the pattern cannot be parsed. |

416 | * @return Upon successful parse, the value is |

417 | * the index of the character after the closing ']' |

418 | * of the parsed pattern. |

419 | * If the status code indicates failure, then the return value |

420 | * is the index of the error in the source. |

421 | * |

422 | * @stable ICU 2.8 |

423 | */ |

424 | U_STABLE int32_t U_EXPORT2 |

425 | uset_applyPattern(USet *set, |

426 | const UChar *pattern, int32_t patternLength, |

427 | uint32_t options, |

428 | UErrorCode *status); |

429 | |

430 | /** |

431 | * Modifies the set to contain those code points which have the given value |

432 | * for the given binary or enumerated property, as returned by |

433 | * u_getIntPropertyValue. Prior contents of this set are lost. |

434 | * A frozen set will not be modified. |

435 | * |

436 | * @param set the object to contain the code points defined by the property |

437 | * |

438 | * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 |

439 | * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 |

440 | * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. |

441 | * |

442 | * @param value a value in the range u_getIntPropertyMinValue(prop).. |

443 | * u_getIntPropertyMaxValue(prop), with one exception. If prop is |

444 | * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but |

445 | * rather a mask value produced by U_GET_GC_MASK(). This allows grouped |

446 | * categories such as [:L:] to be represented. |

447 | * |

448 | * @param ec error code input/output parameter |

449 | * |

450 | * @stable ICU 3.2 |

451 | */ |

452 | U_STABLE void U_EXPORT2 |

453 | uset_applyIntPropertyValue(USet* set, |

454 | UProperty prop, int32_t value, UErrorCode* ec); |

455 | |

456 | /** |

457 | * Modifies the set to contain those code points which have the |

458 | * given value for the given property. Prior contents of this |

459 | * set are lost. |

460 | * A frozen set will not be modified. |

461 | * |

462 | * @param set the object to contain the code points defined by the given |

463 | * property and value alias |

464 | * |

465 | * @param prop a string specifying a property alias, either short or long. |

466 | * The name is matched loosely. See PropertyAliases.txt for names and a |

467 | * description of loose matching. If the value string is empty, then this |

468 | * string is interpreted as either a General_Category value alias, a Script |

469 | * value alias, a binary property alias, or a special ID. Special IDs are |

470 | * matched loosely and correspond to the following sets: |

471 | * |

472 | * "ANY" = [\\u0000-\\U0010FFFF], |

473 | * "ASCII" = [\\u0000-\\u007F], |

474 | * "Assigned" = [:^Cn:]. |

475 | * |

476 | * @param propLength the length of the prop string, or -1 if it is NUL-terminated |

477 | * |

478 | * @param value a string specifying a value alias, either short or long. |

479 | * The name is matched loosely. See PropertyValueAliases.txt for names |

480 | * and a description of loose matching. In addition to aliases listed, |

481 | * numeric values and canonical combining classes may be expressed |

482 | * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string |

483 | * may also be empty. |

484 | * |

485 | * @param valueLength the length of the value string, or -1 if it is NUL-terminated |

486 | * |

487 | * @param ec error code input/output parameter |

488 | * |

489 | * @stable ICU 3.2 |

490 | */ |

491 | U_STABLE void U_EXPORT2 |

492 | uset_applyPropertyAlias(USet* set, |

493 | const UChar *prop, int32_t propLength, |

494 | const UChar *value, int32_t valueLength, |

495 | UErrorCode* ec); |

496 | |

497 | /** |

498 | * Return true if the given position, in the given pattern, appears |

499 | * to be the start of a UnicodeSet pattern. |

500 | * |

501 | * @param pattern a string specifying the pattern |

502 | * @param patternLength the length of the pattern, or -1 if it is NUL-terminated |

503 | * @param pos the position within the pattern to examine |

504 | * @stable ICU 3.2 |

505 | */ |

506 | U_STABLE UBool U_EXPORT2 |

507 | uset_resemblesPattern(const UChar *pattern, int32_t patternLength, |

508 | int32_t pos); |

509 | |

510 | /** |

511 | * Returns a string representation of this set. If the result of |

512 | * calling this function is passed to a uset_openPattern(), it |

513 | * will produce another set that is equal to this one. |

514 | * @param set the set |

515 | * @param result the string to receive the rules, may be NULL |

516 | * @param resultCapacity the capacity of result, may be 0 if result is NULL |

517 | * @param escapeUnprintable if TRUE then convert unprintable |

518 | * character to their hex escape representations, \\uxxxx or |

519 | * \\Uxxxxxxxx. Unprintable characters are those other than |

520 | * U+000A, U+0020..U+007E. |

521 | * @param ec error code. |

522 | * @return length of string, possibly larger than resultCapacity |

523 | * @stable ICU 2.4 |

524 | */ |

525 | U_STABLE int32_t U_EXPORT2 |

526 | uset_toPattern(const USet* set, |

527 | UChar* result, int32_t resultCapacity, |

528 | UBool escapeUnprintable, |

529 | UErrorCode* ec); |

530 | |

531 | /** |

532 | * Adds the given character to the given USet. After this call, |

533 | * uset_contains(set, c) will return TRUE. |

534 | * A frozen set will not be modified. |

535 | * @param set the object to which to add the character |

536 | * @param c the character to add |

537 | * @stable ICU 2.4 |

538 | */ |

539 | U_STABLE void U_EXPORT2 |

540 | uset_add(USet* set, UChar32 c); |

541 | |

542 | /** |

543 | * Adds all of the elements in the specified set to this set if |

544 | * they're not already present. This operation effectively |

545 | * modifies this set so that its value is the <i>union</i> of the two |

546 | * sets. The behavior of this operation is unspecified if the specified |

547 | * collection is modified while the operation is in progress. |

548 | * A frozen set will not be modified. |

549 | * |

550 | * @param set the object to which to add the set |

551 | * @param additionalSet the source set whose elements are to be added to this set. |

552 | * @stable ICU 2.6 |

553 | */ |

554 | U_STABLE void U_EXPORT2 |

555 | uset_addAll(USet* set, const USet *additionalSet); |

556 | |

557 | /** |

558 | * Adds the given range of characters to the given USet. After this call, |

559 | * uset_containsRange(set, start, end) will return TRUE. |

560 | * A frozen set will not be modified. |

561 | * @param set the object to which to add the range of characters |

562 | * @param start the first character of the range to add, inclusive |

563 | * @param end the last character of the range to add, inclusive |

564 | * @stable ICU 2.2 |

565 | */ |

566 | U_STABLE void U_EXPORT2 |

567 | uset_addRange(USet* set, UChar32 start, UChar32 end); |

568 | |

569 | /** |

570 | * Adds the given string to the given USet. After this call, |

571 | * uset_containsString(set, str, strLen) will return TRUE. |

572 | * A frozen set will not be modified. |

573 | * @param set the object to which to add the string |

574 | * @param str the string to add |

575 | * @param strLen the length of the string or -1 if null terminated. |

576 | * @stable ICU 2.4 |

577 | */ |

578 | U_STABLE void U_EXPORT2 |

579 | uset_addString(USet* set, const UChar* str, int32_t strLen); |

580 | |

581 | /** |

582 | * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} |

583 | * If this set already contains any particular character, it has no effect on that character. |

584 | * A frozen set will not be modified. |

585 | * @param set the object to which to add the characters |

586 | * @param str the source string |

587 | * @param strLen the length of the string or -1 if null terminated. |

588 | * @stable ICU 3.4 |

589 | */ |

590 | U_STABLE void U_EXPORT2 |

591 | uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); |

592 | |

593 | /** |

594 | * Removes the given character from the given USet. After this call, |

595 | * uset_contains(set, c) will return FALSE. |

596 | * A frozen set will not be modified. |

597 | * @param set the object from which to remove the character |

598 | * @param c the character to remove |

599 | * @stable ICU 2.4 |

600 | */ |

601 | U_STABLE void U_EXPORT2 |

602 | uset_remove(USet* set, UChar32 c); |

603 | |

604 | /** |

605 | * Removes the given range of characters from the given USet. After this call, |

606 | * uset_contains(set, c) will return FALSE for every c in start..end. |

607 | * A frozen set will not be modified. |

608 | * @param set the object from which to remove the range of characters |

609 | * @param start the first character of the range to remove, inclusive |

610 | * @param end the last character of the range to remove, inclusive |

611 | * @stable ICU 2.2 |

612 | */ |

613 | U_STABLE void U_EXPORT2 |

614 | uset_removeRange(USet* set, UChar32 start, UChar32 end); |

615 | |

616 | /** |

617 | * Removes the given string from the given USet. After this call, |

618 | * uset_containsString(set, str, strLen) will return FALSE. |

619 | * A frozen set will not be modified. |

620 | * @param set the object from which to remove the string |

621 | * @param str the string to remove |

622 | * @param strLen the length of the string or -1 if null terminated. |

623 | * @stable ICU 2.4 |

624 | */ |

625 | U_STABLE void U_EXPORT2 |

626 | uset_removeString(USet* set, const UChar* str, int32_t strLen); |

627 | |

628 | /** |

629 | * Removes from this set all of its elements that are contained in the |

630 | * specified set. This operation effectively modifies this |

631 | * set so that its value is the <i>asymmetric set difference</i> of |

632 | * the two sets. |

633 | * A frozen set will not be modified. |

634 | * @param set the object from which the elements are to be removed |

635 | * @param removeSet the object that defines which elements will be |

636 | * removed from this set |

637 | * @stable ICU 3.2 |

638 | */ |

639 | U_STABLE void U_EXPORT2 |

640 | uset_removeAll(USet* set, const USet* removeSet); |

641 | |

642 | /** |

643 | * Retain only the elements in this set that are contained in the |

644 | * specified range. If <code>start > end</code> then an empty range is |

645 | * retained, leaving the set empty. This is equivalent to |

646 | * a boolean logic AND, or a set INTERSECTION. |

647 | * A frozen set will not be modified. |

648 | * |

649 | * @param set the object for which to retain only the specified range |

650 | * @param start first character, inclusive, of range to be retained |

651 | * to this set. |

652 | * @param end last character, inclusive, of range to be retained |

653 | * to this set. |

654 | * @stable ICU 3.2 |

655 | */ |

656 | U_STABLE void U_EXPORT2 |

657 | uset_retain(USet* set, UChar32 start, UChar32 end); |

658 | |

659 | /** |

660 | * Retains only the elements in this set that are contained in the |

661 | * specified set. In other words, removes from this set all of |

662 | * its elements that are not contained in the specified set. This |

663 | * operation effectively modifies this set so that its value is |

664 | * the <i>intersection</i> of the two sets. |

665 | * A frozen set will not be modified. |

666 | * |

667 | * @param set the object on which to perform the retain |

668 | * @param retain set that defines which elements this set will retain |

669 | * @stable ICU 3.2 |

670 | */ |

671 | U_STABLE void U_EXPORT2 |

672 | uset_retainAll(USet* set, const USet* retain); |

673 | |

674 | /** |

675 | * Reallocate this object's internal structures to take up the least |

676 | * possible space, without changing this object's value. |

677 | * A frozen set will not be modified. |

678 | * |

679 | * @param set the object on which to perform the compact |

680 | * @stable ICU 3.2 |

681 | */ |

682 | U_STABLE void U_EXPORT2 |

683 | uset_compact(USet* set); |

684 | |

685 | /** |

686 | * Inverts this set. This operation modifies this set so that |

687 | * its value is its complement. This operation does not affect |

688 | * the multicharacter strings, if any. |

689 | * A frozen set will not be modified. |

690 | * @param set the set |

691 | * @stable ICU 2.4 |

692 | */ |

693 | U_STABLE void U_EXPORT2 |

694 | uset_complement(USet* set); |

695 | |

696 | /** |

697 | * Complements in this set all elements contained in the specified |

698 | * set. Any character in the other set will be removed if it is |

699 | * in this set, or will be added if it is not in this set. |

700 | * A frozen set will not be modified. |

701 | * |

702 | * @param set the set with which to complement |

703 | * @param complement set that defines which elements will be xor'ed |

704 | * from this set. |

705 | * @stable ICU 3.2 |

706 | */ |

707 | U_STABLE void U_EXPORT2 |

708 | uset_complementAll(USet* set, const USet* complement); |

709 | |

710 | /** |

711 | * Removes all of the elements from this set. This set will be |

712 | * empty after this call returns. |

713 | * A frozen set will not be modified. |

714 | * @param set the set |

715 | * @stable ICU 2.4 |

716 | */ |

717 | U_STABLE void U_EXPORT2 |

718 | uset_clear(USet* set); |

719 | |

720 | /** |

721 | * Close this set over the given attribute. For the attribute |

722 | * USET_CASE, the result is to modify this set so that: |

723 | * |

724 | * 1. For each character or string 'a' in this set, all strings or |

725 | * characters 'b' such that foldCase(a) == foldCase(b) are added |

726 | * to this set. |

727 | * |

728 | * 2. For each string 'e' in the resulting set, if e != |

729 | * foldCase(e), 'e' will be removed. |

730 | * |

731 | * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] |

732 | * |

733 | * (Here foldCase(x) refers to the operation u_strFoldCase, and a |

734 | * == b denotes that the contents are the same, not pointer |

735 | * comparison.) |

736 | * |

737 | * A frozen set will not be modified. |

738 | * |

739 | * @param set the set |

740 | * |

741 | * @param attributes bitmask for attributes to close over. |

742 | * Currently only the USET_CASE bit is supported. Any undefined bits |

743 | * are ignored. NOTE(review): no USET_CASE constant is declared in the visible part of this header; presumably USET_CASE_INSENSITIVE is meant - confirm. |

744 | * @stable ICU 4.2 |

745 | */ |

746 | U_STABLE void U_EXPORT2 |

747 | uset_closeOver(USet* set, int32_t attributes); |

748 | |

749 | /** |

750 | * Remove all strings from this set. |

751 | * |

752 | * @param set the set |

753 | * @stable ICU 4.2 |

754 | */ |

755 | U_STABLE void U_EXPORT2 |

756 | uset_removeAllStrings(USet* set); |

757 | |

758 | /** |

759 | * Returns TRUE if the given USet contains no characters and no |

760 | * strings. |

761 | * @param set the set to test |

762 | * @return TRUE if set contains neither characters nor strings |

763 | * @stable ICU 2.4 |

764 | */ |

765 | U_STABLE UBool U_EXPORT2 |

766 | uset_isEmpty(const USet* set); |

767 | |

768 | /** |

769 | * Returns TRUE if the given USet contains the given character. |

770 | * This function works faster with a frozen set. |

771 | * @param set the set to search |

772 | * @param c the code point to check for within the set |

773 | * @return TRUE if set contains c |

774 | * @stable ICU 2.4 |

775 | */ |

776 | U_STABLE UBool U_EXPORT2 |

777 | uset_contains(const USet* set, UChar32 c); |

778 | |

779 | /** |

780 | * Returns TRUE if the given USet contains all characters c |

781 | * where start <= c && c <= end. |

782 | * @param set the set to search |

783 | * @param start the first character of the range to test, inclusive |

784 | * @param end the last character of the range to test, inclusive |

785 | * @return TRUE if set contains every character of the range |

786 | * @stable ICU 2.2 |

787 | */ |

788 | U_STABLE UBool U_EXPORT2 |

789 | uset_containsRange(const USet* set, UChar32 start, UChar32 end); |

790 | |

791 | /** |

792 | * Returns TRUE if the given USet contains the given string. |

793 | * @param set the set to search |

794 | * @param str the string to look for |

795 | * @param strLen the length of the string, or -1 if it is NUL-terminated. |

796 | * @return TRUE if set contains str |

797 | * @stable ICU 2.4 |

798 | */ |

799 | U_STABLE UBool U_EXPORT2 |

800 | uset_containsString(const USet* set, const UChar* str, int32_t strLen); |

801 | |

802 | /** |

803 | * Returns the index of the given character within this set, where |

804 | * the set is ordered by ascending code point. If the character |

805 | * is not in this set, return -1. The inverse of this function is |

806 | * <code>uset_charAt()</code>. |

807 | * @param set the set to search |

808 | * @param c the character to obtain the index for |

809 | * @return an index from 0..size()-1, or -1 |

810 | * @stable ICU 3.2 |

811 | */ |

812 | U_STABLE int32_t U_EXPORT2 |

813 | uset_indexOf(const USet* set, UChar32 c); |

814 | |

815 | /** |

816 | * Returns the character at the given index within this set, where |

817 | * the set is ordered by ascending code point. If the index is |

818 | * out of range, return (UChar32)-1. The inverse of this function is |

819 | * <code>uset_indexOf()</code>. |

820 | * @param set the set to read from |

821 | * @param charIndex an index from 0..size()-1 to obtain the char for |

822 | * @return the character at the given index, or (UChar32)-1. |

823 | * @stable ICU 3.2 |

824 | */ |

825 | U_STABLE UChar32 U_EXPORT2 |

826 | uset_charAt(const USet* set, int32_t charIndex); |

827 | |

828 | /** |

829 | * Returns the number of characters and strings contained in the given |

830 | * USet. |

831 | * @param set the set whose characters and strings are counted |

832 | * @return a non-negative integer counting the characters and strings |

833 | * contained in set |

834 | * @stable ICU 2.4 |

835 | */ |

836 | U_STABLE int32_t U_EXPORT2 |

837 | uset_size(const USet* set); |

838 | |

839 | /** |

840 | * Returns the number of items in this set. An item is either a range |

841 | * of characters or a single multicharacter string. |

842 | * @param set the set whose items are counted |

843 | * @return a non-negative integer counting the character ranges |

844 | * and/or strings contained in set |

845 | * @stable ICU 2.4 |

846 | */ |

847 | U_STABLE int32_t U_EXPORT2 |

848 | uset_getItemCount(const USet* set); |

849 | |

850 | /** |

851 | * Returns an item of this set. An item is either a range of |

852 | * characters or a single multicharacter string. |

853 | * @param set the set to read the item from |

854 | * @param itemIndex a non-negative integer in the range 0.. |

855 | * uset_getItemCount(set)-1 |

856 | * @param start pointer to variable to receive the first character |

857 | * in the range, inclusive |

858 | * @param end pointer to variable to receive the last character in the range, |

859 | * inclusive |

860 | * @param str buffer to receive the string, may be NULL |

861 | * @param strCapacity capacity of str, or 0 if str is NULL |

862 | * @param ec pointer to the error code |

863 | * @return the length of the string (>= 2), or 0 if the item is a |

864 | * range, in which case it is the range *start..*end, or -1 if |

865 | * itemIndex is out of range |

866 | * @stable ICU 2.4 |

867 | */ |

868 | U_STABLE int32_t U_EXPORT2 |

869 | uset_getItem(const USet* set, int32_t itemIndex, |

870 | UChar32* start, UChar32* end, |

871 | UChar* str, int32_t strCapacity, |

872 | UErrorCode* ec); |

873 | |

874 | /** |

875 | * Returns true if set1 contains all the characters and strings |

876 | * of set2. It answers the question, 'Is set1 a superset of set2?' |

877 | * @param set1 the set tested for being the superset |

878 | * @param set2 the set whose characters and strings are looked up in set1 |

879 | * @return true if the test condition is met |

880 | * @stable ICU 3.2 |

881 | */ |

882 | U_STABLE UBool U_EXPORT2 |

883 | uset_containsAll(const USet* set1, const USet* set2); |

884 | |

885 | /** |

886 | * Returns true if this set contains all the characters |

887 | * of the given string. This does not check containment of grapheme |

888 | * clusters, like uset_containsString. |

889 | * @param set set of characters to be checked for containment |

890 | * @param str string containing codepoints to be checked for containment |

891 | * @param strLen the length of the string, or -1 if it is NUL-terminated. |

892 | * @return true if the test condition is met |

893 | * @stable ICU 3.4 |

894 | */ |

895 | U_STABLE UBool U_EXPORT2 |

896 | uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); |

897 | |

898 | /** |

899 | * Returns true if set1 contains none of the characters and strings |

900 | * of set2. It answers the question, 'Are set1 and set2 disjoint?' |

901 | * @param set1 the set to be searched |

902 | * @param set2 the set whose characters and strings are looked up in set1 |

903 | * @return true if the test condition is met |

904 | * @stable ICU 3.2 |

905 | */ |

906 | U_STABLE UBool U_EXPORT2 |

907 | uset_containsNone(const USet* set1, const USet* set2); |

908 | |

909 | /** |

910 | * Returns true if set1 contains some of the characters and strings |

911 | * of set2. It answers the question, 'Do set1 and set2 have an intersection?' |

912 | * @param set1 the set to be searched |

913 | * @param set2 the set whose characters and strings are looked up in set1 |

914 | * @return true if the test condition is met |

915 | * @stable ICU 3.2 |

916 | */ |

917 | U_STABLE UBool U_EXPORT2 |

918 | uset_containsSome(const USet* set1, const USet* set2); |

919 | |

920 | /** |

921 | * Returns the length of the initial substring of the input string which |

922 | * consists only of characters and strings that are contained in this set |

923 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |

924 | * or only of characters and strings that are not contained |

925 | * in this set (USET_SPAN_NOT_CONTAINED). |

926 | * See USetSpanCondition for details. |

927 | * Similar to the strspn() C library function. |

928 | * Unpaired surrogates are treated according to contains() of their surrogate code points. |

929 | * This function works faster with a frozen set and with a non-negative string length argument. |

930 | * @param set the set |

931 | * @param s start of the string |

932 | * @param length the length of the string; can be -1 for NUL-terminated |

933 | * @param spanCondition specifies the containment condition |

934 | * @return the length of the initial substring according to the spanCondition; |

935 | * 0 if the start of the string does not fit the spanCondition |

936 | * @stable ICU 3.8 |

937 | * @see USetSpanCondition |

938 | */ |

939 | U_STABLE int32_t U_EXPORT2 |

940 | uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); |

941 | |

942 | /** |

943 | * Returns the start of the trailing substring of the input string which |

944 | * consists only of characters and strings that are contained in this set |

945 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |

946 | * or only of characters and strings that are not contained |

947 | * in this set (USET_SPAN_NOT_CONTAINED). |

948 | * See USetSpanCondition for details. |

949 | * Unpaired surrogates are treated according to contains() of their surrogate code points. |

950 | * This function works faster with a frozen set and with a non-negative string length argument. |

951 | * @param set the set |

952 | * @param s start of the string |

953 | * @param length the length of the string; can be -1 for NUL-terminated |

954 | * @param spanCondition specifies the containment condition |

955 | * @return the start of the trailing substring according to the spanCondition; |

956 | * the string length if the end of the string does not fit the spanCondition |

957 | * @stable ICU 3.8 |

958 | * @see USetSpanCondition |

959 | */ |

960 | U_STABLE int32_t U_EXPORT2 |

961 | uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); |

962 | |

963 | /** |

964 | * Returns the length of the initial substring of the input string which |

965 | * consists only of characters and strings that are contained in this set |

966 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |

967 | * or only of characters and strings that are not contained |

968 | * in this set (USET_SPAN_NOT_CONTAINED). |

969 | * See USetSpanCondition for details. |

970 | * Similar to the strspn() C library function. |

971 | * Malformed byte sequences are treated according to contains(0xfffd). |

972 | * This function works faster with a frozen set and with a non-negative string length argument. |

973 | * @param set the set |

974 | * @param s start of the string (UTF-8) |

975 | * @param length the length of the string; can be -1 for NUL-terminated |

976 | * @param spanCondition specifies the containment condition |

977 | * @return the length of the initial substring according to the spanCondition; |

978 | * 0 if the start of the string does not fit the spanCondition |

979 | * @stable ICU 3.8 |

980 | * @see USetSpanCondition |

981 | */ |

982 | U_STABLE int32_t U_EXPORT2 |

983 | uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); |

984 | |

985 | /** |

986 | * Returns the start of the trailing substring of the input string which |

987 | * consists only of characters and strings that are contained in this set |

988 | * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), |

989 | * or only of characters and strings that are not contained |

990 | * in this set (USET_SPAN_NOT_CONTAINED). |

991 | * See USetSpanCondition for details. |

992 | * Malformed byte sequences are treated according to contains(0xfffd). |

993 | * This function works faster with a frozen set and with a non-negative string length argument. |

994 | * @param set the set |

995 | * @param s start of the string (UTF-8) |

996 | * @param length the length of the string; can be -1 for NUL-terminated |

997 | * @param spanCondition specifies the containment condition |

998 | * @return the start of the trailing substring according to the spanCondition; |

999 | * the string length if the end of the string does not fit the spanCondition |

1000 | * @stable ICU 3.8 |

1001 | * @see USetSpanCondition |

1002 | */ |

1003 | U_STABLE int32_t U_EXPORT2 |

1004 | uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); |

1005 | |

1006 | /** |

1007 | * Returns true if set1 contains all of the characters and strings |

1008 | * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' |

1009 | * @param set1 the first set to compare |

1010 | * @param set2 the second set to compare |

1011 | * @return true if the test condition is met |

1012 | * @stable ICU 3.2 |

1013 | */ |

1014 | U_STABLE UBool U_EXPORT2 |

1015 | uset_equals(const USet* set1, const USet* set2); |

1016 | |

1017 | /********************************************************************* |

1018 | * Serialized set API |

1019 | *********************************************************************/ |

1020 | |

1021 | /** |

1022 | * Serializes this set into an array of 16-bit integers. Serialization |

1023 | * (currently) only records the characters in the set; multicharacter |

1024 | * strings are ignored. |

1025 | * |

1026 | * The array |

1027 | * has the following format (each line is one 16-bit integer): |

1028 | * |

1029 | * length = (n+2*m) | (m!=0?0x8000:0) |

1030 | * bmpLength = n; present if m!=0 |

1031 | * bmp[0] |

1032 | * bmp[1] |

1033 | * ... |

1034 | * bmp[n-1] |

1035 | * supp-high[0] |

1036 | * supp-low[0] |

1037 | * supp-high[1] |

1038 | * supp-low[1] |

1039 | * ... |

1040 | * supp-high[m-1] |

1041 | * supp-low[m-1] |

1042 | * |

1043 | * The array starts with a header. After the header are n bmp |

1044 | * code points, then m supplementary code points. Either n or m |

1045 | * or both may be zero. n+2*m is always <= 0x7FFF. |

1046 | * |

1047 | * If there are no supplementary characters (if m==0) then the |

1048 | * header is one 16-bit integer, 'length', with value n. |

1049 | * |

1050 | * If there are supplementary characters (if m!=0) then the header |

1051 | * is two 16-bit integers. The first, 'length', has value |

1052 | * (n+2*m)|0x8000. The second, 'bmpLength', has value n. |

1053 | * |

1054 | * After the header the code points are stored in ascending order. |

1055 | * Supplementary code points are stored as most significant 16 |

1056 | * bits followed by least significant 16 bits. |

1057 | * |

1058 | * @param set the set to serialize |

1059 | * @param dest pointer to buffer of destCapacity 16-bit integers. |

1060 | * May be NULL only if destCapacity is zero. |

1061 | * @param destCapacity size of dest in 16-bit integers, or zero. Must not be negative. |

1062 | * @param pErrorCode pointer to the error code. Will be set to |

1063 | * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to |

1064 | * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. |

1065 | * @return the total length of the serialized format, including |

1066 | * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other |

1067 | * than U_BUFFER_OVERFLOW_ERROR. |

1068 | * @stable ICU 2.4 |

1069 | */ |

1070 | U_STABLE int32_t U_EXPORT2 |

1071 | uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); |

1072 | |

1073 | /** |

1074 | * Given a serialized array, fill in the given serialized set object. |

1075 | * @param fillSet pointer to the serialized set object to fill in |

1076 | * @param src pointer to the start of the array |

1077 | * @param srcLength length of the array |

1078 | * @return true if the given array is valid, otherwise false |

1079 | * @stable ICU 2.4 |

1080 | */ |

1081 | U_STABLE UBool U_EXPORT2 |

1082 | uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); |

1083 | |

1084 | /** |

1085 | * Set the USerializedSet to contain the given character (and nothing |

1086 | * else). |

1087 | * @param fillSet pointer to the serialized set object to fill in |

1088 | * @param c the code point the set is to contain |

1089 | * @stable ICU 2.4 |

1090 | */ |

1091 | U_STABLE void U_EXPORT2 |

1092 | uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); |

1093 | |

1094 | /** |

1095 | * Returns TRUE if the given USerializedSet contains the given |

1096 | * character. |

1097 | * @param set the serialized set to search |

1098 | * @param c the code point to check for within the set |

1099 | * @return TRUE if set contains c |

1100 | * @stable ICU 2.4 |

1101 | */ |

1102 | U_STABLE UBool U_EXPORT2 |

1103 | uset_serializedContains(const USerializedSet* set, UChar32 c); |

1104 | |

1105 | /** |

1106 | * Returns the number of disjoint ranges of characters contained in |

1107 | * the given serialized set. Ignores any strings contained in the |

1108 | * set. |

1109 | * @param set the serialized set whose ranges are counted |

1110 | * @return a non-negative integer counting the character ranges |

1111 | * contained in set |

1112 | * @stable ICU 2.4 |

1113 | */ |

1114 | U_STABLE int32_t U_EXPORT2 |

1115 | uset_getSerializedRangeCount(const USerializedSet* set); |

1116 | |

1117 | /** |

1118 | * Returns a range of characters contained in the given serialized |

1119 | * set. |

1120 | * @param set the serialized set to read the range from |

1121 | * @param rangeIndex a non-negative integer in the range 0.. |

1122 | * uset_getSerializedRangeCount(set)-1 |

1123 | * @param pStart pointer to variable to receive the first character |

1124 | * in the range, inclusive |

1125 | * @param pEnd pointer to variable to receive the last character in the range, |

1126 | * inclusive |

1127 | * @return true if rangeIndex is valid, otherwise false |

1128 | * @stable ICU 2.4 |

1129 | */ |

1130 | U_STABLE UBool U_EXPORT2 |

1131 | uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, |

1132 | UChar32* pStart, UChar32* pEnd); |

1133 | |

1134 | #endif |

1135 |