Unicode.h source code [llvm/include/llvm/Support/Unicode.h]

//===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file defines functions that allow querying certain properties of Unicode
10	// characters.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_SUPPORT_UNICODE_H
15	#define LLVM_SUPPORT_UNICODE_H
16
17	#include "llvm/ADT/SmallString.h"
18	#include <optional>
19	#include <string>
20
21	namespace llvm {
22	class StringRef;
23
24	namespace sys {
25	namespace unicode {
26
/// Negative sentinel values that share the \c int return channel with real
/// column widths (see columnWidthUTF8). The enum is deliberately unscoped so
/// the enumerators convert implicitly to \c int.
enum ColumnWidthErrors {
  /// NOTE(review): presumably signals input that is not well-formed UTF-8
  /// (inferred from the name only; confirm against the implementation).
  ErrorInvalidUTF8 = -2,
  /// The text contains a character that isPrintable rejects, as documented on
  /// columnWidthUTF8.
  ErrorNonPrintableCharacter = -1
};
31
/// Determines if a character is likely to be displayed correctly on the
/// terminal. An exact implementation would have to depend on the specific
/// terminal, so we define semantics that should be suitable for the generic
/// case of a terminal capable of outputting Unicode characters.
///
/// Printable codepoints are those in the categories L, M, N, P, S and Zs.
///
/// \param UCS The codepoint to classify.
/// \return true if the character is considered printable.
bool isPrintable(int UCS);
40
/// Determines if a character is a formatting codepoint.
///
/// Formatting codepoints are codepoints in the Cf category.
///
/// \param UCS The codepoint to classify.
/// \return true if the character is considered a formatting codepoint.
bool isFormatting(int UCS);
43
44	/// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
45	/// when output on a terminal ("character width"). This depends on the
46	/// implementation of the terminal, and there's no standard definition of
47	/// character width.
48	///
49	/// The implementation defines it in a way that is expected to be compatible
50	/// with a generic Unicode-capable terminal.
51	///
52	/// \return Character width:
53	/// ErrorNonPrintableCharacter (-1) if \p Text contains non-printable*
54	/// characters (as identified by isPrintable);
55	/// 0 for each non-spacing and enclosing combining mark;*
56	/// 2 for each CJK character excluding halfwidth forms;*
57	/// 1 for each of the remaining characters.*
58	int columnWidthUTF8(StringRef Text);
59
/// Folds the input Unicode character according to the Simple Unicode case
/// folding rules.
///
/// \param C The character to fold.
/// \return The case-folded character.
int foldCharSimple(int C);
63
64	/// Maps the name or the alias of a Unicode character to its associated
65	/// codepoints.
66	/// The names and aliases are derived from UnicodeData.txt and NameAliases.txt
67	/// For compatibility with the semantics of named character escape sequences in
68	/// C++, this mapping does an exact match sensitive to casing and spacing.
69	/// \return The codepoint of the corresponding character, if any.
70	std::optional<char32_t> nameToCodepointStrict(StringRef Name);
71
72	struct LooseMatchingResult {
73	char32_t CodePoint;
74	SmallString<`64`> Name;
75	};
76
77	std::optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name);
78
/// Element type of the list returned by nearestMatchesForCodepointName.
struct MatchForCodepointName {
  /// Name of the candidate character.
  std::string Name;
  /// NOTE(review): presumably the edit distance between the queried pattern
  /// and \c Name (smaller meaning closer); confirm against the implementation.
  uint32_t Distance = 0;
  /// Codepoint of the candidate character.
  char32_t Value = 0;
};
84
85	SmallVector<MatchForCodepointName>
86	nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount);
87
88	} // namespace unicode
89	} // namespace sys
90	} // namespace llvm
91
92	#endif
93

source code of llvm/include/llvm/Support/Unicode.h