ConvertUTFWrapper.cpp source code [llvm/lib/Support/ConvertUTFWrapper.cpp]

//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>
16
17	namespace llvm {
18
19	bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
20	char &ResultPtr, const* UTF8 *&ErrorPtr) {
21	assert(WideCharWidth == `1` \|\| WideCharWidth == `2` \|\| WideCharWidth == `4`);
22	ConversionResult result = conversionOK;
23	// Copy the character span over.
24	if (WideCharWidth == `1`) {
25	const UTF8 Pos = reinterpret_cast<const* UTF8*>(Source.begin());
26	if (!isLegalUTF8String(source: &Pos, sourceEnd: reinterpret_cast<const UTF8*>(Source.end()))) {
27	result = sourceIllegal;
28	ErrorPtr = Pos;
29	} else {
30	memcpy(dest: ResultPtr, src: Source.data(), n: Source.size());
31	ResultPtr += Source.size();
32	}
33	} else if (WideCharWidth == `2`) {
34	const UTF8 sourceStart = (const* UTF8*)Source.data();
35	// FIXME: Make the type of the result buffer correct instead of
36	// using reinterpret_cast.
37	UTF16 targetStart = reinterpret_cast<UTF16 >(ResultPtr);
38	ConversionFlags flags = strictConversion;
39	result =
40	ConvertUTF8toUTF16(sourceStart: &sourceStart, sourceEnd: sourceStart + Source.size(),
41	targetStart: &targetStart, targetEnd: targetStart + Source.size(), flags);
42	if (result == conversionOK)
43	ResultPtr = reinterpret_cast<char *>(targetStart);
44	else
45	ErrorPtr = sourceStart;
46	} else if (WideCharWidth == `4`) {
47	const UTF8 sourceStart = (const* UTF8 *)Source.data();
48	// FIXME: Make the type of the result buffer correct instead of
49	// using reinterpret_cast.
50	UTF32 targetStart = reinterpret_cast<UTF32 >(ResultPtr);
51	ConversionFlags flags = strictConversion;
52	result =
53	ConvertUTF8toUTF32(sourceStart: &sourceStart, sourceEnd: sourceStart + Source.size(),
54	targetStart: &targetStart, targetEnd: targetStart + Source.size(), flags);
55	if (result == conversionOK)
56	ResultPtr = reinterpret_cast<char *>(targetStart);
57	else
58	ErrorPtr = sourceStart;
59	}
60	assert((result != targetExhausted) &&
61	"ConvertUTF8toUTFXX exhausted target buffer");
62	return result == conversionOK;
63	}
64
65	bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
66	const UTF32 *SourceStart = &Source;
67	const UTF32 *SourceEnd = SourceStart + `1`;
68	UTF8 TargetStart = reinterpret_cast<UTF8 >(ResultPtr);
69	UTF8 *TargetEnd = TargetStart + `4`;
70	ConversionResult CR = ConvertUTF32toUTF8(
71	sourceStart: &SourceStart, sourceEnd: SourceEnd, targetStart: &TargetStart, targetEnd: TargetEnd, flags: strictConversion);
72	if (CR != conversionOK)
73	return false;
74
75	ResultPtr = reinterpret_cast<char *>(TargetStart);
76	return true;
77	}
78
79	bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
80	return (S.size() >= `2` && ((S [`0`] == `'\xff'` && S [`1`] == `'\xfe'`) \|\|
81	(S [`0`] == `'\xfe'` && S [`1`] == `'\xff'`)));
82	}
83
84	bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
85	assert(Out.empty());
86
87	// Error out on an uneven byte count.
88	if (SrcBytes.size() % `2`)
89	return false;
90
91	// Avoid OOB by returning early on empty input.
92	if (SrcBytes.empty())
93	return true;
94
95	const UTF16 Src = reinterpret_cast<const* UTF16 *>(SrcBytes.begin());
96	const UTF16 SrcEnd = reinterpret_cast<const* UTF16 *>(SrcBytes.end());
97
98	assert((uintptr_t)Src % sizeof(UTF16) == `0`);
99
100	// Byteswap if necessary.
101	std::vector<UTF16> ByteSwapped;
102	if (Src[`0`] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
103	ByteSwapped.insert(position: ByteSwapped.end(), first: Src, last: SrcEnd);
104	for (UTF16 &I : ByteSwapped)
105	I = llvm::byteswap<uint16_t>(V: I);
106	Src = &ByteSwapped [`0`];
107	SrcEnd = &ByteSwapped [ByteSwapped.size() - `1`] + `1`;
108	}
109
110	// Skip the BOM for conversion.
111	if (Src[`0`] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
112	Src++;
113
114	// Just allocate enough space up front. We'll shrink it later. Allocate
115	// enough that we can fit a null terminator without reallocating.
116	Out.resize(n: SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + `1`);
117	UTF8 Dst = reinterpret_cast<UTF8 >(&Out [`0`]);
118	UTF8 *DstEnd = Dst + Out.size();
119
120	ConversionResult CR =
121	ConvertUTF16toUTF8(sourceStart: &Src, sourceEnd: SrcEnd, targetStart: &Dst, targetEnd: DstEnd, flags: strictConversion);
122	assert(CR != targetExhausted);
123
124	if (CR != conversionOK) {
125	Out.clear();
126	return false;
127	}
128
129	Out.resize(n: reinterpret_cast<char *>(Dst) - &Out [`0`]);
130	Out.push_back(c: `0`);
131	Out.pop_back();
132	return true;
133	}
134
135	bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
136	return convertUTF16ToUTF8String(
137	SrcBytes: llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
138	Src.size() * sizeof(UTF16)),
139	Out);
140	}
141
142	bool convertUTF32ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
143	assert(Out.empty());
144
145	// Error out on an uneven byte count.
146	if (SrcBytes.size() % `4`)
147	return false;
148
149	// Avoid OOB by returning early on empty input.
150	if (SrcBytes.empty())
151	return true;
152
153	const UTF32 Src = reinterpret_cast<const* UTF32 *>(SrcBytes.begin());
154	const UTF32 SrcEnd = reinterpret_cast<const* UTF32 *>(SrcBytes.end());
155
156	assert((uintptr_t)Src % sizeof(UTF32) == `0`);
157
158	// Byteswap if necessary.
159	std::vector<UTF32> ByteSwapped;
160	if (Src[`0`] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED) {
161	ByteSwapped.insert(position: ByteSwapped.end(), first: Src, last: SrcEnd);
162	for (UTF32 &I : ByteSwapped)
163	I = llvm::byteswap<uint32_t>(V: I);
164	Src = &ByteSwapped [`0`];
165	SrcEnd = &ByteSwapped [ByteSwapped.size() - `1`] + `1`;
166	}
167
168	// Skip the BOM for conversion.
169	if (Src[`0`] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE)
170	Src++;
171
172	// Just allocate enough space up front. We'll shrink it later. Allocate
173	// enough that we can fit a null terminator without reallocating.
174	Out.resize(n: SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + `1`);
175	UTF8 Dst = reinterpret_cast<UTF8 >(&Out [`0`]);
176	UTF8 *DstEnd = Dst + Out.size();
177
178	ConversionResult CR =
179	ConvertUTF32toUTF8(sourceStart: &Src, sourceEnd: SrcEnd, targetStart: &Dst, targetEnd: DstEnd, flags: strictConversion);
180	assert(CR != targetExhausted);
181
182	if (CR != conversionOK) {
183	Out.clear();
184	return false;
185	}
186
187	Out.resize(n: reinterpret_cast<char *>(Dst) - &Out [`0`]);
188	Out.push_back(c: `0`);
189	Out.pop_back();
190	return true;
191	}
192
193	bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out) {
194	return convertUTF32ToUTF8String(
195	SrcBytes: llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),
196	Src.size() * sizeof(UTF32)),
197	Out);
198	}
199
200	bool convertUTF8ToUTF16String(StringRef SrcUTF8,
201	SmallVectorImpl<UTF16> &DstUTF16) {
202	assert(DstUTF16.empty());
203
204	// Avoid OOB by returning early on empty input.
205	if (SrcUTF8.empty()) {
206	DstUTF16.push_back(Elt: `0`);
207	DstUTF16.pop_back();
208	return true;
209	}
210
211	const UTF8 Src = reinterpret_cast<const* UTF8 *>(SrcUTF8.begin());
212	const UTF8 SrcEnd = reinterpret_cast<const* UTF8 *>(SrcUTF8.end());
213
214	// Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
215	// as UTF-16 should always require the same amount or less code units than the
216	// UTF-8 encoding. Allocate one extra byte for the null terminator though,
217	// so that someone calling DstUTF16.data() gets a null terminated string.
218	// We resize down later so we don't have to worry that this over allocates.
219	DstUTF16.resize(N: SrcUTF8.size()+`1`);
220	UTF16 *Dst = &DstUTF16 [`0`];
221	UTF16 *DstEnd = Dst + DstUTF16.size();
222
223	ConversionResult CR =
224	ConvertUTF8toUTF16(sourceStart: &Src, sourceEnd: SrcEnd, targetStart: &Dst, targetEnd: DstEnd, flags: strictConversion);
225	assert(CR != targetExhausted);
226
227	if (CR != conversionOK) {
228	DstUTF16.clear();
229	return false;
230	}
231
232	DstUTF16.resize(N: Dst - &DstUTF16 [`0`]);
233	DstUTF16.push_back(Elt: `0`);
234	DstUTF16.pop_back();
235	return true;
236	}
237
// The wide-string helpers in this file branch on sizeof(wchar_t) and only
// handle these three widths; reject any other platform at compile time.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");
241
242	template <typename TResult>
243	static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
244	TResult &Result) {
245	// Even in the case of UTF-16, the number of bytes in a UTF-8 string is
246	// at least as large as the number of elements in the resulting wide
247	// string, because surrogate pairs take at least 4 bytes in UTF-8.
248	Result.resize(Source.size() + `1`);
249	char ResultPtr = reinterpret_cast<char* *>(&Result[`0`]);
250	const UTF8 *ErrorPtr;
251	if (!ConvertUTF8toWide(WideCharWidth: sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {
252	Result.clear();
253	return false;
254	}
255	Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[`0`]);
256	return true;
257	}
258
259	bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
260	return ConvertUTF8toWideInternal(Source, Result);
261	}
262
263	bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
264	if (!Source) {
265	Result.clear();
266	return true;
267	}
268	return ConvertUTF8toWide(Source: llvm::StringRef (Source), Result);
269	}
270
271	bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
272	if (sizeof(wchar_t) == `1`) {
273	const UTF8 Start = reinterpret_cast<const* UTF8 *>(Source.data());
274	const UTF8 *End =
275	reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
276	if (!isLegalUTF8String(source: &Start, sourceEnd: End))
277	return false;
278	Result.resize(n: Source.size());
279	memcpy(dest: &Result [`0`], src: Source.data(), n: Source.size());
280	return true;
281	} else if (sizeof(wchar_t) == `2`) {
282	return convertUTF16ToUTF8String(
283	Src: llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
284	Source.size()),
285	Out&: Result);
286	} else if (sizeof(wchar_t) == `4`) {
287	const UTF32 Start = reinterpret_cast<const* UTF32 *>(Source.data());
288	const UTF32 *End =
289	reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
290	Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
291	UTF8 ResultPtr = reinterpret_cast<UTF8 >(&Result [`0`]);
292	UTF8 ResultEnd = reinterpret_cast<UTF8 >(&Result [`0`] + Result.size());
293	if (ConvertUTF32toUTF8(sourceStart: &Start, sourceEnd: End, targetStart: &ResultPtr, targetEnd: ResultEnd,
294	flags: strictConversion) == conversionOK) {
295	Result.resize(n: reinterpret_cast<char *>(ResultPtr) - &Result [`0`]);
296	return true;
297	} else {
298	Result.clear();
299	return false;
300	}
301	} else {
302	llvm_unreachable(
303	"Control should never reach this point; see static_assert further up");
304	}
305	}
306
307	} // end namespace llvm
308
309

source code of llvm/lib/Support/ConvertUTFWrapper.cpp