1//===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "llvm/Support/ConvertUTF.h"
10#include "llvm/ADT/ArrayRef.h"
11#include "gtest/gtest.h"
12#include <string>
13#include <vector>
14
15using namespace llvm;
16
17TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
18 // Src is the look of disapproval.
19 alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
20 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
21 std::string Result;
22 bool Success = convertUTF16ToUTF8String(SrcBytes: Ref, Out&: Result);
23 EXPECT_TRUE(Success);
24 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
25 EXPECT_EQ(Expected, Result);
26}
27
28TEST(ConvertUTFTest, ConvertUTF32LittleEndianToUTF8String) {
29 // Src is the look of disapproval.
30 alignas(UTF32) static const char Src[] =
31 "\xFF\xFE\x00\x00\xA0\x0C\x00\x00\x5F\x00\x00\x00\xA0\x0C\x00\x00";
32 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33 std::string Result;
34 bool Success = convertUTF32ToUTF8String(SrcBytes: Ref, Out&: Result);
35 EXPECT_TRUE(Success);
36 std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
37 EXPECT_EQ(Expected, Result);
38}
39
40TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
41 // Src is the look of disapproval.
42 alignas(UTF16) static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
43 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
44 std::string Result;
45 bool Success = convertUTF16ToUTF8String(SrcBytes: Ref, Out&: Result);
46 EXPECT_TRUE(Success);
47 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
48 EXPECT_EQ(Expected, Result);
49}
50
51TEST(ConvertUTFTest, ConvertUTF32BigEndianToUTF8String) {
52 // Src is the look of disapproval.
53 alignas(UTF32) static const char Src[] =
54 "\x00\x00\xFE\xFF\x00\x00\x0C\xA0\x00\x00\x00\x5F\x00\x00\x0C\xA0";
55 ArrayRef<char> Ref(Src, sizeof(Src) - 1);
56 std::string Result;
57 bool Success = convertUTF32ToUTF8String(SrcBytes: Ref, Out&: Result);
58 EXPECT_TRUE(Success);
59 std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0");
60 EXPECT_EQ(Expected, Result);
61}
62
63TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
64 // Src is the look of disapproval.
65 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
66 StringRef Ref(Src, sizeof(Src) - 1);
67 SmallVector<UTF16, 5> Result;
68 bool Success = convertUTF8ToUTF16String(SrcUTF8: Ref, DstUTF16&: Result);
69 EXPECT_TRUE(Success);
70 static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
71 ASSERT_EQ(3u, Result.size());
72 for (int I = 0, E = 3; I != E; ++I)
73 EXPECT_EQ(Expected[I], Result[I]);
74}
75
76TEST(ConvertUTFTest, OddLengthInput) {
77 std::string Result;
78 bool Success = convertUTF16ToUTF8String(SrcBytes: ArrayRef("xxxxx", 5), Out&: Result);
79 EXPECT_FALSE(Success);
80}
81
82TEST(ConvertUTFTest, Empty) {
83 std::string Result;
84 bool Success =
85 convertUTF16ToUTF8String(SrcBytes: llvm::ArrayRef<char>(std::nullopt), Out&: Result);
86 EXPECT_TRUE(Success);
87 EXPECT_TRUE(Result.empty());
88}
89
90TEST(ConvertUTFTest, HasUTF16BOM) {
91 bool HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xff\xfe", 2));
92 EXPECT_TRUE(HasBOM);
93 HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe\xff", 2));
94 EXPECT_TRUE(HasBOM);
95 HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe\xff ", 3));
96 EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
97 HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe\xff\x00asdf", 6));
98 EXPECT_TRUE(HasBOM);
99
100 HasBOM = hasUTF16ByteOrderMark(SrcBytes: std::nullopt);
101 EXPECT_FALSE(HasBOM);
102 HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe", 1));
103 EXPECT_FALSE(HasBOM);
104}
105
106TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
107 // Src is the look of disapproval.
108 alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
109 ArrayRef<UTF16> SrcRef = ArrayRef((const UTF16 *)Src, 4);
110 std::string Result;
111 bool Success = convertUTF16ToUTF8String(Src: SrcRef, Out&: Result);
112 EXPECT_TRUE(Success);
113 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
114 EXPECT_EQ(Expected, Result);
115}
116
117TEST(ConvertUTFTest, ConvertUTF8toWide) {
118 // Src is the look of disapproval.
119 static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
120 std::wstring Result;
121 bool Success = ConvertUTF8toWide(Source: (const char*)Src, Result);
122 EXPECT_TRUE(Success);
123 std::wstring Expected(L"\x0ca0_\x0ca0");
124 EXPECT_EQ(Expected, Result);
125 Result.clear();
126 Success = ConvertUTF8toWide(Source: StringRef(Src, 7), Result);
127 EXPECT_TRUE(Success);
128 EXPECT_EQ(Expected, Result);
129}
130
131TEST(ConvertUTFTest, convertWideToUTF8) {
132 // Src is the look of disapproval.
133 static const wchar_t Src[] = L"\x0ca0_\x0ca0";
134 std::string Result;
135 bool Success = convertWideToUTF8(Source: Src, Result);
136 EXPECT_TRUE(Success);
137 std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
138 EXPECT_EQ(Expected, Result);
139}
140
141struct ConvertUTFResultContainer {
142 ConversionResult ErrorCode;
143 std::vector<unsigned> UnicodeScalars;
144
145 ConvertUTFResultContainer(ConversionResult ErrorCode)
146 : ErrorCode(ErrorCode) {}
147
148 ConvertUTFResultContainer
149 withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
150 unsigned US2 = 0x110000, unsigned US3 = 0x110000,
151 unsigned US4 = 0x110000, unsigned US5 = 0x110000,
152 unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
153 ConvertUTFResultContainer Result(*this);
154 if (US0 != 0x110000)
155 Result.UnicodeScalars.push_back(x: US0);
156 if (US1 != 0x110000)
157 Result.UnicodeScalars.push_back(x: US1);
158 if (US2 != 0x110000)
159 Result.UnicodeScalars.push_back(x: US2);
160 if (US3 != 0x110000)
161 Result.UnicodeScalars.push_back(x: US3);
162 if (US4 != 0x110000)
163 Result.UnicodeScalars.push_back(x: US4);
164 if (US5 != 0x110000)
165 Result.UnicodeScalars.push_back(x: US5);
166 if (US6 != 0x110000)
167 Result.UnicodeScalars.push_back(x: US6);
168 if (US7 != 0x110000)
169 Result.UnicodeScalars.push_back(x: US7);
170 return Result;
171 }
172};
173
174std::pair<ConversionResult, std::vector<unsigned>>
175ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
176 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
177
178 const UTF8 *SourceNext = SourceStart;
179 std::vector<UTF32> Decoded(S.size(), 0);
180 UTF32 *TargetStart = Decoded.data();
181
182 auto ErrorCode =
183 ConvertUTF8toUTF32(sourceStart: &SourceNext, sourceEnd: SourceStart + S.size(), targetStart: &TargetStart,
184 targetEnd: Decoded.data() + Decoded.size(), flags: lenientConversion);
185
186 Decoded.resize(new_size: TargetStart - Decoded.data());
187
188 return std::make_pair(x&: ErrorCode, y&: Decoded);
189}
190
191std::pair<ConversionResult, std::vector<unsigned>>
192ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
193 const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
194
195 const UTF8 *SourceNext = SourceStart;
196 std::vector<UTF32> Decoded(S.size(), 0);
197 UTF32 *TargetStart = Decoded.data();
198
199 auto ErrorCode = ConvertUTF8toUTF32Partial(
200 sourceStart: &SourceNext, sourceEnd: SourceStart + S.size(), targetStart: &TargetStart,
201 targetEnd: Decoded.data() + Decoded.size(), flags: lenientConversion);
202
203 Decoded.resize(new_size: TargetStart - Decoded.data());
204
205 return std::make_pair(x&: ErrorCode, y&: Decoded);
206}
207
208::testing::AssertionResult
209CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
210 StringRef S, bool Partial = false) {
211 ConversionResult ErrorCode;
212 std::vector<unsigned> Decoded;
213 if (!Partial)
214 std::tie(args&: ErrorCode, args&: Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
215 else
216 std::tie(args&: ErrorCode, args&: Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
217
218 if (Expected.ErrorCode != ErrorCode)
219 return ::testing::AssertionFailure() << "Expected error code "
220 << Expected.ErrorCode << ", actual "
221 << ErrorCode;
222
223 if (Expected.UnicodeScalars != Decoded)
224 return ::testing::AssertionFailure()
225 << "Expected lenient decoded result:\n"
226 << ::testing::PrintToString(value: Expected.UnicodeScalars) << "\n"
227 << "Actual result:\n" << ::testing::PrintToString(value: Decoded);
228
229 return ::testing::AssertionSuccess();
230}
231
232TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
233
234 //
235 // 1-byte sequences
236 //
237
238 // U+0041 LATIN CAPITAL LETTER A
239 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
240 ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
241
242 //
243 // 2-byte sequences
244 //
245
246 // U+0283 LATIN SMALL LETTER ESH
247 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
248 ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
249 "\xca\x83"));
250
251 // U+03BA GREEK SMALL LETTER KAPPA
252 // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
253 // U+03C3 GREEK SMALL LETTER SIGMA
254 // U+03BC GREEK SMALL LETTER MU
255 // U+03B5 GREEK SMALL LETTER EPSILON
256 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
257 ConvertUTFResultContainer(conversionOK)
258 .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
259 "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
260
261 //
262 // 3-byte sequences
263 //
264
265 // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
266 // U+6587 CJK UNIFIED IDEOGRAPH-6587
267 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268 ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
269 "\xe4\xbe\x8b\xe6\x96\x87"));
270
271 // U+D55C HANGUL SYLLABLE HAN
272 // U+AE00 HANGUL SYLLABLE GEUL
273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
274 ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
275 "\xed\x95\x9c\xea\xb8\x80"));
276
277 // U+1112 HANGUL CHOSEONG HIEUH
278 // U+1161 HANGUL JUNGSEONG A
279 // U+11AB HANGUL JONGSEONG NIEUN
280 // U+1100 HANGUL CHOSEONG KIYEOK
281 // U+1173 HANGUL JUNGSEONG EU
282 // U+11AF HANGUL JONGSEONG RIEUL
283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
284 ConvertUTFResultContainer(conversionOK)
285 .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
286 "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
287 "\xe1\x86\xaf"));
288
289 //
290 // 4-byte sequences
291 //
292
293 // U+E0100 VARIATION SELECTOR-17
294 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
295 ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
296 "\xf3\xa0\x84\x80"));
297
298 //
299 // First possible sequence of a certain length
300 //
301
302 // U+0000 NULL
303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
305 StringRef("\x00", 1)));
306
307 // U+0080 PADDING CHARACTER
308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309 ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
310 "\xc2\x80"));
311
312 // U+0800 SAMARITAN LETTER ALAF
313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314 ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
315 "\xe0\xa0\x80"));
316
317 // U+10000 LINEAR B SYLLABLE B008 A
318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319 ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
320 "\xf0\x90\x80\x80"));
321
322 // U+200000 (invalid)
323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324 ConvertUTFResultContainer(sourceIllegal)
325 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
326 "\xf8\x88\x80\x80\x80"));
327
328 // U+4000000 (invalid)
329 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
330 ConvertUTFResultContainer(sourceIllegal)
331 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
332 "\xfc\x84\x80\x80\x80\x80"));
333
334 //
335 // Last possible sequence of a certain length
336 //
337
338 // U+007F DELETE
339 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340 ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
341
342 // U+07FF (unassigned)
343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
344 ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
345 "\xdf\xbf"));
346
347 // U+FFFF (noncharacter)
348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
350 "\xef\xbf\xbf"));
351
352 // U+1FFFFF (invalid)
353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
354 ConvertUTFResultContainer(sourceIllegal)
355 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
356 "\xf7\xbf\xbf\xbf"));
357
358 // U+3FFFFFF (invalid)
359 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
360 ConvertUTFResultContainer(sourceIllegal)
361 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
362 "\xfb\xbf\xbf\xbf\xbf"));
363
364 // U+7FFFFFFF (invalid)
365 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
366 ConvertUTFResultContainer(sourceIllegal)
367 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
368 "\xfd\xbf\xbf\xbf\xbf\xbf"));
369
370 //
371 // Other boundary conditions
372 //
373
374 // U+D7FF (unassigned)
375 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
376 ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
377 "\xed\x9f\xbf"));
378
379 // U+E000 (private use)
380 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
381 ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
382 "\xee\x80\x80"));
383
384 // U+FFFD REPLACEMENT CHARACTER
385 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
386 ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
387 "\xef\xbf\xbd"));
388
389 // U+10FFFF (noncharacter)
390 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
391 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
392 "\xf4\x8f\xbf\xbf"));
393
394 // U+110000 (invalid)
395 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
396 ConvertUTFResultContainer(sourceIllegal)
397 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
398 "\xf4\x90\x80\x80"));
399
400 //
401 // Unexpected continuation bytes
402 //
403
404 // A sequence of unexpected continuation bytes that don't follow a first
405 // byte, every byte is a maximal subpart.
406
407 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
408 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
409 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
410 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
411 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
413 "\x80\x80"));
414 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
415 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
416 "\x80\xbf"));
417 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
418 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
419 "\xbf\x80"));
420 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
421 ConvertUTFResultContainer(sourceIllegal)
422 .withScalars(0xfffd, 0xfffd, 0xfffd),
423 "\x80\xbf\x80"));
424 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
425 ConvertUTFResultContainer(sourceIllegal)
426 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
427 "\x80\xbf\x80\xbf"));
428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
429 ConvertUTFResultContainer(sourceIllegal)
430 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
431 "\x80\xbf\x82\xbf\xaa"));
432 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
433 ConvertUTFResultContainer(sourceIllegal)
434 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
435 "\xaa\xb0\xbb\xbf\xaa\xa0"));
436 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
437 ConvertUTFResultContainer(sourceIllegal)
438 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
439 "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
440
441 // All continuation bytes (0x80--0xbf).
442 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
443 ConvertUTFResultContainer(sourceIllegal)
444 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
445 0xfffd, 0xfffd, 0xfffd, 0xfffd)
446 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
447 0xfffd, 0xfffd, 0xfffd, 0xfffd)
448 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
449 0xfffd, 0xfffd, 0xfffd, 0xfffd)
450 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
451 0xfffd, 0xfffd, 0xfffd, 0xfffd)
452 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
453 0xfffd, 0xfffd, 0xfffd, 0xfffd)
454 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
455 0xfffd, 0xfffd, 0xfffd, 0xfffd)
456 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
457 0xfffd, 0xfffd, 0xfffd, 0xfffd)
458 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
459 0xfffd, 0xfffd, 0xfffd, 0xfffd),
460 "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
461 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
462 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
463 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
464
465 //
466 // Lonely start bytes
467 //
468
469 // Start bytes of 2-byte sequences (0xc0--0xdf).
470 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
471 ConvertUTFResultContainer(sourceIllegal)
472 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
473 0xfffd, 0xfffd, 0xfffd, 0xfffd)
474 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
475 0xfffd, 0xfffd, 0xfffd, 0xfffd)
476 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
477 0xfffd, 0xfffd, 0xfffd, 0xfffd)
478 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
479 0xfffd, 0xfffd, 0xfffd, 0xfffd),
480 "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
481 "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
482
483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
484 ConvertUTFResultContainer(sourceIllegal)
485 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
486 0xfffd, 0x0020, 0xfffd, 0x0020)
487 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
488 0xfffd, 0x0020, 0xfffd, 0x0020)
489 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
490 0xfffd, 0x0020, 0xfffd, 0x0020)
491 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
492 0xfffd, 0x0020, 0xfffd, 0x0020)
493 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494 0xfffd, 0x0020, 0xfffd, 0x0020)
495 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
496 0xfffd, 0x0020, 0xfffd, 0x0020)
497 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
498 0xfffd, 0x0020, 0xfffd, 0x0020)
499 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
500 0xfffd, 0x0020, 0xfffd, 0x0020),
501 "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
502 "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
503 "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
504 "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
505
506 // Start bytes of 3-byte sequences (0xe0--0xef).
507 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
508 ConvertUTFResultContainer(sourceIllegal)
509 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
510 0xfffd, 0xfffd, 0xfffd, 0xfffd)
511 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
512 0xfffd, 0xfffd, 0xfffd, 0xfffd),
513 "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
514
515 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
516 ConvertUTFResultContainer(sourceIllegal)
517 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
518 0xfffd, 0x0020, 0xfffd, 0x0020)
519 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
520 0xfffd, 0x0020, 0xfffd, 0x0020)
521 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
522 0xfffd, 0x0020, 0xfffd, 0x0020)
523 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
524 0xfffd, 0x0020, 0xfffd, 0x0020),
525 "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
526 "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
527
528 // Start bytes of 4-byte sequences (0xf0--0xf7).
529 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
530 ConvertUTFResultContainer(sourceIllegal)
531 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
532 0xfffd, 0xfffd, 0xfffd, 0xfffd),
533 "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
534
535 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
536 ConvertUTFResultContainer(sourceIllegal)
537 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
538 0xfffd, 0x0020, 0xfffd, 0x0020)
539 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
540 0xfffd, 0x0020, 0xfffd, 0x0020),
541 "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
542
543 // Start bytes of 5-byte sequences (0xf8--0xfb).
544 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
545 ConvertUTFResultContainer(sourceIllegal)
546 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
547 "\xf8\xf9\xfa\xfb"));
548
549 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550 ConvertUTFResultContainer(sourceIllegal)
551 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
552 0xfffd, 0x0020, 0xfffd, 0x0020),
553 "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
554
555 // Start bytes of 6-byte sequences (0xfc--0xfd).
556 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
557 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
558 "\xfc\xfd"));
559
560 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
561 ConvertUTFResultContainer(sourceIllegal)
562 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
563 "\xfc\x20\xfd\x20"));
564
565 //
566 // Other bytes (0xc0--0xc1, 0xfe--0xff).
567 //
568
569 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
571 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
572 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
575 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
576 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
577
578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
579 ConvertUTFResultContainer(sourceIllegal)
580 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
581 "\xc0\xc1\xfe\xff"));
582
583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
584 ConvertUTFResultContainer(sourceIllegal)
585 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
586 "\xfe\xfe\xff\xff"));
587
588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
589 ConvertUTFResultContainer(sourceIllegal)
590 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
591 "\xfe\x80\x80\x80\x80\x80"));
592
593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594 ConvertUTFResultContainer(sourceIllegal)
595 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
596 "\xff\x80\x80\x80\x80\x80"));
597
598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
599 ConvertUTFResultContainer(sourceIllegal)
600 .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
601 0xfffd, 0x0020, 0xfffd, 0x0020),
602 "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
603
604 //
605 // Sequences with one continuation byte missing
606 //
607
608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
610 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
611 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
612 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
613 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
614 "\xe0\xa0"));
615 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
617 "\xe0\xbf"));
618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
620 "\xe1\x80"));
621 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
623 "\xec\xbf"));
624 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
625 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
626 "\xed\x80"));
627 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
628 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
629 "\xed\x9f"));
630 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
631 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
632 "\xee\x80"));
633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
634 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
635 "\xef\xbf"));
636 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
637 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
638 "\xf0\x90\x80"));
639 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
640 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
641 "\xf0\xbf\xbf"));
642 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
643 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
644 "\xf1\x80\x80"));
645 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
646 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
647 "\xf3\xbf\xbf"));
648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
649 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
650 "\xf4\x80\x80"));
651 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
653 "\xf4\x8f\xbf"));
654
655 // Overlong sequences with one trailing byte missing.
656 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
657 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
658 "\xc0"));
659 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
660 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
661 "\xc1"));
662 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
663 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
664 "\xe0\x80"));
665 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
666 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
667 "\xe0\x9f"));
668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669 ConvertUTFResultContainer(sourceIllegal)
670 .withScalars(0xfffd, 0xfffd, 0xfffd),
671 "\xf0\x80\x80"));
672 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673 ConvertUTFResultContainer(sourceIllegal)
674 .withScalars(0xfffd, 0xfffd, 0xfffd),
675 "\xf0\x8f\x80"));
676 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677 ConvertUTFResultContainer(sourceIllegal)
678 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679 "\xf8\x80\x80\x80"));
680 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681 ConvertUTFResultContainer(sourceIllegal)
682 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
683 "\xfc\x80\x80\x80\x80"));
684
685 // Sequences that represent surrogates with one trailing byte missing.
686 // High surrogates
687 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
688 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
689 "\xed\xa0"));
690 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
691 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
692 "\xed\xac"));
693 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
694 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
695 "\xed\xaf"));
696 // Low surrogates
697 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
699 "\xed\xb0"));
700 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
701 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
702 "\xed\xb4"));
703 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
705 "\xed\xbf"));
706
707 // Ill-formed 4-byte sequences.
708 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
709 // U+1100xx (invalid)
710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711 ConvertUTFResultContainer(sourceIllegal)
712 .withScalars(0xfffd, 0xfffd, 0xfffd),
713 "\xf4\x90\x80"));
714 // U+13FBxx (invalid)
715 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
716 ConvertUTFResultContainer(sourceIllegal)
717 .withScalars(0xfffd, 0xfffd, 0xfffd),
718 "\xf4\xbf\xbf"));
719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720 ConvertUTFResultContainer(sourceIllegal)
721 .withScalars(0xfffd, 0xfffd, 0xfffd),
722 "\xf5\x80\x80"));
723 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
724 ConvertUTFResultContainer(sourceIllegal)
725 .withScalars(0xfffd, 0xfffd, 0xfffd),
726 "\xf6\x80\x80"));
727 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
728 ConvertUTFResultContainer(sourceIllegal)
729 .withScalars(0xfffd, 0xfffd, 0xfffd),
730 "\xf7\x80\x80"));
731 // U+1FFBxx (invalid)
732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
733 ConvertUTFResultContainer(sourceIllegal)
734 .withScalars(0xfffd, 0xfffd, 0xfffd),
735 "\xf7\xbf\xbf"));
736
737 // Ill-formed 5-byte sequences.
738 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
739 // U+2000xx (invalid)
740 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
741 ConvertUTFResultContainer(sourceIllegal)
742 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
743 "\xf8\x88\x80\x80"));
744 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745 ConvertUTFResultContainer(sourceIllegal)
746 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
747 "\xf8\xbf\xbf\xbf"));
748 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
749 ConvertUTFResultContainer(sourceIllegal)
750 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
751 "\xf9\x80\x80\x80"));
752 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
753 ConvertUTFResultContainer(sourceIllegal)
754 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
755 "\xfa\x80\x80\x80"));
756 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
757 ConvertUTFResultContainer(sourceIllegal)
758 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
759 "\xfb\x80\x80\x80"));
760 // U+3FFFFxx (invalid)
761 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
762 ConvertUTFResultContainer(sourceIllegal)
763 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
764 "\xfb\xbf\xbf\xbf"));
765
766 // Ill-formed 6-byte sequences.
767 // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
768 // U+40000xx (invalid)
769 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
770 ConvertUTFResultContainer(sourceIllegal)
771 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
772 "\xfc\x84\x80\x80\x80"));
773 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774 ConvertUTFResultContainer(sourceIllegal)
775 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
776 "\xfc\xbf\xbf\xbf\xbf"));
777 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
778 ConvertUTFResultContainer(sourceIllegal)
779 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
780 "\xfd\x80\x80\x80\x80"));
781 // U+7FFFFFxx (invalid)
782 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
783 ConvertUTFResultContainer(sourceIllegal)
784 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
785 "\xfd\xbf\xbf\xbf\xbf"));
786
787 //
788 // Sequences with two continuation bytes missing
789 //
790
791 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
792 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
793 "\xf0\x90"));
794 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
795 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
796 "\xf0\xbf"));
797 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
798 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
799 "\xf1\x80"));
800 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
801 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
802 "\xf3\xbf"));
803 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
805 "\xf4\x80"));
806 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
807 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
808 "\xf4\x8f"));
809
810 // Overlong sequences with two trailing byte missing.
811 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
812 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
813 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
815 "\xf0\x80"));
816 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
818 "\xf0\x8f"));
819 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
820 ConvertUTFResultContainer(sourceIllegal)
821 .withScalars(0xfffd, 0xfffd, 0xfffd),
822 "\xf8\x80\x80"));
823 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
824 ConvertUTFResultContainer(sourceIllegal)
825 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
826 "\xfc\x80\x80\x80"));
827
828 // Sequences that represent surrogates with two trailing bytes missing.
829 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
830 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
831
832 // Ill-formed 4-byte sequences.
833 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
834 // U+110yxx (invalid)
835 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
836 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
837 "\xf4\x90"));
838 // U+13Fyxx (invalid)
839 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
840 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
841 "\xf4\xbf"));
842 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844 "\xf5\x80"));
845 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
847 "\xf6\x80"));
848 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
849 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
850 "\xf7\x80"));
851 // U+1FFyxx (invalid)
852 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
854 "\xf7\xbf"));
855
856 // Ill-formed 5-byte sequences.
857 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
858 // U+200yxx (invalid)
859 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
860 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
861 "\xf8\x88\x80"));
862 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
863 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
864 "\xf8\xbf\xbf"));
865 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
866 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
867 "\xf9\x80\x80"));
868 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
870 "\xfa\x80\x80"));
871 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
872 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
873 "\xfb\x80\x80"));
874 // U+3FFFyxx (invalid)
875 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
877 "\xfb\xbf\xbf"));
878
879 // Ill-formed 6-byte sequences.
880 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
881 // U+4000yxx (invalid)
882 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
883 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
884 "\xfc\x84\x80\x80"));
885 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
886 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
887 "\xfc\xbf\xbf\xbf"));
888 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
889 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
890 "\xfd\x80\x80\x80"));
891 // U+7FFFFyxx (invalid)
892 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
894 "\xfd\xbf\xbf\xbf"));
895
896 //
897 // Sequences with three continuation bytes missing
898 //
899
900 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
902 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
903 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
904 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
905 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
906 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
907 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
908 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
909 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
910
911 // Broken overlong sequences.
912 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
913 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
914 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
915 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
916 "\xf8\x80"));
917 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
918 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
919 "\xfc\x80\x80"));
920
921 // Ill-formed 4-byte sequences.
922 // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
923 // U+14yyxx (invalid)
924 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
925 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
926 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
927 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
928 // U+1Cyyxx (invalid)
929 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
931
932 // Ill-formed 5-byte sequences.
933 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
934 // U+20yyxx (invalid)
935 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937 "\xf8\x88"));
938 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
939 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
940 "\xf8\xbf"));
941 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
942 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
943 "\xf9\x80"));
944 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
945 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
946 "\xfa\x80"));
947 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
948 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
949 "\xfb\x80"));
950 // U+3FCyyxx (invalid)
951 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
952 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
953 "\xfb\xbf"));
954
955 // Ill-formed 6-byte sequences.
956 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
957 // U+400yyxx (invalid)
958 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
959 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
960 "\xfc\x84\x80"));
961 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
962 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
963 "\xfc\xbf\xbf"));
964 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
966 "\xfd\x80\x80"));
967 // U+7FFCyyxx (invalid)
968 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
969 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
970 "\xfd\xbf\xbf"));
971
972 //
973 // Sequences with four continuation bytes missing
974 //
975
976 // Ill-formed 5-byte sequences.
977 // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
978 // U+uzyyxx (invalid)
979 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
980 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
981 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
982 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
983 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
984 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
985 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
986 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
987 // U+3zyyxx (invalid)
988 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
989 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
990
991 // Broken overlong sequences.
992 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
993 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
994 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
995 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
996 "\xfc\x80"));
997
998 // Ill-formed 6-byte sequences.
999 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1000 // U+uzzyyxx (invalid)
1001 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1003 "\xfc\x84"));
1004 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1005 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1006 "\xfc\xbf"));
1007 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1008 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1009 "\xfd\x80"));
1010 // U+7Fzzyyxx (invalid)
1011 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1012 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1013 "\xfd\xbf"));
1014
1015 //
1016 // Sequences with five continuation bytes missing
1017 //
1018
1019 // Ill-formed 6-byte sequences.
1020 // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
1021 // U+uzzyyxx (invalid)
1022 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1023 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
1024 // U+uuzzyyxx (invalid)
1025 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
1027
1028 //
1029 // Consecutive sequences with trailing bytes missing
1030 //
1031
1032 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1033 ConvertUTFResultContainer(sourceIllegal)
1034 .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1035 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1036 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1037 .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1038 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1039 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1040 "\xc0" "\xe0\x80" "\xf0\x80\x80"
1041 "\xf8\x80\x80\x80"
1042 "\xfc\x80\x80\x80\x80"
1043 "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1044 "\xfb\xbf\xbf\xbf"
1045 "\xfd\xbf\xbf\xbf\xbf"));
1046
1047 //
1048 // Overlong UTF-8 sequences
1049 //
1050
1051 // U+002F SOLIDUS
1052 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053 ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
1054
1055 // Overlong sequences of the above.
1056 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1058 "\xc0\xaf"));
1059 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1060 ConvertUTFResultContainer(sourceIllegal)
1061 .withScalars(0xfffd, 0xfffd, 0xfffd),
1062 "\xe0\x80\xaf"));
1063 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1064 ConvertUTFResultContainer(sourceIllegal)
1065 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1066 "\xf0\x80\x80\xaf"));
1067 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1068 ConvertUTFResultContainer(sourceIllegal)
1069 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1070 "\xf8\x80\x80\x80\xaf"));
1071 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1072 ConvertUTFResultContainer(sourceIllegal)
1073 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1074 "\xfc\x80\x80\x80\x80\xaf"));
1075
1076 // U+0000 NULL
1077 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1078 ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1079 StringRef("\x00", 1)));
1080
1081 // Overlong sequences of the above.
1082 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1083 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1084 "\xc0\x80"));
1085 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1086 ConvertUTFResultContainer(sourceIllegal)
1087 .withScalars(0xfffd, 0xfffd, 0xfffd),
1088 "\xe0\x80\x80"));
1089 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1090 ConvertUTFResultContainer(sourceIllegal)
1091 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1092 "\xf0\x80\x80\x80"));
1093 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1094 ConvertUTFResultContainer(sourceIllegal)
1095 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1096 "\xf8\x80\x80\x80\x80"));
1097 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1098 ConvertUTFResultContainer(sourceIllegal)
1099 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1100 "\xfc\x80\x80\x80\x80\x80"));
1101
1102 // Other overlong sequences.
1103 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1104 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1105 "\xc0\xbf"));
1106 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1108 "\xc1\x80"));
1109 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1110 ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1111 "\xc1\xbf"));
1112 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113 ConvertUTFResultContainer(sourceIllegal)
1114 .withScalars(0xfffd, 0xfffd, 0xfffd),
1115 "\xe0\x9f\xbf"));
1116 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1117 ConvertUTFResultContainer(sourceIllegal)
1118 .withScalars(0xfffd, 0xfffd, 0xfffd),
1119 "\xed\xa0\x80"));
1120 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1121 ConvertUTFResultContainer(sourceIllegal)
1122 .withScalars(0xfffd, 0xfffd, 0xfffd),
1123 "\xed\xbf\xbf"));
1124 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1125 ConvertUTFResultContainer(sourceIllegal)
1126 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1127 "\xf0\x8f\x80\x80"));
1128 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1129 ConvertUTFResultContainer(sourceIllegal)
1130 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1131 "\xf0\x8f\xbf\xbf"));
1132 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133 ConvertUTFResultContainer(sourceIllegal)
1134 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135 "\xf8\x87\xbf\xbf\xbf"));
1136 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1137 ConvertUTFResultContainer(sourceIllegal)
1138 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1139 "\xfc\x83\xbf\xbf\xbf\xbf"));
1140
1141 //
1142 // Isolated surrogates
1143 //
1144
1145 // Unicode 6.3.0:
1146 //
1147 // D71. High-surrogate code point: A Unicode code point in the range
1148 // U+D800 to U+DBFF.
1149 //
1150 // D73. Low-surrogate code point: A Unicode code point in the range
1151 // U+DC00 to U+DFFF.
1152
1153 // Note: U+E0100 is <DB40 DD00> in UTF16.
1154
1155 // High surrogates
1156
1157 // U+D800
1158 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1159 ConvertUTFResultContainer(sourceIllegal)
1160 .withScalars(0xfffd, 0xfffd, 0xfffd),
1161 "\xed\xa0\x80"));
1162
1163 // U+DB40
1164 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1165 ConvertUTFResultContainer(sourceIllegal)
1166 .withScalars(0xfffd, 0xfffd, 0xfffd),
1167 "\xed\xac\xa0"));
1168
1169 // U+DBFF
1170 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1171 ConvertUTFResultContainer(sourceIllegal)
1172 .withScalars(0xfffd, 0xfffd, 0xfffd),
1173 "\xed\xaf\xbf"));
1174
1175 // Low surrogates
1176
1177 // U+DC00
1178 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1179 ConvertUTFResultContainer(sourceIllegal)
1180 .withScalars(0xfffd, 0xfffd, 0xfffd),
1181 "\xed\xb0\x80"));
1182
1183 // U+DD00
1184 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1185 ConvertUTFResultContainer(sourceIllegal)
1186 .withScalars(0xfffd, 0xfffd, 0xfffd),
1187 "\xed\xb4\x80"));
1188
1189 // U+DFFF
1190 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1191 ConvertUTFResultContainer(sourceIllegal)
1192 .withScalars(0xfffd, 0xfffd, 0xfffd),
1193 "\xed\xbf\xbf"));
1194
1195 // Surrogate pairs
1196
1197 // U+D800 U+DC00
1198 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1199 ConvertUTFResultContainer(sourceIllegal)
1200 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1201 "\xed\xa0\x80\xed\xb0\x80"));
1202
1203 // U+D800 U+DD00
1204 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1205 ConvertUTFResultContainer(sourceIllegal)
1206 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1207 "\xed\xa0\x80\xed\xb4\x80"));
1208
1209 // U+D800 U+DFFF
1210 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211 ConvertUTFResultContainer(sourceIllegal)
1212 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1213 "\xed\xa0\x80\xed\xbf\xbf"));
1214
1215 // U+DB40 U+DC00
1216 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217 ConvertUTFResultContainer(sourceIllegal)
1218 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1219 "\xed\xac\xa0\xed\xb0\x80"));
1220
1221 // U+DB40 U+DD00
1222 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1223 ConvertUTFResultContainer(sourceIllegal)
1224 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1225 "\xed\xac\xa0\xed\xb4\x80"));
1226
1227 // U+DB40 U+DFFF
1228 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1229 ConvertUTFResultContainer(sourceIllegal)
1230 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1231 "\xed\xac\xa0\xed\xbf\xbf"));
1232
1233 // U+DBFF U+DC00
1234 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1235 ConvertUTFResultContainer(sourceIllegal)
1236 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1237 "\xed\xaf\xbf\xed\xb0\x80"));
1238
1239 // U+DBFF U+DD00
1240 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1241 ConvertUTFResultContainer(sourceIllegal)
1242 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1243 "\xed\xaf\xbf\xed\xb4\x80"));
1244
1245 // U+DBFF U+DFFF
1246 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247 ConvertUTFResultContainer(sourceIllegal)
1248 .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1249 "\xed\xaf\xbf\xed\xbf\xbf"));
1250
1251 //
1252 // Noncharacters
1253 //
1254
1255 // Unicode 6.3.0:
1256 //
1257 // D14. Noncharacter: A code point that is permanently reserved for
1258 // internal use and that should never be interchanged. Noncharacters
  // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1260 // and the values U+FDD0..U+FDEF.
1261
1262 // U+FFFE
1263 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1264 ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1265 "\xef\xbf\xbe"));
1266
1267 // U+FFFF
1268 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1269 ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1270 "\xef\xbf\xbf"));
1271
1272 // U+1FFFE
1273 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1274 ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1275 "\xf0\x9f\xbf\xbe"));
1276
1277 // U+1FFFF
1278 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1279 ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1280 "\xf0\x9f\xbf\xbf"));
1281
1282 // U+2FFFE
1283 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1284 ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1285 "\xf0\xaf\xbf\xbe"));
1286
1287 // U+2FFFF
1288 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1289 ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1290 "\xf0\xaf\xbf\xbf"));
1291
1292 // U+3FFFE
1293 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1294 ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1295 "\xf0\xbf\xbf\xbe"));
1296
1297 // U+3FFFF
1298 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1299 ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1300 "\xf0\xbf\xbf\xbf"));
1301
1302 // U+4FFFE
1303 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1304 ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1305 "\xf1\x8f\xbf\xbe"));
1306
1307 // U+4FFFF
1308 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1309 ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1310 "\xf1\x8f\xbf\xbf"));
1311
1312 // U+5FFFE
1313 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1314 ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1315 "\xf1\x9f\xbf\xbe"));
1316
1317 // U+5FFFF
1318 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1319 ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1320 "\xf1\x9f\xbf\xbf"));
1321
1322 // U+6FFFE
1323 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1324 ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1325 "\xf1\xaf\xbf\xbe"));
1326
1327 // U+6FFFF
1328 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1329 ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1330 "\xf1\xaf\xbf\xbf"));
1331
1332 // U+7FFFE
1333 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1334 ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1335 "\xf1\xbf\xbf\xbe"));
1336
1337 // U+7FFFF
1338 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1339 ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1340 "\xf1\xbf\xbf\xbf"));
1341
1342 // U+8FFFE
1343 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1344 ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1345 "\xf2\x8f\xbf\xbe"));
1346
1347 // U+8FFFF
1348 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1349 ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1350 "\xf2\x8f\xbf\xbf"));
1351
1352 // U+9FFFE
1353 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1354 ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1355 "\xf2\x9f\xbf\xbe"));
1356
1357 // U+9FFFF
1358 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1359 ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1360 "\xf2\x9f\xbf\xbf"));
1361
1362 // U+AFFFE
1363 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1364 ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1365 "\xf2\xaf\xbf\xbe"));
1366
1367 // U+AFFFF
1368 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1369 ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1370 "\xf2\xaf\xbf\xbf"));
1371
1372 // U+BFFFE
1373 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1374 ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1375 "\xf2\xbf\xbf\xbe"));
1376
1377 // U+BFFFF
1378 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1379 ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1380 "\xf2\xbf\xbf\xbf"));
1381
1382 // U+CFFFE
1383 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1384 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1385 "\xf3\x8f\xbf\xbe"));
1386
1387 // U+CFFFF
1388 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1389 ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1390 "\xf3\x8f\xbf\xbf"));
1391
1392 // U+DFFFE
1393 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1394 ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1395 "\xf3\x9f\xbf\xbe"));
1396
1397 // U+DFFFF
1398 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1399 ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1400 "\xf3\x9f\xbf\xbf"));
1401
1402 // U+EFFFE
1403 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1404 ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1405 "\xf3\xaf\xbf\xbe"));
1406
1407 // U+EFFFF
1408 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1409 ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1410 "\xf3\xaf\xbf\xbf"));
1411
1412 // U+FFFFE
1413 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1414 ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1415 "\xf3\xbf\xbf\xbe"));
1416
1417 // U+FFFFF
1418 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1419 ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1420 "\xf3\xbf\xbf\xbf"));
1421
1422 // U+10FFFE
1423 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1424 ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1425 "\xf4\x8f\xbf\xbe"));
1426
1427 // U+10FFFF
1428 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1429 ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1430 "\xf4\x8f\xbf\xbf"));
1431
1432 // U+FDD0
1433 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1434 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1435 "\xef\xb7\x90"));
1436
1437 // U+FDD1
1438 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1439 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1440 "\xef\xb7\x91"));
1441
1442 // U+FDD2
1443 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1444 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1445 "\xef\xb7\x92"));
1446
1447 // U+FDD3
1448 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1449 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1450 "\xef\xb7\x93"));
1451
1452 // U+FDD4
1453 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1454 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1455 "\xef\xb7\x94"));
1456
1457 // U+FDD5
1458 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1459 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1460 "\xef\xb7\x95"));
1461
1462 // U+FDD6
1463 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1464 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1465 "\xef\xb7\x96"));
1466
1467 // U+FDD7
1468 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1469 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1470 "\xef\xb7\x97"));
1471
1472 // U+FDD8
1473 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1474 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1475 "\xef\xb7\x98"));
1476
1477 // U+FDD9
1478 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1479 ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1480 "\xef\xb7\x99"));
1481
1482 // U+FDDA
1483 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1484 ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1485 "\xef\xb7\x9a"));
1486
1487 // U+FDDB
1488 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1489 ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1490 "\xef\xb7\x9b"));
1491
1492 // U+FDDC
1493 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1494 ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1495 "\xef\xb7\x9c"));
1496
1497 // U+FDDD
1498 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1499 ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1500 "\xef\xb7\x9d"));
1501
1502 // U+FDDE
1503 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1504 ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1505 "\xef\xb7\x9e"));
1506
1507 // U+FDDF
1508 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1509 ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1510 "\xef\xb7\x9f"));
1511
1512 // U+FDE0
1513 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1514 ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1515 "\xef\xb7\xa0"));
1516
1517 // U+FDE1
1518 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1519 ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1520 "\xef\xb7\xa1"));
1521
1522 // U+FDE2
1523 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1524 ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1525 "\xef\xb7\xa2"));
1526
1527 // U+FDE3
1528 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1529 ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1530 "\xef\xb7\xa3"));
1531
1532 // U+FDE4
1533 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1534 ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1535 "\xef\xb7\xa4"));
1536
1537 // U+FDE5
1538 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1539 ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1540 "\xef\xb7\xa5"));
1541
1542 // U+FDE6
1543 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1544 ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1545 "\xef\xb7\xa6"));
1546
1547 // U+FDE7
1548 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1549 ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1550 "\xef\xb7\xa7"));
1551
1552 // U+FDE8
1553 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1554 ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1555 "\xef\xb7\xa8"));
1556
1557 // U+FDE9
1558 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1559 ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1560 "\xef\xb7\xa9"));
1561
1562 // U+FDEA
1563 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1564 ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1565 "\xef\xb7\xaa"));
1566
1567 // U+FDEB
1568 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1569 ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1570 "\xef\xb7\xab"));
1571
1572 // U+FDEC
1573 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1574 ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1575 "\xef\xb7\xac"));
1576
1577 // U+FDED
1578 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1579 ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1580 "\xef\xb7\xad"));
1581
1582 // U+FDEE
1583 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1584 ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1585 "\xef\xb7\xae"));
1586
1587 // U+FDEF
1588 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1589 ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1590 "\xef\xb7\xaf"));
1591
1592 // U+FDF0
1593 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1594 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1595 "\xef\xb7\xb0"));
1596
1597 // U+FDF1
1598 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1599 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1600 "\xef\xb7\xb1"));
1601
1602 // U+FDF2
1603 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1605 "\xef\xb7\xb2"));
1606
1607 // U+FDF3
1608 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1609 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1610 "\xef\xb7\xb3"));
1611
1612 // U+FDF4
1613 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1614 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1615 "\xef\xb7\xb4"));
1616
1617 // U+FDF5
1618 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1619 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1620 "\xef\xb7\xb5"));
1621
1622 // U+FDF6
1623 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1625 "\xef\xb7\xb6"));
1626
1627 // U+FDF7
1628 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1629 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1630 "\xef\xb7\xb7"));
1631
1632 // U+FDF8
1633 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1634 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1635 "\xef\xb7\xb8"));
1636
1637 // U+FDF9
1638 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639 ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1640 "\xef\xb7\xb9"));
1641
1642 // U+FDFA
1643 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1644 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1645 "\xef\xb7\xba"));
1646
1647 // U+FDFB
1648 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1649 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1650 "\xef\xb7\xbb"));
1651
1652 // U+FDFC
1653 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1655 "\xef\xb7\xbc"));
1656
1657 // U+FDFD
1658 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1659 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1660 "\xef\xb7\xbd"));
1661
1662 // U+FDFE
1663 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1664 ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1665 "\xef\xb7\xbe"));
1666
1667 // U+FDFF
1668 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1669 ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1670 "\xef\xb7\xbf"));
1671}
1672
1673TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1674 // U+0041 LATIN CAPITAL LETTER A
1675 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1676 ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1677 "\x41", true));
1678
1679 //
1680 // Sequences with one continuation byte missing
1681 //
1682
1683 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1684 ConvertUTFResultContainer(sourceExhausted),
1685 "\xc2", true));
1686 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1687 ConvertUTFResultContainer(sourceExhausted),
1688 "\xdf", true));
1689 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1690 ConvertUTFResultContainer(sourceExhausted),
1691 "\xe0\xa0", true));
1692 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1693 ConvertUTFResultContainer(sourceExhausted),
1694 "\xe0\xbf", true));
1695 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1696 ConvertUTFResultContainer(sourceExhausted),
1697 "\xe1\x80", true));
1698 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1699 ConvertUTFResultContainer(sourceExhausted),
1700 "\xec\xbf", true));
1701 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1702 ConvertUTFResultContainer(sourceExhausted),
1703 "\xed\x80", true));
1704 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1705 ConvertUTFResultContainer(sourceExhausted),
1706 "\xed\x9f", true));
1707 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1708 ConvertUTFResultContainer(sourceExhausted),
1709 "\xee\x80", true));
1710 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1711 ConvertUTFResultContainer(sourceExhausted),
1712 "\xef\xbf", true));
1713 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1714 ConvertUTFResultContainer(sourceExhausted),
1715 "\xf0\x90\x80", true));
1716 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1717 ConvertUTFResultContainer(sourceExhausted),
1718 "\xf0\xbf\xbf", true));
1719 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1720 ConvertUTFResultContainer(sourceExhausted),
1721 "\xf1\x80\x80", true));
1722 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1723 ConvertUTFResultContainer(sourceExhausted),
1724 "\xf3\xbf\xbf", true));
1725 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1726 ConvertUTFResultContainer(sourceExhausted),
1727 "\xf4\x80\x80", true));
1728 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1729 ConvertUTFResultContainer(sourceExhausted),
1730 "\xf4\x8f\xbf", true));
1731
1732 EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1733 ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1734 "\x41\xc2", true));
1735}
1736
1737

// Source code of llvm/unittests/Support/ConvertUTFTest.cpp