1 | //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
#include "llvm/Support/ConvertUTF.h"
#include "llvm/ADT/ArrayRef.h"
#include "gtest/gtest.h"
#include <string>
#include <tuple>
#include <utility>
#include <vector>
14 | |
15 | using namespace llvm; |
16 | |
17 | TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) { |
18 | // Src is the look of disapproval. |
19 | alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c" ; |
20 | ArrayRef<char> Ref(Src, sizeof(Src) - 1); |
21 | std::string Result; |
22 | bool Success = convertUTF16ToUTF8String(SrcBytes: Ref, Out&: Result); |
23 | EXPECT_TRUE(Success); |
24 | std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0" ); |
25 | EXPECT_EQ(Expected, Result); |
26 | } |
27 | |
28 | TEST(ConvertUTFTest, ConvertUTF32LittleEndianToUTF8String) { |
29 | // Src is the look of disapproval. |
30 | alignas(UTF32) static const char Src[] = |
31 | "\xFF\xFE\x00\x00\xA0\x0C\x00\x00\x5F\x00\x00\x00\xA0\x0C\x00\x00" ; |
32 | ArrayRef<char> Ref(Src, sizeof(Src) - 1); |
33 | std::string Result; |
34 | bool Success = convertUTF32ToUTF8String(SrcBytes: Ref, Out&: Result); |
35 | EXPECT_TRUE(Success); |
36 | std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0" ); |
37 | EXPECT_EQ(Expected, Result); |
38 | } |
39 | |
40 | TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { |
41 | // Src is the look of disapproval. |
42 | alignas(UTF16) static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0" ; |
43 | ArrayRef<char> Ref(Src, sizeof(Src) - 1); |
44 | std::string Result; |
45 | bool Success = convertUTF16ToUTF8String(SrcBytes: Ref, Out&: Result); |
46 | EXPECT_TRUE(Success); |
47 | std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0" ); |
48 | EXPECT_EQ(Expected, Result); |
49 | } |
50 | |
51 | TEST(ConvertUTFTest, ConvertUTF32BigEndianToUTF8String) { |
52 | // Src is the look of disapproval. |
53 | alignas(UTF32) static const char Src[] = |
54 | "\x00\x00\xFE\xFF\x00\x00\x0C\xA0\x00\x00\x00\x5F\x00\x00\x0C\xA0" ; |
55 | ArrayRef<char> Ref(Src, sizeof(Src) - 1); |
56 | std::string Result; |
57 | bool Success = convertUTF32ToUTF8String(SrcBytes: Ref, Out&: Result); |
58 | EXPECT_TRUE(Success); |
59 | std::string Expected("\xE0\xB2\xA0_\xE0\xB2\xA0" ); |
60 | EXPECT_EQ(Expected, Result); |
61 | } |
62 | |
63 | TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) { |
64 | // Src is the look of disapproval. |
65 | static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0" ; |
66 | StringRef Ref(Src, sizeof(Src) - 1); |
67 | SmallVector<UTF16, 5> Result; |
68 | bool Success = convertUTF8ToUTF16String(SrcUTF8: Ref, DstUTF16&: Result); |
69 | EXPECT_TRUE(Success); |
70 | static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0}; |
71 | ASSERT_EQ(3u, Result.size()); |
72 | for (int I = 0, E = 3; I != E; ++I) |
73 | EXPECT_EQ(Expected[I], Result[I]); |
74 | } |
75 | |
76 | TEST(ConvertUTFTest, OddLengthInput) { |
77 | std::string Result; |
78 | bool Success = convertUTF16ToUTF8String(SrcBytes: ArrayRef("xxxxx" , 5), Out&: Result); |
79 | EXPECT_FALSE(Success); |
80 | } |
81 | |
82 | TEST(ConvertUTFTest, Empty) { |
83 | std::string Result; |
84 | bool Success = |
85 | convertUTF16ToUTF8String(SrcBytes: llvm::ArrayRef<char>(std::nullopt), Out&: Result); |
86 | EXPECT_TRUE(Success); |
87 | EXPECT_TRUE(Result.empty()); |
88 | } |
89 | |
90 | TEST(ConvertUTFTest, HasUTF16BOM) { |
91 | bool HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xff\xfe" , 2)); |
92 | EXPECT_TRUE(HasBOM); |
93 | HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe\xff" , 2)); |
94 | EXPECT_TRUE(HasBOM); |
95 | HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe\xff " , 3)); |
96 | EXPECT_TRUE(HasBOM); // Don't care about odd lengths. |
97 | HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe\xff\x00asdf" , 6)); |
98 | EXPECT_TRUE(HasBOM); |
99 | |
100 | HasBOM = hasUTF16ByteOrderMark(SrcBytes: std::nullopt); |
101 | EXPECT_FALSE(HasBOM); |
102 | HasBOM = hasUTF16ByteOrderMark(SrcBytes: ArrayRef("\xfe" , 1)); |
103 | EXPECT_FALSE(HasBOM); |
104 | } |
105 | |
106 | TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) { |
107 | // Src is the look of disapproval. |
108 | alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c" ; |
109 | ArrayRef<UTF16> SrcRef = ArrayRef((const UTF16 *)Src, 4); |
110 | std::string Result; |
111 | bool Success = convertUTF16ToUTF8String(Src: SrcRef, Out&: Result); |
112 | EXPECT_TRUE(Success); |
113 | std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0" ); |
114 | EXPECT_EQ(Expected, Result); |
115 | } |
116 | |
117 | TEST(ConvertUTFTest, ConvertUTF8toWide) { |
118 | // Src is the look of disapproval. |
119 | static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0" ; |
120 | std::wstring Result; |
121 | bool Success = ConvertUTF8toWide(Source: (const char*)Src, Result); |
122 | EXPECT_TRUE(Success); |
123 | std::wstring Expected(L"\x0ca0_\x0ca0" ); |
124 | EXPECT_EQ(Expected, Result); |
125 | Result.clear(); |
126 | Success = ConvertUTF8toWide(Source: StringRef(Src, 7), Result); |
127 | EXPECT_TRUE(Success); |
128 | EXPECT_EQ(Expected, Result); |
129 | } |
130 | |
131 | TEST(ConvertUTFTest, convertWideToUTF8) { |
132 | // Src is the look of disapproval. |
133 | static const wchar_t Src[] = L"\x0ca0_\x0ca0" ; |
134 | std::string Result; |
135 | bool Success = convertWideToUTF8(Source: Src, Result); |
136 | EXPECT_TRUE(Success); |
137 | std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0" ); |
138 | EXPECT_EQ(Expected, Result); |
139 | } |
140 | |
141 | struct ConvertUTFResultContainer { |
142 | ConversionResult ErrorCode; |
143 | std::vector<unsigned> UnicodeScalars; |
144 | |
145 | ConvertUTFResultContainer(ConversionResult ErrorCode) |
146 | : ErrorCode(ErrorCode) {} |
147 | |
148 | ConvertUTFResultContainer |
149 | withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000, |
150 | unsigned US2 = 0x110000, unsigned US3 = 0x110000, |
151 | unsigned US4 = 0x110000, unsigned US5 = 0x110000, |
152 | unsigned US6 = 0x110000, unsigned US7 = 0x110000) { |
153 | ConvertUTFResultContainer Result(*this); |
154 | if (US0 != 0x110000) |
155 | Result.UnicodeScalars.push_back(x: US0); |
156 | if (US1 != 0x110000) |
157 | Result.UnicodeScalars.push_back(x: US1); |
158 | if (US2 != 0x110000) |
159 | Result.UnicodeScalars.push_back(x: US2); |
160 | if (US3 != 0x110000) |
161 | Result.UnicodeScalars.push_back(x: US3); |
162 | if (US4 != 0x110000) |
163 | Result.UnicodeScalars.push_back(x: US4); |
164 | if (US5 != 0x110000) |
165 | Result.UnicodeScalars.push_back(x: US5); |
166 | if (US6 != 0x110000) |
167 | Result.UnicodeScalars.push_back(x: US6); |
168 | if (US7 != 0x110000) |
169 | Result.UnicodeScalars.push_back(x: US7); |
170 | return Result; |
171 | } |
172 | }; |
173 | |
174 | std::pair<ConversionResult, std::vector<unsigned>> |
175 | ConvertUTF8ToUnicodeScalarsLenient(StringRef S) { |
176 | const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); |
177 | |
178 | const UTF8 *SourceNext = SourceStart; |
179 | std::vector<UTF32> Decoded(S.size(), 0); |
180 | UTF32 *TargetStart = Decoded.data(); |
181 | |
182 | auto ErrorCode = |
183 | ConvertUTF8toUTF32(sourceStart: &SourceNext, sourceEnd: SourceStart + S.size(), targetStart: &TargetStart, |
184 | targetEnd: Decoded.data() + Decoded.size(), flags: lenientConversion); |
185 | |
186 | Decoded.resize(new_size: TargetStart - Decoded.data()); |
187 | |
188 | return std::make_pair(x&: ErrorCode, y&: Decoded); |
189 | } |
190 | |
191 | std::pair<ConversionResult, std::vector<unsigned>> |
192 | ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) { |
193 | const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data()); |
194 | |
195 | const UTF8 *SourceNext = SourceStart; |
196 | std::vector<UTF32> Decoded(S.size(), 0); |
197 | UTF32 *TargetStart = Decoded.data(); |
198 | |
199 | auto ErrorCode = ConvertUTF8toUTF32Partial( |
200 | sourceStart: &SourceNext, sourceEnd: SourceStart + S.size(), targetStart: &TargetStart, |
201 | targetEnd: Decoded.data() + Decoded.size(), flags: lenientConversion); |
202 | |
203 | Decoded.resize(new_size: TargetStart - Decoded.data()); |
204 | |
205 | return std::make_pair(x&: ErrorCode, y&: Decoded); |
206 | } |
207 | |
208 | ::testing::AssertionResult |
209 | CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected, |
210 | StringRef S, bool Partial = false) { |
211 | ConversionResult ErrorCode; |
212 | std::vector<unsigned> Decoded; |
213 | if (!Partial) |
214 | std::tie(args&: ErrorCode, args&: Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S); |
215 | else |
216 | std::tie(args&: ErrorCode, args&: Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S); |
217 | |
218 | if (Expected.ErrorCode != ErrorCode) |
219 | return ::testing::AssertionFailure() << "Expected error code " |
220 | << Expected.ErrorCode << ", actual " |
221 | << ErrorCode; |
222 | |
223 | if (Expected.UnicodeScalars != Decoded) |
224 | return ::testing::AssertionFailure() |
225 | << "Expected lenient decoded result:\n" |
226 | << ::testing::PrintToString(value: Expected.UnicodeScalars) << "\n" |
227 | << "Actual result:\n" << ::testing::PrintToString(value: Decoded); |
228 | |
229 | return ::testing::AssertionSuccess(); |
230 | } |
231 | |
232 | TEST(ConvertUTFTest, UTF8ToUTF32Lenient) { |
233 | |
234 | // |
235 | // 1-byte sequences |
236 | // |
237 | |
238 | // U+0041 LATIN CAPITAL LETTER A |
239 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
240 | ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41" )); |
241 | |
242 | // |
243 | // 2-byte sequences |
244 | // |
245 | |
246 | // U+0283 LATIN SMALL LETTER ESH |
247 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
248 | ConvertUTFResultContainer(conversionOK).withScalars(0x0283), |
249 | "\xca\x83" )); |
250 | |
251 | // U+03BA GREEK SMALL LETTER KAPPA |
252 | // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA |
253 | // U+03C3 GREEK SMALL LETTER SIGMA |
254 | // U+03BC GREEK SMALL LETTER MU |
255 | // U+03B5 GREEK SMALL LETTER EPSILON |
256 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
257 | ConvertUTFResultContainer(conversionOK) |
258 | .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5), |
259 | "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" )); |
260 | |
261 | // |
262 | // 3-byte sequences |
263 | // |
264 | |
265 | // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B |
266 | // U+6587 CJK UNIFIED IDEOGRAPH-6587 |
267 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
268 | ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587), |
269 | "\xe4\xbe\x8b\xe6\x96\x87" )); |
270 | |
271 | // U+D55C HANGUL SYLLABLE HAN |
272 | // U+AE00 HANGUL SYLLABLE GEUL |
273 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
274 | ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00), |
275 | "\xed\x95\x9c\xea\xb8\x80" )); |
276 | |
277 | // U+1112 HANGUL CHOSEONG HIEUH |
278 | // U+1161 HANGUL JUNGSEONG A |
279 | // U+11AB HANGUL JONGSEONG NIEUN |
280 | // U+1100 HANGUL CHOSEONG KIYEOK |
281 | // U+1173 HANGUL JUNGSEONG EU |
282 | // U+11AF HANGUL JONGSEONG RIEUL |
283 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
284 | ConvertUTFResultContainer(conversionOK) |
285 | .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af), |
286 | "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3" |
287 | "\xe1\x86\xaf" )); |
288 | |
289 | // |
290 | // 4-byte sequences |
291 | // |
292 | |
293 | // U+E0100 VARIATION SELECTOR-17 |
294 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
295 | ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100), |
296 | "\xf3\xa0\x84\x80" )); |
297 | |
298 | // |
299 | // First possible sequence of a certain length |
300 | // |
301 | |
302 | // U+0000 NULL |
303 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
304 | ConvertUTFResultContainer(conversionOK).withScalars(0x0000), |
305 | StringRef("\x00" , 1))); |
306 | |
307 | // U+0080 PADDING CHARACTER |
308 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
309 | ConvertUTFResultContainer(conversionOK).withScalars(0x0080), |
310 | "\xc2\x80" )); |
311 | |
312 | // U+0800 SAMARITAN LETTER ALAF |
313 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
314 | ConvertUTFResultContainer(conversionOK).withScalars(0x0800), |
315 | "\xe0\xa0\x80" )); |
316 | |
317 | // U+10000 LINEAR B SYLLABLE B008 A |
318 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
319 | ConvertUTFResultContainer(conversionOK).withScalars(0x10000), |
320 | "\xf0\x90\x80\x80" )); |
321 | |
322 | // U+200000 (invalid) |
323 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
324 | ConvertUTFResultContainer(sourceIllegal) |
325 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
326 | "\xf8\x88\x80\x80\x80" )); |
327 | |
328 | // U+4000000 (invalid) |
329 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
330 | ConvertUTFResultContainer(sourceIllegal) |
331 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
332 | "\xfc\x84\x80\x80\x80\x80" )); |
333 | |
334 | // |
335 | // Last possible sequence of a certain length |
336 | // |
337 | |
338 | // U+007F DELETE |
339 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
340 | ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f" )); |
341 | |
342 | // U+07FF (unassigned) |
343 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
344 | ConvertUTFResultContainer(conversionOK).withScalars(0x07ff), |
345 | "\xdf\xbf" )); |
346 | |
347 | // U+FFFF (noncharacter) |
348 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
349 | ConvertUTFResultContainer(conversionOK).withScalars(0xffff), |
350 | "\xef\xbf\xbf" )); |
351 | |
352 | // U+1FFFFF (invalid) |
353 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
354 | ConvertUTFResultContainer(sourceIllegal) |
355 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
356 | "\xf7\xbf\xbf\xbf" )); |
357 | |
358 | // U+3FFFFFF (invalid) |
359 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
360 | ConvertUTFResultContainer(sourceIllegal) |
361 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
362 | "\xfb\xbf\xbf\xbf\xbf" )); |
363 | |
364 | // U+7FFFFFFF (invalid) |
365 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
366 | ConvertUTFResultContainer(sourceIllegal) |
367 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
368 | "\xfd\xbf\xbf\xbf\xbf\xbf" )); |
369 | |
370 | // |
371 | // Other boundary conditions |
372 | // |
373 | |
374 | // U+D7FF (unassigned) |
375 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
376 | ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff), |
377 | "\xed\x9f\xbf" )); |
378 | |
379 | // U+E000 (private use) |
380 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
381 | ConvertUTFResultContainer(conversionOK).withScalars(0xe000), |
382 | "\xee\x80\x80" )); |
383 | |
384 | // U+FFFD REPLACEMENT CHARACTER |
385 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
386 | ConvertUTFResultContainer(conversionOK).withScalars(0xfffd), |
387 | "\xef\xbf\xbd" )); |
388 | |
389 | // U+10FFFF (noncharacter) |
390 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
391 | ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), |
392 | "\xf4\x8f\xbf\xbf" )); |
393 | |
394 | // U+110000 (invalid) |
395 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
396 | ConvertUTFResultContainer(sourceIllegal) |
397 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
398 | "\xf4\x90\x80\x80" )); |
399 | |
400 | // |
401 | // Unexpected continuation bytes |
402 | // |
403 | |
404 | // A sequence of unexpected continuation bytes that don't follow a first |
405 | // byte, every byte is a maximal subpart. |
406 | |
407 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
408 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80" )); |
409 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
410 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf" )); |
411 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
412 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
413 | "\x80\x80" )); |
414 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
415 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
416 | "\x80\xbf" )); |
417 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
418 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
419 | "\xbf\x80" )); |
420 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
421 | ConvertUTFResultContainer(sourceIllegal) |
422 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
423 | "\x80\xbf\x80" )); |
424 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
425 | ConvertUTFResultContainer(sourceIllegal) |
426 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
427 | "\x80\xbf\x80\xbf" )); |
428 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
429 | ConvertUTFResultContainer(sourceIllegal) |
430 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
431 | "\x80\xbf\x82\xbf\xaa" )); |
432 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
433 | ConvertUTFResultContainer(sourceIllegal) |
434 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
435 | "\xaa\xb0\xbb\xbf\xaa\xa0" )); |
436 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
437 | ConvertUTFResultContainer(sourceIllegal) |
438 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
439 | "\xaa\xb0\xbb\xbf\xaa\xa0\x8f" )); |
440 | |
441 | // All continuation bytes (0x80--0xbf). |
442 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
443 | ConvertUTFResultContainer(sourceIllegal) |
444 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
445 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
446 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
447 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
448 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
449 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
450 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
451 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
452 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
453 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
454 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
455 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
456 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
457 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
458 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
459 | 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
460 | "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" |
461 | "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" |
462 | "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf" |
463 | "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" )); |
464 | |
465 | // |
466 | // Lonely start bytes |
467 | // |
468 | |
469 | // Start bytes of 2-byte sequences (0xc0--0xdf). |
470 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
471 | ConvertUTFResultContainer(sourceIllegal) |
472 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
473 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
474 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
475 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
476 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
477 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
478 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
479 | 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
480 | "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" |
481 | "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" )); |
482 | |
483 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
484 | ConvertUTFResultContainer(sourceIllegal) |
485 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
486 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
487 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
488 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
489 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
490 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
491 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
492 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
493 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
494 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
495 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
496 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
497 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
498 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
499 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
500 | 0xfffd, 0x0020, 0xfffd, 0x0020), |
501 | "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20" |
502 | "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20" |
503 | "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20" |
504 | "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20" )); |
505 | |
506 | // Start bytes of 3-byte sequences (0xe0--0xef). |
507 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
508 | ConvertUTFResultContainer(sourceIllegal) |
509 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
510 | 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
511 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
512 | 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
513 | "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" )); |
514 | |
515 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
516 | ConvertUTFResultContainer(sourceIllegal) |
517 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
518 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
519 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
520 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
521 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
522 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
523 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
524 | 0xfffd, 0x0020, 0xfffd, 0x0020), |
525 | "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20" |
526 | "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20" )); |
527 | |
528 | // Start bytes of 4-byte sequences (0xf0--0xf7). |
529 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
530 | ConvertUTFResultContainer(sourceIllegal) |
531 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, |
532 | 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
533 | "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" )); |
534 | |
535 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
536 | ConvertUTFResultContainer(sourceIllegal) |
537 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
538 | 0xfffd, 0x0020, 0xfffd, 0x0020) |
539 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
540 | 0xfffd, 0x0020, 0xfffd, 0x0020), |
541 | "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20" )); |
542 | |
543 | // Start bytes of 5-byte sequences (0xf8--0xfb). |
544 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
545 | ConvertUTFResultContainer(sourceIllegal) |
546 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
547 | "\xf8\xf9\xfa\xfb" )); |
548 | |
549 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
550 | ConvertUTFResultContainer(sourceIllegal) |
551 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
552 | 0xfffd, 0x0020, 0xfffd, 0x0020), |
553 | "\xf8\x20\xf9\x20\xfa\x20\xfb\x20" )); |
554 | |
555 | // Start bytes of 6-byte sequences (0xfc--0xfd). |
556 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
557 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
558 | "\xfc\xfd" )); |
559 | |
560 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
561 | ConvertUTFResultContainer(sourceIllegal) |
562 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020), |
563 | "\xfc\x20\xfd\x20" )); |
564 | |
565 | // |
566 | // Other bytes (0xc0--0xc1, 0xfe--0xff). |
567 | // |
568 | |
569 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
570 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0" )); |
571 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
572 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1" )); |
573 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
574 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe" )); |
575 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
576 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff" )); |
577 | |
578 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
579 | ConvertUTFResultContainer(sourceIllegal) |
580 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
581 | "\xc0\xc1\xfe\xff" )); |
582 | |
583 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
584 | ConvertUTFResultContainer(sourceIllegal) |
585 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
586 | "\xfe\xfe\xff\xff" )); |
587 | |
588 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
589 | ConvertUTFResultContainer(sourceIllegal) |
590 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
591 | "\xfe\x80\x80\x80\x80\x80" )); |
592 | |
593 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
594 | ConvertUTFResultContainer(sourceIllegal) |
595 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
596 | "\xff\x80\x80\x80\x80\x80" )); |
597 | |
598 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
599 | ConvertUTFResultContainer(sourceIllegal) |
600 | .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020, |
601 | 0xfffd, 0x0020, 0xfffd, 0x0020), |
602 | "\xc0\x20\xc1\x20\xfe\x20\xff\x20" )); |
603 | |
604 | // |
605 | // Sequences with one continuation byte missing |
606 | // |
607 | |
608 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
609 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2" )); |
610 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
611 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf" )); |
612 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
613 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
614 | "\xe0\xa0" )); |
615 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
616 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
617 | "\xe0\xbf" )); |
618 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
619 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
620 | "\xe1\x80" )); |
621 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
622 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
623 | "\xec\xbf" )); |
624 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
625 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
626 | "\xed\x80" )); |
627 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
628 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
629 | "\xed\x9f" )); |
630 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
631 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
632 | "\xee\x80" )); |
633 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
634 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
635 | "\xef\xbf" )); |
636 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
637 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
638 | "\xf0\x90\x80" )); |
639 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
640 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
641 | "\xf0\xbf\xbf" )); |
642 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
643 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
644 | "\xf1\x80\x80" )); |
645 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
646 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
647 | "\xf3\xbf\xbf" )); |
648 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
649 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
650 | "\xf4\x80\x80" )); |
651 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
652 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
653 | "\xf4\x8f\xbf" )); |
654 | |
655 | // Overlong sequences with one trailing byte missing. |
656 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
657 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
658 | "\xc0" )); |
659 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
660 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
661 | "\xc1" )); |
662 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
663 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
664 | "\xe0\x80" )); |
665 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
666 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
667 | "\xe0\x9f" )); |
668 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
669 | ConvertUTFResultContainer(sourceIllegal) |
670 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
671 | "\xf0\x80\x80" )); |
672 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
673 | ConvertUTFResultContainer(sourceIllegal) |
674 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
675 | "\xf0\x8f\x80" )); |
676 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
677 | ConvertUTFResultContainer(sourceIllegal) |
678 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
679 | "\xf8\x80\x80\x80" )); |
680 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
681 | ConvertUTFResultContainer(sourceIllegal) |
682 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
683 | "\xfc\x80\x80\x80\x80" )); |
684 | |
685 | // Sequences that represent surrogates with one trailing byte missing. |
686 | // High surrogates |
687 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
688 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
689 | "\xed\xa0" )); |
690 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
691 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
692 | "\xed\xac" )); |
693 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
694 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
695 | "\xed\xaf" )); |
696 | // Low surrogates |
697 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
698 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
699 | "\xed\xb0" )); |
700 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
701 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
702 | "\xed\xb4" )); |
703 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
704 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
705 | "\xed\xbf" )); |
706 | |
707 | // Ill-formed 4-byte sequences. |
708 | // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx |
709 | // U+1100xx (invalid) |
710 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
711 | ConvertUTFResultContainer(sourceIllegal) |
712 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
713 | "\xf4\x90\x80" )); |
  // U+13FFxx (invalid)
715 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
716 | ConvertUTFResultContainer(sourceIllegal) |
717 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
718 | "\xf4\xbf\xbf" )); |
719 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
720 | ConvertUTFResultContainer(sourceIllegal) |
721 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
722 | "\xf5\x80\x80" )); |
723 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
724 | ConvertUTFResultContainer(sourceIllegal) |
725 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
726 | "\xf6\x80\x80" )); |
727 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
728 | ConvertUTFResultContainer(sourceIllegal) |
729 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
730 | "\xf7\x80\x80" )); |
  // U+1FFFxx (invalid)
732 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
733 | ConvertUTFResultContainer(sourceIllegal) |
734 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
735 | "\xf7\xbf\xbf" )); |
736 | |
737 | // Ill-formed 5-byte sequences. |
738 | // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
739 | // U+2000xx (invalid) |
740 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
741 | ConvertUTFResultContainer(sourceIllegal) |
742 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
743 | "\xf8\x88\x80\x80" )); |
744 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
745 | ConvertUTFResultContainer(sourceIllegal) |
746 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
747 | "\xf8\xbf\xbf\xbf" )); |
748 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
749 | ConvertUTFResultContainer(sourceIllegal) |
750 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
751 | "\xf9\x80\x80\x80" )); |
752 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
753 | ConvertUTFResultContainer(sourceIllegal) |
754 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
755 | "\xfa\x80\x80\x80" )); |
756 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
757 | ConvertUTFResultContainer(sourceIllegal) |
758 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
759 | "\xfb\x80\x80\x80" )); |
760 | // U+3FFFFxx (invalid) |
761 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
762 | ConvertUTFResultContainer(sourceIllegal) |
763 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
764 | "\xfb\xbf\xbf\xbf" )); |
765 | |
766 | // Ill-formed 6-byte sequences. |
767 | // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
768 | // U+40000xx (invalid) |
769 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
770 | ConvertUTFResultContainer(sourceIllegal) |
771 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
772 | "\xfc\x84\x80\x80\x80" )); |
773 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
774 | ConvertUTFResultContainer(sourceIllegal) |
775 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
776 | "\xfc\xbf\xbf\xbf\xbf" )); |
777 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
778 | ConvertUTFResultContainer(sourceIllegal) |
779 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
780 | "\xfd\x80\x80\x80\x80" )); |
781 | // U+7FFFFFxx (invalid) |
782 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
783 | ConvertUTFResultContainer(sourceIllegal) |
784 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
785 | "\xfd\xbf\xbf\xbf\xbf" )); |
786 | |
787 | // |
788 | // Sequences with two continuation bytes missing |
789 | // |
790 | |
791 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
792 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
793 | "\xf0\x90" )); |
794 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
795 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
796 | "\xf0\xbf" )); |
797 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
798 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
799 | "\xf1\x80" )); |
800 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
801 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
802 | "\xf3\xbf" )); |
803 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
804 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
805 | "\xf4\x80" )); |
806 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
807 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), |
808 | "\xf4\x8f" )); |
809 | |
810 | // Overlong sequences with two trailing bytes missing. |
811 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
812 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0" )); |
813 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
814 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
815 | "\xf0\x80" )); |
816 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
817 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
818 | "\xf0\x8f" )); |
819 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
820 | ConvertUTFResultContainer(sourceIllegal) |
821 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
822 | "\xf8\x80\x80" )); |
823 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
824 | ConvertUTFResultContainer(sourceIllegal) |
825 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
826 | "\xfc\x80\x80\x80" )); |
827 | |
828 | // Sequences that represent surrogates with two trailing bytes missing. |
829 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
830 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed" )); |
831 | |
832 | // Ill-formed 4-byte sequences. |
833 | // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx |
834 | // U+110yxx (invalid) |
835 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
836 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
837 | "\xf4\x90" )); |
838 | // U+13Fyxx (invalid) |
839 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
840 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
841 | "\xf4\xbf" )); |
842 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
843 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
844 | "\xf5\x80" )); |
845 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
846 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
847 | "\xf6\x80" )); |
848 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
849 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
850 | "\xf7\x80" )); |
851 | // U+1FFyxx (invalid) |
852 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
853 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
854 | "\xf7\xbf" )); |
855 | |
856 | // Ill-formed 5-byte sequences. |
857 | // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
858 | // U+200yxx (invalid) |
859 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
860 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
861 | "\xf8\x88\x80" )); |
862 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
863 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
864 | "\xf8\xbf\xbf" )); |
865 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
866 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
867 | "\xf9\x80\x80" )); |
868 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
869 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
870 | "\xfa\x80\x80" )); |
871 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
872 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
873 | "\xfb\x80\x80" )); |
874 | // U+3FFFyxx (invalid) |
875 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
876 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
877 | "\xfb\xbf\xbf" )); |
878 | |
879 | // Ill-formed 6-byte sequences. |
880 | // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
881 | // U+4000yxx (invalid) |
882 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
883 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
884 | "\xfc\x84\x80\x80" )); |
885 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
886 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
887 | "\xfc\xbf\xbf\xbf" )); |
888 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
889 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
890 | "\xfd\x80\x80\x80" )); |
891 | // U+7FFFFyxx (invalid) |
892 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
893 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
894 | "\xfd\xbf\xbf\xbf" )); |
895 | |
896 | // |
897 | // Sequences with three continuation bytes missing |
898 | // |
899 | |
900 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
901 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0" )); |
902 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
903 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1" )); |
904 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
905 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2" )); |
906 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
907 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3" )); |
908 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
909 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4" )); |
910 | |
911 | // Broken overlong sequences. |
912 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
913 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0" )); |
914 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
915 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
916 | "\xf8\x80" )); |
917 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
918 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
919 | "\xfc\x80\x80" )); |
920 | |
921 | // Ill-formed 4-byte sequences. |
922 | // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx |
923 | // U+14yyxx (invalid) |
924 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
925 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5" )); |
926 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
927 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6" )); |
928 | // U+1Cyyxx (invalid) |
929 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
930 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7" )); |
931 | |
932 | // Ill-formed 5-byte sequences. |
933 | // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
934 | // U+20yyxx (invalid) |
935 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
936 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
937 | "\xf8\x88" )); |
938 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
939 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
940 | "\xf8\xbf" )); |
941 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
942 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
943 | "\xf9\x80" )); |
944 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
945 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
946 | "\xfa\x80" )); |
947 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
948 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
949 | "\xfb\x80" )); |
950 | // U+3FCyyxx (invalid) |
951 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
952 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
953 | "\xfb\xbf" )); |
954 | |
955 | // Ill-formed 6-byte sequences. |
956 | // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
957 | // U+400yyxx (invalid) |
958 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
959 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
960 | "\xfc\x84\x80" )); |
961 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
962 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
963 | "\xfc\xbf\xbf" )); |
964 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
965 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
966 | "\xfd\x80\x80" )); |
967 | // U+7FFCyyxx (invalid) |
968 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
969 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd), |
970 | "\xfd\xbf\xbf" )); |
971 | |
972 | // |
973 | // Sequences with four continuation bytes missing |
974 | // |
975 | |
976 | // Ill-formed 5-byte sequences. |
977 | // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
978 | // U+uzyyxx (invalid) |
979 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
980 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8" )); |
981 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
982 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9" )); |
983 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
984 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa" )); |
985 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
986 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb" )); |
987 | // U+3zyyxx (invalid) |
988 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
989 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb" )); |
990 | |
991 | // Broken overlong sequences. |
992 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
993 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8" )); |
994 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
995 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
996 | "\xfc\x80" )); |
997 | |
998 | // Ill-formed 6-byte sequences. |
999 | // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
1000 | // U+uzzyyxx (invalid) |
1001 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1002 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1003 | "\xfc\x84" )); |
1004 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1005 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1006 | "\xfc\xbf" )); |
1007 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1008 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1009 | "\xfd\x80" )); |
1010 | // U+7Fzzyyxx (invalid) |
1011 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1012 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1013 | "\xfd\xbf" )); |
1014 | |
1015 | // |
1016 | // Sequences with five continuation bytes missing |
1017 | // |
1018 | |
1019 | // Ill-formed 6-byte sequences. |
1020 | // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx |
1021 | // U+uzzyyxx (invalid) |
1022 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1023 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc" )); |
1024 | // U+uuzzyyxx (invalid) |
1025 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1026 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd" )); |
1027 | |
1028 | // |
1029 | // Consecutive sequences with trailing bytes missing |
1030 | // |
1031 | |
1032 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1033 | ConvertUTFResultContainer(sourceIllegal) |
1034 | .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) |
1035 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) |
1036 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd) |
1037 | .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd) |
1038 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd) |
1039 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1040 | "\xc0" "\xe0\x80" "\xf0\x80\x80" |
1041 | "\xf8\x80\x80\x80" |
1042 | "\xfc\x80\x80\x80\x80" |
1043 | "\xdf" "\xef\xbf" "\xf7\xbf\xbf" |
1044 | "\xfb\xbf\xbf\xbf" |
1045 | "\xfd\xbf\xbf\xbf\xbf" )); |
1046 | |
1047 | // |
1048 | // Overlong UTF-8 sequences |
1049 | // |
1050 | |
1051 | // U+002F SOLIDUS |
1052 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1053 | ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f" )); |
1054 | |
1055 | // Overlong sequences of the above. |
1056 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1057 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1058 | "\xc0\xaf" )); |
1059 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1060 | ConvertUTFResultContainer(sourceIllegal) |
1061 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1062 | "\xe0\x80\xaf" )); |
1063 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1064 | ConvertUTFResultContainer(sourceIllegal) |
1065 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1066 | "\xf0\x80\x80\xaf" )); |
1067 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1068 | ConvertUTFResultContainer(sourceIllegal) |
1069 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1070 | "\xf8\x80\x80\x80\xaf" )); |
1071 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1072 | ConvertUTFResultContainer(sourceIllegal) |
1073 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1074 | "\xfc\x80\x80\x80\x80\xaf" )); |
1075 | |
1076 | // U+0000 NULL |
1077 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1078 | ConvertUTFResultContainer(conversionOK).withScalars(0x0000), |
1079 | StringRef("\x00" , 1))); |
1080 | |
1081 | // Overlong sequences of the above. |
1082 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1083 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1084 | "\xc0\x80" )); |
1085 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1086 | ConvertUTFResultContainer(sourceIllegal) |
1087 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1088 | "\xe0\x80\x80" )); |
1089 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1090 | ConvertUTFResultContainer(sourceIllegal) |
1091 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1092 | "\xf0\x80\x80\x80" )); |
1093 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1094 | ConvertUTFResultContainer(sourceIllegal) |
1095 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1096 | "\xf8\x80\x80\x80\x80" )); |
1097 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1098 | ConvertUTFResultContainer(sourceIllegal) |
1099 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1100 | "\xfc\x80\x80\x80\x80\x80" )); |
1101 | |
1102 | // Other overlong sequences. |
1103 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1104 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1105 | "\xc0\xbf" )); |
1106 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1107 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1108 | "\xc1\x80" )); |
1109 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1110 | ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd), |
1111 | "\xc1\xbf" )); |
1112 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1113 | ConvertUTFResultContainer(sourceIllegal) |
1114 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1115 | "\xe0\x9f\xbf" )); |
1116 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1117 | ConvertUTFResultContainer(sourceIllegal) |
1118 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1119 | "\xed\xa0\x80" )); |
1120 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1121 | ConvertUTFResultContainer(sourceIllegal) |
1122 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1123 | "\xed\xbf\xbf" )); |
1124 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1125 | ConvertUTFResultContainer(sourceIllegal) |
1126 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1127 | "\xf0\x8f\x80\x80" )); |
1128 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1129 | ConvertUTFResultContainer(sourceIllegal) |
1130 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1131 | "\xf0\x8f\xbf\xbf" )); |
1132 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1133 | ConvertUTFResultContainer(sourceIllegal) |
1134 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1135 | "\xf8\x87\xbf\xbf\xbf" )); |
1136 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1137 | ConvertUTFResultContainer(sourceIllegal) |
1138 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1139 | "\xfc\x83\xbf\xbf\xbf\xbf" )); |
1140 | |
1141 | // |
1142 | // Isolated surrogates |
1143 | // |
1144 | |
1145 | // Unicode 6.3.0: |
1146 | // |
1147 | // D71. High-surrogate code point: A Unicode code point in the range |
1148 | // U+D800 to U+DBFF. |
1149 | // |
1150 | // D73. Low-surrogate code point: A Unicode code point in the range |
1151 | // U+DC00 to U+DFFF. |
1152 | |
1153 | // Note: U+E0100 is <DB40 DD00> in UTF16. |
1154 | |
1155 | // High surrogates |
1156 | |
1157 | // U+D800 |
1158 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1159 | ConvertUTFResultContainer(sourceIllegal) |
1160 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1161 | "\xed\xa0\x80" )); |
1162 | |
1163 | // U+DB40 |
1164 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1165 | ConvertUTFResultContainer(sourceIllegal) |
1166 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1167 | "\xed\xac\xa0" )); |
1168 | |
1169 | // U+DBFF |
1170 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1171 | ConvertUTFResultContainer(sourceIllegal) |
1172 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1173 | "\xed\xaf\xbf" )); |
1174 | |
1175 | // Low surrogates |
1176 | |
1177 | // U+DC00 |
1178 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1179 | ConvertUTFResultContainer(sourceIllegal) |
1180 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1181 | "\xed\xb0\x80" )); |
1182 | |
1183 | // U+DD00 |
1184 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1185 | ConvertUTFResultContainer(sourceIllegal) |
1186 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1187 | "\xed\xb4\x80" )); |
1188 | |
1189 | // U+DFFF |
1190 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1191 | ConvertUTFResultContainer(sourceIllegal) |
1192 | .withScalars(0xfffd, 0xfffd, 0xfffd), |
1193 | "\xed\xbf\xbf" )); |
1194 | |
1195 | // Surrogate pairs |
1196 | |
1197 | // U+D800 U+DC00 |
1198 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1199 | ConvertUTFResultContainer(sourceIllegal) |
1200 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1201 | "\xed\xa0\x80\xed\xb0\x80" )); |
1202 | |
1203 | // U+D800 U+DD00 |
1204 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1205 | ConvertUTFResultContainer(sourceIllegal) |
1206 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1207 | "\xed\xa0\x80\xed\xb4\x80" )); |
1208 | |
1209 | // U+D800 U+DFFF |
1210 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1211 | ConvertUTFResultContainer(sourceIllegal) |
1212 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1213 | "\xed\xa0\x80\xed\xbf\xbf" )); |
1214 | |
1215 | // U+DB40 U+DC00 |
1216 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1217 | ConvertUTFResultContainer(sourceIllegal) |
1218 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1219 | "\xed\xac\xa0\xed\xb0\x80" )); |
1220 | |
1221 | // U+DB40 U+DD00 |
1222 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1223 | ConvertUTFResultContainer(sourceIllegal) |
1224 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1225 | "\xed\xac\xa0\xed\xb4\x80" )); |
1226 | |
1227 | // U+DB40 U+DFFF |
1228 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1229 | ConvertUTFResultContainer(sourceIllegal) |
1230 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1231 | "\xed\xac\xa0\xed\xbf\xbf" )); |
1232 | |
1233 | // U+DBFF U+DC00 |
1234 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1235 | ConvertUTFResultContainer(sourceIllegal) |
1236 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1237 | "\xed\xaf\xbf\xed\xb0\x80" )); |
1238 | |
1239 | // U+DBFF U+DD00 |
1240 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1241 | ConvertUTFResultContainer(sourceIllegal) |
1242 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1243 | "\xed\xaf\xbf\xed\xb4\x80" )); |
1244 | |
1245 | // U+DBFF U+DFFF |
1246 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1247 | ConvertUTFResultContainer(sourceIllegal) |
1248 | .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd), |
1249 | "\xed\xaf\xbf\xed\xbf\xbf" )); |
1250 | |
1251 | // |
1252 | // Noncharacters |
1253 | // |
1254 | |
1255 | // Unicode 6.3.0: |
1256 | // |
1257 | // D14. Noncharacter: A code point that is permanently reserved for |
1258 | // internal use and that should never be interchanged. Noncharacters |
1259 | // consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10) |
1260 | // and the values U+FDD0..U+FDEF. |
1261 | |
1262 | // U+FFFE |
1263 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1264 | ConvertUTFResultContainer(conversionOK).withScalars(0xfffe), |
1265 | "\xef\xbf\xbe" )); |
1266 | |
1267 | // U+FFFF |
1268 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1269 | ConvertUTFResultContainer(conversionOK).withScalars(0xffff), |
1270 | "\xef\xbf\xbf" )); |
1271 | |
1272 | // U+1FFFE |
1273 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1274 | ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe), |
1275 | "\xf0\x9f\xbf\xbe" )); |
1276 | |
1277 | // U+1FFFF |
1278 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1279 | ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff), |
1280 | "\xf0\x9f\xbf\xbf" )); |
1281 | |
1282 | // U+2FFFE |
1283 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1284 | ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe), |
1285 | "\xf0\xaf\xbf\xbe" )); |
1286 | |
1287 | // U+2FFFF |
1288 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1289 | ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff), |
1290 | "\xf0\xaf\xbf\xbf" )); |
1291 | |
1292 | // U+3FFFE |
1293 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1294 | ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe), |
1295 | "\xf0\xbf\xbf\xbe" )); |
1296 | |
1297 | // U+3FFFF |
1298 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1299 | ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff), |
1300 | "\xf0\xbf\xbf\xbf" )); |
1301 | |
1302 | // U+4FFFE |
1303 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1304 | ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe), |
1305 | "\xf1\x8f\xbf\xbe" )); |
1306 | |
1307 | // U+4FFFF |
1308 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1309 | ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff), |
1310 | "\xf1\x8f\xbf\xbf" )); |
1311 | |
1312 | // U+5FFFE |
1313 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1314 | ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe), |
1315 | "\xf1\x9f\xbf\xbe" )); |
1316 | |
1317 | // U+5FFFF |
1318 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1319 | ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff), |
1320 | "\xf1\x9f\xbf\xbf" )); |
1321 | |
1322 | // U+6FFFE |
1323 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1324 | ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe), |
1325 | "\xf1\xaf\xbf\xbe" )); |
1326 | |
1327 | // U+6FFFF |
1328 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1329 | ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff), |
1330 | "\xf1\xaf\xbf\xbf" )); |
1331 | |
1332 | // U+7FFFE |
1333 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1334 | ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe), |
1335 | "\xf1\xbf\xbf\xbe" )); |
1336 | |
1337 | // U+7FFFF |
1338 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1339 | ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff), |
1340 | "\xf1\xbf\xbf\xbf" )); |
1341 | |
1342 | // U+8FFFE |
1343 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1344 | ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe), |
1345 | "\xf2\x8f\xbf\xbe" )); |
1346 | |
1347 | // U+8FFFF |
1348 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1349 | ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff), |
1350 | "\xf2\x8f\xbf\xbf" )); |
1351 | |
1352 | // U+9FFFE |
1353 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1354 | ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe), |
1355 | "\xf2\x9f\xbf\xbe" )); |
1356 | |
1357 | // U+9FFFF |
1358 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1359 | ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff), |
1360 | "\xf2\x9f\xbf\xbf" )); |
1361 | |
1362 | // U+AFFFE |
1363 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1364 | ConvertUTFResultContainer(conversionOK).withScalars(0xafffe), |
1365 | "\xf2\xaf\xbf\xbe" )); |
1366 | |
1367 | // U+AFFFF |
1368 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1369 | ConvertUTFResultContainer(conversionOK).withScalars(0xaffff), |
1370 | "\xf2\xaf\xbf\xbf" )); |
1371 | |
1372 | // U+BFFFE |
1373 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1374 | ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe), |
1375 | "\xf2\xbf\xbf\xbe" )); |
1376 | |
1377 | // U+BFFFF |
1378 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1379 | ConvertUTFResultContainer(conversionOK).withScalars(0xbffff), |
1380 | "\xf2\xbf\xbf\xbf" )); |
1381 | |
1382 | // U+CFFFE |
1383 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1384 | ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe), |
1385 | "\xf3\x8f\xbf\xbe" )); |
1386 | |
1387 | // U+CFFFF |
1388 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1389 | ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF), |
1390 | "\xf3\x8f\xbf\xbf" )); |
1391 | |
1392 | // U+DFFFE |
1393 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1394 | ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe), |
1395 | "\xf3\x9f\xbf\xbe" )); |
1396 | |
1397 | // U+DFFFF |
1398 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1399 | ConvertUTFResultContainer(conversionOK).withScalars(0xdffff), |
1400 | "\xf3\x9f\xbf\xbf" )); |
1401 | |
1402 | // U+EFFFE |
1403 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1404 | ConvertUTFResultContainer(conversionOK).withScalars(0xefffe), |
1405 | "\xf3\xaf\xbf\xbe" )); |
1406 | |
1407 | // U+EFFFF |
1408 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1409 | ConvertUTFResultContainer(conversionOK).withScalars(0xeffff), |
1410 | "\xf3\xaf\xbf\xbf" )); |
1411 | |
1412 | // U+FFFFE |
1413 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1414 | ConvertUTFResultContainer(conversionOK).withScalars(0xffffe), |
1415 | "\xf3\xbf\xbf\xbe" )); |
1416 | |
1417 | // U+FFFFF |
1418 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1419 | ConvertUTFResultContainer(conversionOK).withScalars(0xfffff), |
1420 | "\xf3\xbf\xbf\xbf" )); |
1421 | |
1422 | // U+10FFFE |
1423 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1424 | ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe), |
1425 | "\xf4\x8f\xbf\xbe" )); |
1426 | |
1427 | // U+10FFFF |
1428 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1429 | ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff), |
1430 | "\xf4\x8f\xbf\xbf" )); |
1431 | |
1432 | // U+FDD0 |
1433 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1434 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0), |
1435 | "\xef\xb7\x90" )); |
1436 | |
1437 | // U+FDD1 |
1438 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1439 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1), |
1440 | "\xef\xb7\x91" )); |
1441 | |
1442 | // U+FDD2 |
1443 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1444 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2), |
1445 | "\xef\xb7\x92" )); |
1446 | |
1447 | // U+FDD3 |
1448 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1449 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3), |
1450 | "\xef\xb7\x93" )); |
1451 | |
1452 | // U+FDD4 |
1453 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1454 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4), |
1455 | "\xef\xb7\x94" )); |
1456 | |
1457 | // U+FDD5 |
1458 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1459 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5), |
1460 | "\xef\xb7\x95" )); |
1461 | |
1462 | // U+FDD6 |
1463 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1464 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6), |
1465 | "\xef\xb7\x96" )); |
1466 | |
1467 | // U+FDD7 |
1468 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1469 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7), |
1470 | "\xef\xb7\x97" )); |
1471 | |
1472 | // U+FDD8 |
1473 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1474 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8), |
1475 | "\xef\xb7\x98" )); |
1476 | |
1477 | // U+FDD9 |
1478 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1479 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9), |
1480 | "\xef\xb7\x99" )); |
1481 | |
1482 | // U+FDDA |
1483 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1484 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdda), |
1485 | "\xef\xb7\x9a" )); |
1486 | |
1487 | // U+FDDB |
1488 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1489 | ConvertUTFResultContainer(conversionOK).withScalars(0xfddb), |
1490 | "\xef\xb7\x9b" )); |
1491 | |
1492 | // U+FDDC |
1493 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1494 | ConvertUTFResultContainer(conversionOK).withScalars(0xfddc), |
1495 | "\xef\xb7\x9c" )); |
1496 | |
1497 | // U+FDDD |
1498 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1499 | ConvertUTFResultContainer(conversionOK).withScalars(0xfddd), |
1500 | "\xef\xb7\x9d" )); |
1501 | |
1502 | // U+FDDE |
1503 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1504 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdde), |
1505 | "\xef\xb7\x9e" )); |
1506 | |
1507 | // U+FDDF |
1508 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1509 | ConvertUTFResultContainer(conversionOK).withScalars(0xfddf), |
1510 | "\xef\xb7\x9f" )); |
1511 | |
1512 | // U+FDE0 |
1513 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1514 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde0), |
1515 | "\xef\xb7\xa0" )); |
1516 | |
1517 | // U+FDE1 |
1518 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1519 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde1), |
1520 | "\xef\xb7\xa1" )); |
1521 | |
1522 | // U+FDE2 |
1523 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1524 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde2), |
1525 | "\xef\xb7\xa2" )); |
1526 | |
1527 | // U+FDE3 |
1528 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1529 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde3), |
1530 | "\xef\xb7\xa3" )); |
1531 | |
1532 | // U+FDE4 |
1533 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1534 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde4), |
1535 | "\xef\xb7\xa4" )); |
1536 | |
1537 | // U+FDE5 |
1538 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1539 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde5), |
1540 | "\xef\xb7\xa5" )); |
1541 | |
1542 | // U+FDE6 |
1543 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1544 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde6), |
1545 | "\xef\xb7\xa6" )); |
1546 | |
1547 | // U+FDE7 |
1548 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1549 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde7), |
1550 | "\xef\xb7\xa7" )); |
1551 | |
1552 | // U+FDE8 |
1553 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1554 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde8), |
1555 | "\xef\xb7\xa8" )); |
1556 | |
1557 | // U+FDE9 |
1558 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1559 | ConvertUTFResultContainer(conversionOK).withScalars(0xfde9), |
1560 | "\xef\xb7\xa9" )); |
1561 | |
1562 | // U+FDEA |
1563 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1564 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdea), |
1565 | "\xef\xb7\xaa" )); |
1566 | |
1567 | // U+FDEB |
1568 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1569 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb), |
1570 | "\xef\xb7\xab" )); |
1571 | |
1572 | // U+FDEC |
1573 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1574 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdec), |
1575 | "\xef\xb7\xac" )); |
1576 | |
1577 | // U+FDED |
1578 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1579 | ConvertUTFResultContainer(conversionOK).withScalars(0xfded), |
1580 | "\xef\xb7\xad" )); |
1581 | |
1582 | // U+FDEE |
1583 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1584 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdee), |
1585 | "\xef\xb7\xae" )); |
1586 | |
1587 | // U+FDEF |
1588 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1589 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdef), |
1590 | "\xef\xb7\xaf" )); |
1591 | |
1592 | // U+FDF0 |
1593 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1594 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0), |
1595 | "\xef\xb7\xb0" )); |
1596 | |
1597 | // U+FDF1 |
1598 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1599 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1), |
1600 | "\xef\xb7\xb1" )); |
1601 | |
1602 | // U+FDF2 |
1603 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1604 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2), |
1605 | "\xef\xb7\xb2" )); |
1606 | |
1607 | // U+FDF3 |
1608 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1609 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3), |
1610 | "\xef\xb7\xb3" )); |
1611 | |
1612 | // U+FDF4 |
1613 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1614 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4), |
1615 | "\xef\xb7\xb4" )); |
1616 | |
1617 | // U+FDF5 |
1618 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1619 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5), |
1620 | "\xef\xb7\xb5" )); |
1621 | |
1622 | // U+FDF6 |
1623 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1624 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6), |
1625 | "\xef\xb7\xb6" )); |
1626 | |
1627 | // U+FDF7 |
1628 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1629 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7), |
1630 | "\xef\xb7\xb7" )); |
1631 | |
1632 | // U+FDF8 |
1633 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1634 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8), |
1635 | "\xef\xb7\xb8" )); |
1636 | |
1637 | // U+FDF9 |
1638 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1639 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9), |
1640 | "\xef\xb7\xb9" )); |
1641 | |
1642 | // U+FDFA |
1643 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1644 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa), |
1645 | "\xef\xb7\xba" )); |
1646 | |
1647 | // U+FDFB |
1648 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1649 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb), |
1650 | "\xef\xb7\xbb" )); |
1651 | |
1652 | // U+FDFC |
1653 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1654 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc), |
1655 | "\xef\xb7\xbc" )); |
1656 | |
1657 | // U+FDFD |
1658 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1659 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd), |
1660 | "\xef\xb7\xbd" )); |
1661 | |
1662 | // U+FDFE |
1663 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1664 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe), |
1665 | "\xef\xb7\xbe" )); |
1666 | |
1667 | // U+FDFF |
1668 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1669 | ConvertUTFResultContainer(conversionOK).withScalars(0xfdff), |
1670 | "\xef\xb7\xbf" )); |
1671 | } |
1672 | |
1673 | TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) { |
1674 | // U+0041 LATIN CAPITAL LETTER A |
1675 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1676 | ConvertUTFResultContainer(conversionOK).withScalars(0x0041), |
1677 | "\x41" , true)); |
1678 | |
1679 | // |
1680 | // Sequences with one continuation byte missing |
1681 | // |
1682 | |
1683 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1684 | ConvertUTFResultContainer(sourceExhausted), |
1685 | "\xc2" , true)); |
1686 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1687 | ConvertUTFResultContainer(sourceExhausted), |
1688 | "\xdf" , true)); |
1689 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1690 | ConvertUTFResultContainer(sourceExhausted), |
1691 | "\xe0\xa0" , true)); |
1692 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1693 | ConvertUTFResultContainer(sourceExhausted), |
1694 | "\xe0\xbf" , true)); |
1695 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1696 | ConvertUTFResultContainer(sourceExhausted), |
1697 | "\xe1\x80" , true)); |
1698 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1699 | ConvertUTFResultContainer(sourceExhausted), |
1700 | "\xec\xbf" , true)); |
1701 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1702 | ConvertUTFResultContainer(sourceExhausted), |
1703 | "\xed\x80" , true)); |
1704 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1705 | ConvertUTFResultContainer(sourceExhausted), |
1706 | "\xed\x9f" , true)); |
1707 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1708 | ConvertUTFResultContainer(sourceExhausted), |
1709 | "\xee\x80" , true)); |
1710 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1711 | ConvertUTFResultContainer(sourceExhausted), |
1712 | "\xef\xbf" , true)); |
1713 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1714 | ConvertUTFResultContainer(sourceExhausted), |
1715 | "\xf0\x90\x80" , true)); |
1716 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1717 | ConvertUTFResultContainer(sourceExhausted), |
1718 | "\xf0\xbf\xbf" , true)); |
1719 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1720 | ConvertUTFResultContainer(sourceExhausted), |
1721 | "\xf1\x80\x80" , true)); |
1722 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1723 | ConvertUTFResultContainer(sourceExhausted), |
1724 | "\xf3\xbf\xbf" , true)); |
1725 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1726 | ConvertUTFResultContainer(sourceExhausted), |
1727 | "\xf4\x80\x80" , true)); |
1728 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1729 | ConvertUTFResultContainer(sourceExhausted), |
1730 | "\xf4\x8f\xbf" , true)); |
1731 | |
1732 | EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars( |
1733 | ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041), |
1734 | "\x41\xc2" , true)); |
1735 | } |
1736 | |
1737 | |