1 | // |
2 | // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) |
3 | // Copyright (c) 2022-2023 Alexander Grund |
4 | // |
5 | // Distributed under the Boost Software License, Version 1.0. |
6 | // https://www.boost.org/LICENSE_1_0.txt |
7 | |
8 | #include <boost/locale/encoding.hpp> |
9 | #include <boost/locale/generator.hpp> |
10 | #include <algorithm> |
11 | |
12 | #include "boostLocale/test/tools.hpp" |
13 | #include "boostLocale/test/unit_test.hpp" |
14 | |
15 | const bool test_iso_8859_8 = |
16 | #if defined(BOOST_LOCALE_WITH_ICU) || defined(BOOST_LOCALE_WITH_ICONV) |
17 | true; |
18 | #else |
19 | hasWinCodepage(28598); |
20 | #endif |
21 | |
#if defined(BOOST_LOCALE_WITH_ICONV)
/// Reproduce issue #206 to detect a faulty IConv:
/// run a UTF decoder for ISO-2022-CN (IConv backend, skip method) over a single
/// character and report whether the raised std::runtime_error mentions "IConv is faulty".
static bool isFaultyIconv()
{
    namespace blc = boost::locale::conv;
    using blc::detail::conv_backend;
    auto decoder = blc::detail::make_utf_decoder<char>("ISO-2022-CN", blc::skip, conv_backend::IConv);
    try {
        decoder->convert("实");
    } catch(const std::runtime_error& error) {                       // LCOV_EXCL_LINE
        const std::string message = error.what();                    // LCOV_EXCL_LINE
        return message.find("IConv is faulty") != std::string::npos; // LCOV_EXCL_LINE
    }
    return false;
}
#else
/// No IConv backend compiled in, hence nothing that could be faulty.
constexpr bool isFaultyIconv()
{
    return false;
}
#endif
41 | |
42 | constexpr boost::locale::conv::detail::conv_backend all_conv_backends[] = { |
43 | #ifdef BOOST_LOCALE_WITH_ICONV |
44 | boost::locale::conv::detail::conv_backend::IConv, |
45 | #endif |
46 | #ifdef BOOST_LOCALE_WITH_ICU |
47 | boost::locale::conv::detail::conv_backend::ICU, |
48 | #endif |
49 | #if BOOST_LOCALE_USE_WIN32_API |
50 | boost::locale::conv::detail::conv_backend::WinAPI, |
51 | #endif |
52 | }; |
53 | |
54 | std::ostream& operator<<(std::ostream& s, boost::locale::conv::detail::conv_backend impl) |
55 | { |
56 | using boost::locale::conv::detail::conv_backend; |
57 | switch(impl) { |
58 | case conv_backend::Default: return s << "[Default]" ; // LCOV_EXCL_LINE |
59 | case conv_backend::IConv: return s << "[IConv]" ; |
60 | case conv_backend::ICU: return s << "[ICU]" ; |
61 | case conv_backend::WinAPI: return s << "[WinAPI]" ; |
62 | } |
63 | return s; // LCOV_EXCL_LINE |
64 | } |
65 | |
/// Check that evaluating `EXPR` throws boost::locale::conv::conversion_error
#define TEST_FAIL_CONVERSION(EXPR) TEST_THROWS(EXPR, boost::locale::conv::conversion_error)
67 | |
68 | template<typename Char> |
69 | void test_to_utf_for_impls(const std::string& source, |
70 | const std::basic_string<Char>& target, |
71 | const std::string& encoding, |
72 | const bool expectSuccess = true, |
73 | const bool test_default = true) |
74 | { |
75 | if(test_default) { |
76 | boost::locale::conv::utf_encoder<Char> conv(encoding); |
77 | TEST_EQ(conv(source), target); |
78 | } |
79 | for(const auto impl : all_conv_backends) { |
80 | std::cout << "----- " << impl << '\n'; |
81 | using boost::locale::conv::invalid_charset_error; |
82 | try { |
83 | auto convPtr = |
84 | boost::locale::conv::detail::make_utf_encoder<Char>(encoding, boost::locale::conv::skip, impl); |
85 | TEST_EQ(convPtr->convert(source), target); |
86 | } catch(invalid_charset_error&) { |
87 | std::cout << "--- Charset not supported\n" ; // LCOV_EXCL_LINE |
88 | continue; // LCOV_EXCL_LINE |
89 | } |
90 | if(!expectSuccess) { |
91 | auto convPtr = |
92 | boost::locale::conv::detail::make_utf_encoder<Char>(encoding, boost::locale::conv::stop, impl); |
93 | TEST_FAIL_CONVERSION(convPtr->convert(source)); |
94 | } |
95 | } |
96 | if(encoding == "UTF-8" ) { |
97 | using boost::locale::conv::utf_to_utf; |
98 | TEST_EQ(utf_to_utf<Char>(source), target); |
99 | if(expectSuccess) |
100 | TEST_EQ(utf_to_utf<char>(source), source); |
101 | else |
102 | TEST_FAIL_CONVERSION(utf_to_utf<Char>(source, boost::locale::conv::stop)); |
103 | } |
104 | } |
105 | |
106 | template<typename Char> |
107 | void test_from_utf_for_impls(const std::basic_string<Char>& source, |
108 | const std::string& target, |
109 | const std::string& encoding, |
110 | const bool expectSuccess = true, |
111 | const bool test_default = true) |
112 | { |
113 | if(test_default) { |
114 | boost::locale::conv::utf_decoder<Char> conv(encoding); |
115 | TEST_EQ(conv(source), target); |
116 | } |
117 | for(const auto impl : all_conv_backends) { |
118 | std::cout << "----- " << impl << '\n'; |
119 | using boost::locale::conv::invalid_charset_error; |
120 | try { |
121 | auto convPtr = |
122 | boost::locale::conv::detail::make_utf_decoder<Char>(encoding, boost::locale::conv::skip, impl); |
123 | TEST_EQ(convPtr->convert(source), target); |
124 | } catch(invalid_charset_error&) { |
125 | std::cout << "--- Charset not supported\n" ; // LCOV_EXCL_LINE |
126 | continue; // LCOV_EXCL_LINE |
127 | } |
128 | if(!expectSuccess) { |
129 | auto convPtr = |
130 | boost::locale::conv::detail::make_utf_decoder<Char>(encoding, boost::locale::conv::stop, impl); |
131 | TEST_FAIL_CONVERSION(convPtr->convert(source)); |
132 | } |
133 | } |
134 | if(encoding == "UTF-8" ) { |
135 | using boost::locale::conv::utf_to_utf; |
136 | TEST_EQ(utf_to_utf<char>(source), target); |
137 | if(expectSuccess) |
138 | TEST_EQ(utf_to_utf<Char>(source), source); |
139 | else |
140 | TEST_FAIL_CONVERSION(utf_to_utf<char>(source, boost::locale::conv::stop)); |
141 | } |
142 | } |
143 | |
144 | template<typename Char> |
145 | void test_to_from_utf(const std::string& source, |
146 | const std::basic_string<Char>& target, |
147 | const std::string& encoding, |
148 | const bool test_default = true) |
149 | { |
150 | std::cout << "-- " << encoding << std::endl; |
151 | |
152 | if(test_default) { |
153 | TEST_EQ(boost::locale::conv::to_utf<Char>(source, encoding), target); |
154 | TEST_EQ(boost::locale::conv::from_utf<Char>(target, encoding), source); |
155 | } |
156 | test_to_utf_for_impls(source, target, encoding, true, test_default); |
157 | test_from_utf_for_impls(target, source, encoding, true, test_default); |
158 | } |
159 | |
160 | template<typename Char> |
161 | void test_error_to_utf(const std::string& source, const std::basic_string<Char>& target, const std::string& encoding) |
162 | { |
163 | using boost::locale::conv::to_utf; |
164 | using boost::locale::conv::stop; |
165 | |
166 | // Default: Replace, no error |
167 | TEST_EQ(to_utf<Char>(source, encoding), target); |
168 | // Test all overloads with method=stop -> error |
169 | // source as string, C-String, range |
170 | TEST_FAIL_CONVERSION(to_utf<Char>(source, encoding, stop)); |
171 | TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), encoding, stop)); |
172 | TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), source.c_str() + source.size(), encoding, stop)); |
173 | // Same but encoding via locale |
174 | const std::locale l = boost::locale::generator{}("en_US." + encoding); |
175 | TEST_FAIL_CONVERSION(to_utf<Char>(source, l, stop)); |
176 | TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), l, stop)); |
177 | TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), source.c_str() + source.size(), l, stop)); |
178 | test_to_utf_for_impls(source, target, encoding, false); |
179 | } |
180 | |
181 | template<typename Char> |
182 | void test_error_from_utf(const std::basic_string<Char>& source, const std::string& target, const std::string& encoding) |
183 | { |
184 | using boost::locale::conv::from_utf; |
185 | using boost::locale::conv::stop; |
186 | |
187 | // Default: Replace, no error |
188 | TEST_EQ(from_utf<Char>(source, encoding), target); |
189 | // Test all overloads with method=stop -> error |
190 | // source as string, C-String, range |
191 | TEST_FAIL_CONVERSION(from_utf<Char>(source, encoding, stop)); |
192 | TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), encoding, stop)); |
193 | TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), source.c_str() + source.size(), encoding, stop)); |
194 | // Same but encoding via locale |
195 | const std::locale l = boost::locale::generator{}("en_US." + encoding); |
196 | TEST_FAIL_CONVERSION(from_utf<Char>(source, l, stop)); |
197 | TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), l, stop)); |
198 | TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), source.c_str() + source.size(), l, stop)); |
199 | test_from_utf_for_impls(source, target, encoding, false); |
200 | } |
201 | |
202 | template<typename Char> |
203 | std::basic_string<Char> utf(const std::string& s) |
204 | { |
205 | return to<Char>(s); |
206 | } |
207 | |
208 | template<> |
209 | std::basic_string<char> utf(const std::string& s) |
210 | { |
211 | return s; |
212 | } |
213 | |
214 | template<typename Char> |
215 | void test_with_0() |
216 | { |
217 | std::cout << "-- Test string containing NULL chars" << std::endl; |
218 | const char with_null[] = "foo\0\0 of\0" ; |
219 | const std::string s_with_null(with_null, sizeof(with_null) - 1); |
220 | const std::basic_string<Char> s_with_null2 = ascii_to<Char>(with_null); |
221 | for(const std::string charset : {"UTF-8" , "ISO8859-1" }) { |
222 | for(const auto impl : all_conv_backends) { |
223 | std::cout << "--- " << charset << " to UTF with Impl " << impl << std::endl; |
224 | auto to_utf = |
225 | boost::locale::conv::detail::make_utf_encoder<Char>(charset, boost::locale::conv::default_method, impl); |
226 | TEST_EQ(to_utf->convert(s_with_null), s_with_null2); |
227 | std::cout << "--- " << charset << " from UTF with Impl " << impl << std::endl; |
228 | auto from_utf = |
229 | boost::locale::conv::detail::make_utf_decoder<Char>(charset, boost::locale::conv::default_method, impl); |
230 | TEST_EQ(from_utf->convert(s_with_null2), s_with_null); |
231 | } |
232 | } |
233 | using boost::locale::conv::utf_to_utf; |
234 | TEST_EQ(utf_to_utf<Char>(s_with_null), s_with_null2); |
235 | TEST_EQ(utf_to_utf<Char>(s_with_null2), s_with_null2); |
236 | TEST_EQ(utf_to_utf<char>(s_with_null2), s_with_null); |
237 | TEST_EQ(utf_to_utf<char>(s_with_null), s_with_null); |
238 | } |
239 | |
/// Provider of UTF test strings for the character type `Char`, selected by its size in bytes:
/// - ok():       the text "grüßen" in the UTF encoding matching the character size
/// - bad():      the same text with invalid code unit(s) inserted
/// - bad_char(): a single invalid code unit
/// The specializations are generic in the character type so that they cover
/// char/char8_t (1 byte), char16_t/2-byte wchar_t and char32_t/4-byte wchar_t alike.
template<typename Char, int n = sizeof(Char)>
struct utfutf;

#ifdef BOOST_MSVC
#    pragma warning(push)
#    pragma warning(disable : 4309) // narrowing static_cast warning
#endif
/// UTF-8 variant for any 1-byte character type
template<typename U8Char>
struct utfutf<U8Char, 1> {
    static const U8Char* ok() { return reinterpret_cast<const U8Char*>("grüßen"); }
    static const U8Char* bad()
    {
        return reinterpret_cast<const U8Char*>("gr\xFF"
                                               "üßen");
        // split into 2 to make SunCC happy
    }
    static U8Char bad_char() { return static_cast<U8Char>(0xFF); }
};

/// UTF-16 variant for any 2-byte character type
template<typename U16Char>
struct utfutf<U16Char, 2> {
    static const U16Char* ok()
    {
        static const U16Char text[] = {0x67, 0x72, 0xfc, 0xdf, 0x65, 0x6e, 0};
        return text;
    }
    static const U16Char* bad()
    {
        static const U16Char text[] = {0x67,
                                       0x72,
                                       static_cast<U16Char>(0xDC01), // second surrogate must not be first
                                       0xfc,
                                       static_cast<U16Char>(0xD801), // first surrogate ...
                                       static_cast<U16Char>(0xD801), // ... must be followed by a surrogate trail
                                       0xdf,
                                       0x65,
                                       0x6e,
                                       0};
        return text;
    }
    static U16Char bad_char() { return static_cast<U16Char>(0xDC01); }
};

/// UTF-32 variant for any 4-byte character type
template<typename U32Char>
struct utfutf<U32Char, 4> {
    static const U32Char* ok()
    {
        static const U32Char text[] = {0x67, 0x72, 0xfc, 0xdf, 0x65, 0x6e, 0};
        return text;
    }
    static const U32Char* bad()
    {
        static const U32Char text[] =
          {0x67, 0x72, static_cast<U32Char>(0x1000000) /* > 10FFFF */, 0xfc, 0xdf, 0x65, 0x6e, 0};
        return text;
    }
    static U32Char bad_char() { return static_cast<U32Char>(0x1000000); }
};
#ifdef BOOST_MSVC
#    pragma warning(pop)
#endif
287 | |
288 | template<typename CharOut, typename CharIn> |
289 | void test_combinations() |
290 | { |
291 | using boost::locale::conv::utf_to_utf; |
292 | typedef utfutf<CharOut> out; |
293 | typedef utfutf<CharIn> in; |
294 | const CharIn* inOk = in::ok(); |
295 | // Both overloads: C-string and string. Both call the range overload |
296 | TEST((utf_to_utf<CharOut>(inOk) == out::ok())); |
297 | TEST((utf_to_utf<CharOut>(std::basic_string<CharIn>(inOk)) == out::ok())); |
298 | const CharIn* inBad = in::bad(); |
299 | // Again both overloads |
300 | TEST_FAIL_CONVERSION((utf_to_utf<CharOut>(inBad, boost::locale::conv::stop))); |
301 | TEST_FAIL_CONVERSION((utf_to_utf<CharOut>(std::basic_string<CharIn>(inBad), boost::locale::conv::stop))); |
302 | TEST((utf_to_utf<CharOut>(in::bad()) == out::ok())); |
303 | } |
304 | |
305 | void test_all_combinations() |
306 | { |
307 | std::cout << "Testing utf_to_utf\n" ; |
308 | std::cout << " char<-char" << std::endl; |
309 | test_combinations<char, char>(); |
310 | std::cout << " char<-wchar" << std::endl; |
311 | test_combinations<char, wchar_t>(); |
312 | std::cout << " wchar<-char" << std::endl; |
313 | test_combinations<wchar_t, char>(); |
314 | std::cout << " wchar<-wchar" << std::endl; |
315 | test_combinations<wchar_t, wchar_t>(); |
316 | } |
317 | |
318 | template<typename Char> |
319 | void test_utf_for() |
320 | { |
321 | using boost::locale::conv::invalid_charset_error; |
322 | |
323 | { |
324 | using boost::locale::conv::to_utf; |
325 | using boost::locale::conv::from_utf; |
326 | TEST_THROWS(to_utf<Char>("Hello" , "invalid-charset" ), invalid_charset_error); |
327 | TEST_THROWS(from_utf<Char>(ascii_to<Char>("Hello" ), "invalid-charset" ), invalid_charset_error); |
328 | } |
329 | |
330 | test_to_from_utf<Char>(to<char>(utf8: "grüßen" ), utf<Char>("grüßen" ), "ISO8859-1" ); |
331 | if(test_iso_8859_8) |
332 | test_to_from_utf<Char>("\xf9\xec\xe5\xed" , utf<Char>("שלום" ), "ISO8859-8" ); |
333 | test_to_from_utf<Char>("grüßen" , utf<Char>("grüßen" ), "UTF-8" ); |
334 | test_to_from_utf<Char>("abc\"\xf0\xa0\x82\x8a\"" , utf<Char>("abc\"\xf0\xa0\x82\x8a\"" ), "UTF-8" ); |
335 | // Testing a codepage which may be an issue on Windows, see issue #121 |
336 | try { |
337 | test_to_from_utf<Char>("\x1b$BE_5(\x1b(B" , utf<Char>("冬季" ), "iso-2022-jp" ); |
338 | } catch(const invalid_charset_error&) { // LCOV_EXCL_LINE |
339 | std::cout << "--- not supported\n" ; // LCOV_EXCL_LINE |
340 | } |
341 | if(!isFaultyIconv()) { |
342 | // Testing a codepage which may crash with IConv on macOS, see issue #196 |
343 | test_to_from_utf<Char>("\xa1\xad\xa1\xad" , utf<Char>("……" ), "gbk" , false); |
344 | // This might cause a bogus E2BIG on macOS, see issue #206 |
345 | test_to_from_utf<Char>("\x1b\x24\x29\x41\x0e\x4a\x35\xf" , utf<Char>("实" ), "ISO-2022-CN" , false); |
346 | } |
347 | |
348 | std::cout << "- Testing correct invalid bytes skipping\n" ; |
349 | { |
350 | std::cout << "-- UTF-8" << std::endl; |
351 | |
352 | std::cout << "--- At start single" << std::endl; |
353 | test_error_to_utf<Char>("\xFFgrüßen" , utf<Char>("grüßen" ), "UTF-8" ); |
354 | std::cout << "--- At start multiple" << std::endl; |
355 | test_error_to_utf<Char>("\xFF\xFFgrüßen" , utf<Char>("grüßen" ), "UTF-8" ); |
356 | |
357 | std::cout << "--- At middle single" << std::endl; |
358 | test_error_to_utf<Char>("g\xFFrüßen" , utf<Char>("grüßen" ), "UTF-8" ); |
359 | std::cout << "--- At middle multiple" << std::endl; |
360 | test_error_to_utf<Char>("g\xFF\xFF\xFFrüßen" , utf<Char>("grüßen" ), "UTF-8" ); |
361 | |
362 | std::cout << "--- At end single" << std::endl; |
363 | test_error_to_utf<Char>("grüßen\xFF" , utf<Char>("grüßen" ), "UTF-8" ); |
364 | std::cout << "--- At end multiple" << std::endl; |
365 | test_error_to_utf<Char>("grüßen\xFF\xFF" , utf<Char>("grüßen" ), "UTF-8" ); |
366 | |
367 | try { |
368 | std::cout << "-- ISO-8859-8" << std::endl; |
369 | test_error_to_utf<Char>("\xFB" , utf<Char>("" ), "ISO-8859-8" ); |
370 | test_error_to_utf<Char>("\xFB-" , utf<Char>("-" ), "ISO-8859-8" ); |
371 | test_error_to_utf<Char>("test \xE0\xE1\xFB" , utf<Char>("test \xd7\x90\xd7\x91" ), "ISO-8859-8" ); |
372 | test_error_to_utf<Char>("test \xE0\xE1\xFB-" , utf<Char>("test \xd7\x90\xd7\x91-" ), "ISO-8859-8" ); |
373 | } catch(const invalid_charset_error&) { // LCOV_EXCL_LINE |
374 | std::cout << "--- not supported\n" ; // LCOV_EXCL_LINE |
375 | } |
376 | try { |
377 | std::cout << "-- cp932" << std::endl; |
378 | test_error_to_utf<Char>("\x83\xF8" , utf<Char>("" ), "cp932" ); |
379 | test_error_to_utf<Char>("\x83\xF8-" , utf<Char>("-" ), "cp932" ); |
380 | test_error_to_utf<Char>("test\xE0\xA0 \x83\xF8" , utf<Char>("test\xe7\x87\xbf " ), "cp932" ); |
381 | test_error_to_utf<Char>("test\xE0\xA0 \x83\xF8-" , utf<Char>("test\xe7\x87\xbf -" ), "cp932" ); |
382 | } catch(const invalid_charset_error&) { // LCOV_EXCL_LINE |
383 | std::cout << "--- not supported\n" ; // LCOV_EXCL_LINE |
384 | } |
385 | std::cout << "-- Error for encoding at start" << std::endl; |
386 | test_error_from_utf<Char>(utf<Char>("שלום hello" ), " hello" , "ISO8859-1" ); |
387 | std::cout << "-- Error for encoding at middle and end" << std::endl; |
388 | test_error_from_utf<Char>(utf<Char>("hello שלום world" ), "hello world" , "ISO8859-1" ); |
389 | std::cout << "-- Error for encoding at end" << std::endl; |
390 | test_error_from_utf<Char>(utf<Char>("hello שלום" ), "hello " , "ISO8859-1" ); |
391 | std::cout << "-- Error for decoding to UTF-8" << std::endl; |
392 | test_error_from_utf<Char>(utfutf<Char>::bad(), utfutf<char>::ok(), "UTF-8" ); |
393 | std::cout << "-- Error for decoding to Latin1" << std::endl; |
394 | test_error_from_utf<Char>(utfutf<Char>::bad(), to<char>(utf8: utfutf<char>::ok()), "Latin1" ); |
395 | |
396 | const std::basic_string<Char> onlyInvalidUtf(2, utfutf<Char>::bad_char()); |
397 | std::cout << "-- Error decoding string of only invalid chars to UTF-8" << std::endl; |
398 | test_error_from_utf<Char>(onlyInvalidUtf, "" , "UTF-8" ); |
399 | std::cout << "-- Error decoding string of only invalid chars to Latin1" << std::endl; |
400 | test_error_from_utf<Char>(onlyInvalidUtf, "" , "Latin1" ); |
401 | } |
402 | |
403 | test_with_0<Char>(); |
404 | } |
405 | |
406 | template<typename Char1, typename Char2> |
407 | void test_utf_to_utf_for(const std::string& utf8_string) |
408 | { |
409 | const auto utf_string1 = utf<Char1>(utf8_string); |
410 | const auto utf_string2 = utf<Char2>(utf8_string); |
411 | using boost::locale::conv::utf_to_utf; |
412 | TEST_EQ(utf_to_utf<Char1>(utf_string2), utf_string1); |
413 | TEST_EQ(utf_to_utf<Char2>(utf_string1), utf_string2); |
414 | TEST_EQ(utf_to_utf<Char1>(utf_string1), utf_string1); |
415 | TEST_EQ(utf_to_utf<Char2>(utf_string2), utf_string2); |
416 | } |
417 | |
418 | template<typename Char> |
419 | void test_utf_to_utf_for() |
420 | { |
421 | const std::string& utf8_string = "A-Za-z0-9grüße'\xf0\xa0\x82\x8a'\xf4\x8f\xbf\xbf" ; |
422 | std::cout << "---- char\n" ; |
423 | test_utf_to_utf_for<Char, char>(utf8_string); |
424 | test_to_utf_for_impls(utf8_string, utf<Char>(utf8_string), "UTF-8" ); |
425 | test_from_utf_for_impls(utf<Char>(utf8_string), utf8_string, "UTF-8" ); |
426 | std::cout << "---- wchar_t\n" ; |
427 | test_utf_to_utf_for<Char, wchar_t>(utf8_string); |
428 | #ifndef BOOST_LOCALE_NO_CXX20_STRING8 |
429 | std::cout << "---- char8_t\n" ; |
430 | test_utf_to_utf_for<Char, char8_t>(utf8_string); |
431 | #endif |
432 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
433 | std::cout << "---- char16_t\n" ; |
434 | test_utf_to_utf_for<Char, char16_t>(utf8_string); |
435 | #endif |
436 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
437 | std::cout << "---- char32_t\n" ; |
438 | test_utf_to_utf_for<Char, char32_t>(utf8_string); |
439 | #endif |
440 | } |
441 | |
442 | void test_utf_to_utf() |
443 | { |
444 | std::cout << "- Testing UTF to UTF conversion\n" ; |
445 | std::cout << "-- char\n" ; |
446 | test_utf_to_utf_for<char>(); |
447 | std::cout << "-- wchar_t\n" ; |
448 | test_utf_to_utf_for<wchar_t>(); |
449 | #ifndef BOOST_LOCALE_NO_CXX20_STRING8 |
450 | std::cout << "-- char8_t\n" ; |
451 | test_utf_to_utf_for<char8_t>(); |
452 | #endif |
453 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
454 | std::cout << "-- char16_t\n" ; |
455 | test_utf_to_utf_for<char16_t>(); |
456 | #endif |
457 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
458 | std::cout << "-- char32_t\n" ; |
459 | test_utf_to_utf_for<char32_t>(); |
460 | #endif |
461 | } |
462 | |
463 | /// Test all overloads of to_utf/from_utf templated by Char |
464 | template<typename Char> |
465 | void test_latin1_conversions_for() |
466 | { |
467 | const std::string utf8_string = "A-Za-z0-9grüße" ; |
468 | const std::string sLatin1 = to<char>(utf8: utf8_string); |
469 | // Sanity check that utf8_string is UTF-8 encoded (using multiple bytes for the special chars) |
470 | // and sLatin1 is not encoded (1 byte per char) |
471 | TEST_GT(utf8_string.length(), sLatin1.length()); |
472 | const std::basic_string<Char> sWide = utf<Char>(utf8_string); |
473 | const std::string encoding = "Latin1" ; |
474 | |
475 | using boost::locale::conv::to_utf; |
476 | using boost::locale::conv::utf_encoder; |
477 | // 3 variants for source: string, C-string, range |
478 | TEST_EQ(to_utf<Char>(sLatin1, encoding), sWide); |
479 | TEST_EQ(to_utf<Char>(sLatin1.c_str(), encoding), sWide); |
480 | TEST_EQ(to_utf<Char>(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), encoding), sWide); |
481 | TEST_EQ(utf_encoder<Char>(encoding)(sLatin1), sWide); |
482 | TEST_EQ(utf_encoder<Char>(encoding).convert(sLatin1), sWide); |
483 | TEST_EQ(utf_encoder<Char>(encoding).convert(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size()), sWide); |
484 | // Same but encoding given via locale |
485 | const std::locale l = boost::locale::generator{}("en_US.Latin1" ); |
486 | TEST_EQ(to_utf<Char>(sLatin1, l), sWide); |
487 | TEST_EQ(to_utf<Char>(sLatin1.c_str(), l), sWide); |
488 | TEST_EQ(to_utf<Char>(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), l), sWide); |
489 | |
490 | using boost::locale::conv::from_utf; |
491 | using boost::locale::conv::utf_decoder; |
492 | // 3 variants for source: string, C-string, range |
493 | TEST_EQ(from_utf<Char>(sWide, encoding), sLatin1); |
494 | TEST_EQ(from_utf<Char>(sWide.c_str(), encoding), sLatin1); |
495 | TEST_EQ(from_utf<Char>(sWide.c_str(), sWide.c_str() + sWide.size(), encoding), sLatin1); |
496 | TEST_EQ(utf_decoder<Char>(encoding)(sWide), sLatin1); |
497 | TEST_EQ(utf_decoder<Char>(encoding).convert(sWide), sLatin1); |
498 | TEST_EQ(utf_decoder<Char>(encoding).convert(sWide.c_str(), sWide.c_str() + sWide.size()), sLatin1); |
499 | // Same but encoding given via locale |
500 | TEST_EQ(from_utf<Char>(sWide, l), sLatin1); |
501 | TEST_EQ(from_utf<Char>(sWide.c_str(), l), sLatin1); |
502 | TEST_EQ(from_utf<Char>(sWide.c_str(), sWide.c_str() + sWide.size(), l), sLatin1); |
503 | |
504 | // Empty string doesn't error/assert |
505 | TEST_EQ(to_utf<Char>("" , encoding), utf<Char>("" )); |
506 | TEST_EQ(from_utf<Char>(utf<Char>("" ), encoding), std::string()); |
507 | test_to_utf_for_impls("" , utf<Char>("" ), encoding); |
508 | test_from_utf_for_impls(utf<Char>("" ), "" , encoding); |
509 | } |
510 | |
511 | /// Quick check of to_utf/from_utf overloads using the simple Latin1 encoding |
512 | void test_latin1_conversions() |
513 | { |
514 | std::cout << "- Testing Latin1 conversion\n" ; |
515 | std::cout << "-- char\n" ; |
516 | test_latin1_conversions_for<char>(); |
517 | std::cout << "-- wchar_t\n" ; |
518 | test_latin1_conversions_for<wchar_t>(); |
519 | #ifndef BOOST_LOCALE_NO_CXX20_STRING8 |
520 | std::cout << "-- char8_t\n" ; |
521 | test_latin1_conversions_for<char8_t>(); |
522 | #endif |
523 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
524 | std::cout << "-- char16_t\n" ; |
525 | test_latin1_conversions_for<char16_t>(); |
526 | #endif |
527 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
528 | std::cout << "-- char32_t\n" ; |
529 | test_latin1_conversions_for<char32_t>(); |
530 | #endif |
531 | } |
532 | |
533 | void test_between_for_impls(const std::string& source, |
534 | const std::string& target, |
535 | const std::string& to_encoding, |
536 | const std::string& from_encoding, |
537 | const bool expectSuccess = true) |
538 | { |
539 | boost::locale::conv::narrow_converter conv(from_encoding, to_encoding); |
540 | TEST_EQ(conv(source), target); |
541 | for(const auto impl : all_conv_backends) { |
542 | using boost::locale::conv::detail::make_narrow_converter; |
543 | std::cout << "----- " << impl << '\n'; |
544 | using boost::locale::conv::invalid_charset_error; |
545 | try { |
546 | auto convPtr = make_narrow_converter(src_encoding: from_encoding, target_encoding: to_encoding, how: boost::locale::conv::skip, impl); |
547 | TEST_EQ(convPtr->convert(source), target); |
548 | } catch(invalid_charset_error&) { |
549 | continue; // LCOV_EXCL_LINE |
550 | } |
551 | if(!expectSuccess) { |
552 | auto convPtr = make_narrow_converter(src_encoding: from_encoding, target_encoding: to_encoding, how: boost::locale::conv::stop, impl); |
553 | TEST_FAIL_CONVERSION(convPtr->convert(source)); |
554 | } |
555 | } |
556 | if(to_encoding == "UTF-8" && from_encoding == "UTF-8" ) { |
557 | using boost::locale::conv::utf_to_utf; |
558 | TEST_EQ(utf_to_utf<char>(source), target); |
559 | if(!expectSuccess) |
560 | TEST_FAIL_CONVERSION(utf_to_utf<char>(source, boost::locale::conv::stop)); |
561 | } |
562 | } |
563 | |
564 | void test_error_between(const std::string& source, |
565 | const std::string& target, |
566 | const std::string& to_encoding, |
567 | const std::string& from_encoding) |
568 | { |
569 | using boost::locale::conv::between; |
570 | TEST_EQ(between(source, to_encoding, from_encoding), target); |
571 | using boost::locale::conv::stop; |
572 | TEST_FAIL_CONVERSION(between(source, to_encoding, from_encoding, stop)); |
573 | TEST_FAIL_CONVERSION(between(source.c_str(), to_encoding, from_encoding, stop)); |
574 | TEST_FAIL_CONVERSION(between(source.c_str(), source.c_str() + source.size(), to_encoding, from_encoding, stop)); |
575 | test_between_for_impls(source, target, to_encoding, from_encoding, expectSuccess: false); |
576 | } |
577 | |
578 | void test_between() |
579 | { |
580 | using boost::locale::conv::between; |
581 | const std::string utf8_string = "A-Za-z0-9grüße" ; |
582 | const std::string sLatin1 = to<char>(utf8: utf8_string); |
583 | TEST_GT(utf8_string.length(), sLatin1.length()); // Assert UTF encoding -> multi byte |
584 | TEST_EQ(between(sLatin1, "UTF-8" , "Latin1" ), utf8_string); |
585 | TEST_EQ(between(sLatin1.c_str(), "UTF-8" , "Latin1" ), utf8_string); |
586 | TEST_EQ(between(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), "UTF-8" , "Latin1" ), utf8_string); |
587 | test_between_for_impls(source: sLatin1, target: utf8_string, to_encoding: "UTF-8" , from_encoding: "Latin1" ); |
588 | TEST_EQ(between(utf8_string, "Latin1" , "UTF-8" ), sLatin1); |
589 | TEST_EQ(between(utf8_string.c_str(), "Latin1" , "UTF-8" ), sLatin1); |
590 | TEST_EQ(between(utf8_string.c_str(), utf8_string.c_str() + utf8_string.size(), "Latin1" , "UTF-8" ), sLatin1); |
591 | test_between_for_impls(source: utf8_string, target: sLatin1, to_encoding: "Latin1" , from_encoding: "UTF-8" ); |
592 | // Same encoding |
593 | TEST_EQ(between(utf8_string, "UTF-8" , "UTF-8" ), utf8_string); |
594 | test_between_for_impls(source: utf8_string, target: utf8_string, to_encoding: "UTF-8" , from_encoding: "UTF-8" ); |
595 | TEST_EQ(between(sLatin1, "Latin1" , "Latin1" ), sLatin1); |
596 | test_between_for_impls(source: sLatin1, target: sLatin1, to_encoding: "Latin1" , from_encoding: "Latin1" ); |
597 | // Wrong encoding throws |
598 | { |
599 | using boost::locale::conv::invalid_charset_error; |
600 | TEST_THROWS(between(sLatin1, "Invalid-Encoding" , "Latin1" ), invalid_charset_error); |
601 | TEST_THROWS(between(sLatin1, "UTF-8" , "Invalid-Encoding" ), invalid_charset_error); |
602 | TEST_THROWS(between(sLatin1, "Invalid-Encoding" , "Invalid-Encoding" ), invalid_charset_error); |
603 | for(const auto impl : all_conv_backends) { |
604 | std::cout << "----- " << impl << '\n'; |
605 | using boost::locale::conv::invalid_charset_error; |
606 | using boost::locale::conv::skip; |
607 | using boost::locale::conv::detail::make_narrow_converter; |
608 | TEST_THROWS(make_narrow_converter("Invalid-Encoding" , "Latin1" , skip, impl), invalid_charset_error); |
609 | TEST_THROWS(make_narrow_converter("UTF-8" , "Invalid-Encoding" , skip, impl), invalid_charset_error); |
610 | TEST_THROWS(make_narrow_converter("Invalid-Encoding" , "Invalid-Encoding" , skip, impl), |
611 | invalid_charset_error); |
612 | } |
613 | } |
614 | // Error handling |
615 | // Unencodable char at start, middle, end |
616 | test_error_between(source: "שלום hello" , target: " hello" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
617 | test_error_between(source: "hello שלום world" , target: "hello world" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
618 | test_error_between(source: "hello שלום" , target: "hello " , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
619 | // Undecodable char(s) at start, middle, end |
620 | test_error_between(source: "\xFFxfoo" , target: "xfoo" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
621 | test_error_between(source: "\xFF\xFFyfoo" , target: "yfoo" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
622 | test_error_between(source: "f\xFFoo2" , target: "foo2" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
623 | test_error_between(source: "f\xFF\xFF\xFFoo3" , target: "foo3" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
624 | test_error_between(source: "foo4\xFF" , target: "foo4" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
625 | test_error_between(source: "foo5\xFF\xFF" , target: "foo5" , to_encoding: "ISO8859-1" , from_encoding: "UTF-8" ); |
626 | // Same but UTF-8 to UTF-8 |
627 | test_error_between(source: "\xFFzfoo" , target: "zfoo" , to_encoding: "UTF-8" , from_encoding: "UTF-8" ); |
628 | test_error_between(source: "f\xFFoo6" , target: "foo6" , to_encoding: "UTF-8" , from_encoding: "UTF-8" ); |
629 | test_error_between(source: "f\xFF\xFF\xFFoo7" , target: "foo7" , to_encoding: "UTF-8" , from_encoding: "UTF-8" ); |
630 | } |
631 | |
632 | void test_utf_name(); |
633 | void test_simple_encodings(); |
634 | void test_win_codepages(); |
635 | |
636 | void test_main(int /*argc*/, char** /*argv*/) |
637 | { |
638 | // Sanity check to<char> |
639 | TEST_EQ(to<char>("grüßen" ), |
640 | "gr\xFC\xDF" |
641 | "en" ); |
642 | TEST_THROWS(to<char>("€" ), std::logic_error); |
643 | // Sanity check internal details |
644 | test_utf_name(); |
645 | test_simple_encodings(); |
646 | test_win_codepages(); |
647 | |
648 | test_latin1_conversions(); |
649 | test_utf_to_utf(); |
650 | |
651 | std::cout << "Testing charset to/from UTF conversion functions\n" ; |
652 | std::cout << " char" << std::endl; |
653 | test_utf_for<char>(); |
654 | std::cout << " wchar_t" << std::endl; |
655 | test_utf_for<wchar_t>(); |
656 | #ifndef BOOST_LOCALE_NO_CXX20_STRING8 |
657 | std::cout << " char8_t" << std::endl; |
658 | test_utf_for<char8_t>(); |
659 | #endif |
660 | #ifdef BOOST_LOCALE_ENABLE_CHAR16_T |
661 | std::cout << " char16_t" << std::endl; |
662 | test_utf_for<char16_t>(); |
663 | #endif |
664 | #ifdef BOOST_LOCALE_ENABLE_CHAR32_T |
665 | std::cout << " char32_t" << std::endl; |
666 | test_utf_for<char32_t>(); |
667 | #endif |
668 | |
669 | test_all_combinations(); |
670 | test_between(); |
671 | } |
672 | |
673 | // Internal tests, keep those out of the above scope |
674 | |
/// Return true if the least significant byte of a multi-byte integer is stored first.
/// Uses the compiler provided byte order macros where available and
/// inspects the first byte of an `int` with value 1 otherwise.
bool isLittleEndian()
{
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
    return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
#elif defined(__LITTLE_ENDIAN__)
    return true;
#elif defined(__BIG_ENDIAN__)
    return false;
#else
    // Fallback in its own branch so there is no unreachable code after a `return` above
    const int endianMark = 1;
    return reinterpret_cast<const char*>(&endianMark)[0] == 1;
#endif
}
687 | |
688 | #include "../src/boost/locale/util/encoding.hpp" |
689 | #include "../src/boost/locale/util/win_codepages.hpp" |
690 | |
691 | void test_utf_name() |
692 | { |
693 | TEST_EQ(boost::locale::util::utf_name<char>(), std::string("UTF-8" )); |
694 | #ifdef __cpp_char8_t |
695 | TEST_EQ(boost::locale::util::utf_name<char8_t>(), std::string("UTF-8" )); |
696 | #endif |
697 | TEST_EQ(boost::locale::util::utf_name<char16_t>(), std::string(isLittleEndian() ? "UTF-16LE" : "UTF-16BE" )); |
698 | TEST_EQ(boost::locale::util::utf_name<char32_t>(), std::string(isLittleEndian() ? "UTF-32LE" : "UTF-32BE" )); |
699 | } |
700 | |
701 | void test_simple_encodings() |
702 | { |
703 | using namespace boost::locale::util; |
704 | const auto encodings = get_simple_encodings(); |
705 | for(auto it = encodings.begin(), end = encodings.end(); it != end; ++it) { |
706 | TEST_EQ(normalize_encoding(*it), *it); // Must be normalized |
707 | const auto it2 = std::find(first: it + 1, last: end, val: *it); |
708 | TEST(it2 == end); |
709 | if(it2 != end) |
710 | std::cerr << "Duplicate entry: " << *it << '\n'; // LCOV_EXCL_LINE |
711 | } |
712 | const auto it = std::is_sorted_until(first: encodings.begin(), last: encodings.end()); |
713 | TEST(it == encodings.end()); |
714 | if(it != encodings.end()) |
715 | std::cerr << "First wrongly sorted element: " << *it << '\n'; // LCOV_EXCL_LINE |
716 | } |
717 | |
718 | void test_win_codepages() |
719 | { |
720 | using namespace boost::locale::util; |
721 | |
722 | for(const windows_encoding *it = all_windows_encodings, *end = std::end(arr&: all_windows_encodings); it != end; ++it) { |
723 | TEST_EQ(normalize_encoding(it->name), it->name); // Must be normalized |
724 | auto is_same_win_codepage = [&it](const windows_encoding& rhs) -> bool { |
725 | return it->codepage == rhs.codepage && std::strcmp(s1: it->name, s2: rhs.name) == 0; |
726 | }; |
727 | const auto* it2 = std::find_if(first: it + 1, last: end, pred: is_same_win_codepage); |
728 | TEST(it2 == end); |
729 | if(it2 != end) |
730 | std::cerr << "Duplicate entry: " << it->name << ':' << it->codepage << '\n'; // LCOV_EXCL_LINE |
731 | } |
732 | const auto cmp = [](const windows_encoding& rhs, const windows_encoding& lhs) -> bool { return rhs < lhs.name; }; |
733 | const auto* it = std::is_sorted_until(first: all_windows_encodings, last: std::end(arr&: all_windows_encodings), comp: cmp); |
734 | TEST(it == std::end(all_windows_encodings)); |
735 | if(it != std::end(arr&: all_windows_encodings)) |
736 | std::cerr << "First wrongly sorted element: " << it->name << '\n'; // LCOV_EXCL_LINE |
737 | } |
738 | |
739 | // boostinspect:noascii |
740 | |