1//
2// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3// Copyright (c) 2022-2023 Alexander Grund
4//
5// Distributed under the Boost Software License, Version 1.0.
6// https://www.boost.org/LICENSE_1_0.txt
7
8#include <boost/locale/encoding.hpp>
9#include <boost/locale/generator.hpp>
10#include <algorithm>
11
12#include "boostLocale/test/tools.hpp"
13#include "boostLocale/test/unit_test.hpp"
14
15const bool test_iso_8859_8 =
16#if defined(BOOST_LOCALE_WITH_ICU) || defined(BOOST_LOCALE_WITH_ICONV)
17 true;
18#else
19 hasWinCodepage(28598);
20#endif
21
#if defined(BOOST_LOCALE_WITH_ICONV)
/// Reproduce issue #206 to detect faulty IConv.
///
/// Converts a sample string to ISO-2022-CN with the IConv backend.
/// @return true only if that conversion throws a std::runtime_error whose message
///         contains "IConv is faulty"; false otherwise.
static bool isFaultyIconv()
{
    namespace blc = boost::locale::conv;
    auto from_utf = blc::detail::make_utf_decoder<char>("ISO-2022-CN", blc::skip, blc::detail::conv_backend::IConv);
    try {
        from_utf->convert("实");
    } catch(const std::runtime_error& e) {                                         // LCOV_EXCL_LINE
        return std::string(e.what()).find("IConv is faulty") != std::string::npos; // LCOV_EXCL_LINE
    }
    return false;
}
#else
/// Built without IConv: there is nothing to probe, so never report a fault.
constexpr bool isFaultyIconv()
{
    return false;
}
#endif
41
42constexpr boost::locale::conv::detail::conv_backend all_conv_backends[] = {
43#ifdef BOOST_LOCALE_WITH_ICONV
44 boost::locale::conv::detail::conv_backend::IConv,
45#endif
46#ifdef BOOST_LOCALE_WITH_ICU
47 boost::locale::conv::detail::conv_backend::ICU,
48#endif
49#if BOOST_LOCALE_USE_WIN32_API
50 boost::locale::conv::detail::conv_backend::WinAPI,
51#endif
52};
53
54std::ostream& operator<<(std::ostream& s, boost::locale::conv::detail::conv_backend impl)
55{
56 using boost::locale::conv::detail::conv_backend;
57 switch(impl) {
58 case conv_backend::Default: return s << "[Default]"; // LCOV_EXCL_LINE
59 case conv_backend::IConv: return s << "[IConv]";
60 case conv_backend::ICU: return s << "[ICU]";
61 case conv_backend::WinAPI: return s << "[WinAPI]";
62 }
63 return s; // LCOV_EXCL_LINE
64}
65
// Assert that evaluating X throws boost::locale::conv::conversion_error
#define TEST_FAIL_CONVERSION(X) TEST_THROWS(X, boost::locale::conv::conversion_error)
67
68template<typename Char>
69void test_to_utf_for_impls(const std::string& source,
70 const std::basic_string<Char>& target,
71 const std::string& encoding,
72 const bool expectSuccess = true,
73 const bool test_default = true)
74{
75 if(test_default) {
76 boost::locale::conv::utf_encoder<Char> conv(encoding);
77 TEST_EQ(conv(source), target);
78 }
79 for(const auto impl : all_conv_backends) {
80 std::cout << "----- " << impl << '\n';
81 using boost::locale::conv::invalid_charset_error;
82 try {
83 auto convPtr =
84 boost::locale::conv::detail::make_utf_encoder<Char>(encoding, boost::locale::conv::skip, impl);
85 TEST_EQ(convPtr->convert(source), target);
86 } catch(invalid_charset_error&) {
87 std::cout << "--- Charset not supported\n"; // LCOV_EXCL_LINE
88 continue; // LCOV_EXCL_LINE
89 }
90 if(!expectSuccess) {
91 auto convPtr =
92 boost::locale::conv::detail::make_utf_encoder<Char>(encoding, boost::locale::conv::stop, impl);
93 TEST_FAIL_CONVERSION(convPtr->convert(source));
94 }
95 }
96 if(encoding == "UTF-8") {
97 using boost::locale::conv::utf_to_utf;
98 TEST_EQ(utf_to_utf<Char>(source), target);
99 if(expectSuccess)
100 TEST_EQ(utf_to_utf<char>(source), source);
101 else
102 TEST_FAIL_CONVERSION(utf_to_utf<Char>(source, boost::locale::conv::stop));
103 }
104}
105
106template<typename Char>
107void test_from_utf_for_impls(const std::basic_string<Char>& source,
108 const std::string& target,
109 const std::string& encoding,
110 const bool expectSuccess = true,
111 const bool test_default = true)
112{
113 if(test_default) {
114 boost::locale::conv::utf_decoder<Char> conv(encoding);
115 TEST_EQ(conv(source), target);
116 }
117 for(const auto impl : all_conv_backends) {
118 std::cout << "----- " << impl << '\n';
119 using boost::locale::conv::invalid_charset_error;
120 try {
121 auto convPtr =
122 boost::locale::conv::detail::make_utf_decoder<Char>(encoding, boost::locale::conv::skip, impl);
123 TEST_EQ(convPtr->convert(source), target);
124 } catch(invalid_charset_error&) {
125 std::cout << "--- Charset not supported\n"; // LCOV_EXCL_LINE
126 continue; // LCOV_EXCL_LINE
127 }
128 if(!expectSuccess) {
129 auto convPtr =
130 boost::locale::conv::detail::make_utf_decoder<Char>(encoding, boost::locale::conv::stop, impl);
131 TEST_FAIL_CONVERSION(convPtr->convert(source));
132 }
133 }
134 if(encoding == "UTF-8") {
135 using boost::locale::conv::utf_to_utf;
136 TEST_EQ(utf_to_utf<char>(source), target);
137 if(expectSuccess)
138 TEST_EQ(utf_to_utf<Char>(source), source);
139 else
140 TEST_FAIL_CONVERSION(utf_to_utf<char>(source, boost::locale::conv::stop));
141 }
142}
143
144template<typename Char>
145void test_to_from_utf(const std::string& source,
146 const std::basic_string<Char>& target,
147 const std::string& encoding,
148 const bool test_default = true)
149{
150 std::cout << "-- " << encoding << std::endl;
151
152 if(test_default) {
153 TEST_EQ(boost::locale::conv::to_utf<Char>(source, encoding), target);
154 TEST_EQ(boost::locale::conv::from_utf<Char>(target, encoding), source);
155 }
156 test_to_utf_for_impls(source, target, encoding, true, test_default);
157 test_from_utf_for_impls(target, source, encoding, true, test_default);
158}
159
160template<typename Char>
161void test_error_to_utf(const std::string& source, const std::basic_string<Char>& target, const std::string& encoding)
162{
163 using boost::locale::conv::to_utf;
164 using boost::locale::conv::stop;
165
166 // Default: Replace, no error
167 TEST_EQ(to_utf<Char>(source, encoding), target);
168 // Test all overloads with method=stop -> error
169 // source as string, C-String, range
170 TEST_FAIL_CONVERSION(to_utf<Char>(source, encoding, stop));
171 TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), encoding, stop));
172 TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), source.c_str() + source.size(), encoding, stop));
173 // Same but encoding via locale
174 const std::locale l = boost::locale::generator{}("en_US." + encoding);
175 TEST_FAIL_CONVERSION(to_utf<Char>(source, l, stop));
176 TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), l, stop));
177 TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), source.c_str() + source.size(), l, stop));
178 test_to_utf_for_impls(source, target, encoding, false);
179}
180
181template<typename Char>
182void test_error_from_utf(const std::basic_string<Char>& source, const std::string& target, const std::string& encoding)
183{
184 using boost::locale::conv::from_utf;
185 using boost::locale::conv::stop;
186
187 // Default: Replace, no error
188 TEST_EQ(from_utf<Char>(source, encoding), target);
189 // Test all overloads with method=stop -> error
190 // source as string, C-String, range
191 TEST_FAIL_CONVERSION(from_utf<Char>(source, encoding, stop));
192 TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), encoding, stop));
193 TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), source.c_str() + source.size(), encoding, stop));
194 // Same but encoding via locale
195 const std::locale l = boost::locale::generator{}("en_US." + encoding);
196 TEST_FAIL_CONVERSION(from_utf<Char>(source, l, stop));
197 TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), l, stop));
198 TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), source.c_str() + source.size(), l, stop));
199 test_from_utf_for_impls(source, target, encoding, false);
200}
201
202template<typename Char>
203std::basic_string<Char> utf(const std::string& s)
204{
205 return to<Char>(s);
206}
207
208template<>
209std::basic_string<char> utf(const std::string& s)
210{
211 return s;
212}
213
214template<typename Char>
215void test_with_0()
216{
217 std::cout << "-- Test string containing NULL chars" << std::endl;
218 const char with_null[] = "foo\0\0 of\0";
219 const std::string s_with_null(with_null, sizeof(with_null) - 1);
220 const std::basic_string<Char> s_with_null2 = ascii_to<Char>(with_null);
221 for(const std::string charset : {"UTF-8", "ISO8859-1"}) {
222 for(const auto impl : all_conv_backends) {
223 std::cout << "--- " << charset << " to UTF with Impl " << impl << std::endl;
224 auto to_utf =
225 boost::locale::conv::detail::make_utf_encoder<Char>(charset, boost::locale::conv::default_method, impl);
226 TEST_EQ(to_utf->convert(s_with_null), s_with_null2);
227 std::cout << "--- " << charset << " from UTF with Impl " << impl << std::endl;
228 auto from_utf =
229 boost::locale::conv::detail::make_utf_decoder<Char>(charset, boost::locale::conv::default_method, impl);
230 TEST_EQ(from_utf->convert(s_with_null2), s_with_null);
231 }
232 }
233 using boost::locale::conv::utf_to_utf;
234 TEST_EQ(utf_to_utf<Char>(s_with_null), s_with_null2);
235 TEST_EQ(utf_to_utf<Char>(s_with_null2), s_with_null2);
236 TEST_EQ(utf_to_utf<char>(s_with_null2), s_with_null);
237 TEST_EQ(utf_to_utf<char>(s_with_null), s_with_null);
238}
239
/// Sample strings for the utf_to_utf tests, selected by the size of the character type:
/// ok() is the reference text, bad() is that text with invalid code units inserted
/// and bad_char() is a single invalid code unit.
template<typename Char, int n = sizeof(Char)>
struct utfutf;

#ifdef BOOST_MSVC
#    pragma warning(push)
#    pragma warning(disable : 4309) // narrowing static_cast warning
#endif
/// Variant for character types with 1-byte code units
template<typename U8Char>
struct utfutf<U8Char, 1> {
    static const U8Char* ok()
    {
        const char* const text = "grüßen";
        return reinterpret_cast<const U8Char*>(text);
    }
    static const U8Char* bad()
    {
        // split into 2 to make SunCC happy
        const char* const text = "gr\xFF"
                                 "üßen";
        return reinterpret_cast<const U8Char*>(text);
    }
    static U8Char bad_char() { return static_cast<U8Char>(0xFF); }
};

/// Variant for a wchar_t with 2-byte code units
template<>
struct utfutf<wchar_t, 2> {
    static const wchar_t* ok() { return L"\x67\x72\xfc\xdf\x65\x6e"; }
    static const wchar_t* bad()
    {
        // Positions 2, 4 and 5 of the literal are placeholders replaced below
        static wchar_t units[256] = L"\x67\x72\xFF\xfc\xFE\xFD\xdf\x65\x6e";
        units[2] = 0xDC01; // second surrogate must not be
        units[4] = 0xD801; // First
        units[5] = 0xD801; // Must be surrogate trail
        return units;
    }
    static wchar_t bad_char() { return static_cast<wchar_t>(0xDC01); }
};

/// Variant for a wchar_t with 4-byte code units
template<>
struct utfutf<wchar_t, 4> {
    static const wchar_t* ok() { return L"\x67\x72\xfc\xdf\x65\x6e"; }
    static const wchar_t* bad()
    {
        // Position 2 of the literal is a placeholder replaced below
        static wchar_t units[256] = L"\x67\x72\xFF\xfc\xdf\x65\x6e";
        units[2] = static_cast<wchar_t>(0x1000000); // > 10FFFF
        return units;
    }
    static wchar_t bad_char() { return static_cast<wchar_t>(0x1000000); }
};
#ifdef BOOST_MSVC
#    pragma warning(pop)
#endif
287
288template<typename CharOut, typename CharIn>
289void test_combinations()
290{
291 using boost::locale::conv::utf_to_utf;
292 typedef utfutf<CharOut> out;
293 typedef utfutf<CharIn> in;
294 const CharIn* inOk = in::ok();
295 // Both overloads: C-string and string. Both call the range overload
296 TEST((utf_to_utf<CharOut>(inOk) == out::ok()));
297 TEST((utf_to_utf<CharOut>(std::basic_string<CharIn>(inOk)) == out::ok()));
298 const CharIn* inBad = in::bad();
299 // Again both overloads
300 TEST_FAIL_CONVERSION((utf_to_utf<CharOut>(inBad, boost::locale::conv::stop)));
301 TEST_FAIL_CONVERSION((utf_to_utf<CharOut>(std::basic_string<CharIn>(inBad), boost::locale::conv::stop)));
302 TEST((utf_to_utf<CharOut>(in::bad()) == out::ok()));
303}
304
305void test_all_combinations()
306{
307 std::cout << "Testing utf_to_utf\n";
308 std::cout << " char<-char" << std::endl;
309 test_combinations<char, char>();
310 std::cout << " char<-wchar" << std::endl;
311 test_combinations<char, wchar_t>();
312 std::cout << " wchar<-char" << std::endl;
313 test_combinations<wchar_t, char>();
314 std::cout << " wchar<-wchar" << std::endl;
315 test_combinations<wchar_t, wchar_t>();
316}
317
318template<typename Char>
319void test_utf_for()
320{
321 using boost::locale::conv::invalid_charset_error;
322
323 {
324 using boost::locale::conv::to_utf;
325 using boost::locale::conv::from_utf;
326 TEST_THROWS(to_utf<Char>("Hello", "invalid-charset"), invalid_charset_error);
327 TEST_THROWS(from_utf<Char>(ascii_to<Char>("Hello"), "invalid-charset"), invalid_charset_error);
328 }
329
330 test_to_from_utf<Char>(to<char>(utf8: "grüßen"), utf<Char>("grüßen"), "ISO8859-1");
331 if(test_iso_8859_8)
332 test_to_from_utf<Char>("\xf9\xec\xe5\xed", utf<Char>("שלום"), "ISO8859-8");
333 test_to_from_utf<Char>("grüßen", utf<Char>("grüßen"), "UTF-8");
334 test_to_from_utf<Char>("abc\"\xf0\xa0\x82\x8a\"", utf<Char>("abc\"\xf0\xa0\x82\x8a\""), "UTF-8");
335 // Testing a codepage which may be an issue on Windows, see issue #121
336 try {
337 test_to_from_utf<Char>("\x1b$BE_5(\x1b(B", utf<Char>("冬季"), "iso-2022-jp");
338 } catch(const invalid_charset_error&) { // LCOV_EXCL_LINE
339 std::cout << "--- not supported\n"; // LCOV_EXCL_LINE
340 }
341 if(!isFaultyIconv()) {
342 // Testing a codepage which may crash with IConv on macOS, see issue #196
343 test_to_from_utf<Char>("\xa1\xad\xa1\xad", utf<Char>("……"), "gbk", false);
344 // This might cause a bogus E2BIG on macOS, see issue #206
345 test_to_from_utf<Char>("\x1b\x24\x29\x41\x0e\x4a\x35\xf", utf<Char>("实"), "ISO-2022-CN", false);
346 }
347
348 std::cout << "- Testing correct invalid bytes skipping\n";
349 {
350 std::cout << "-- UTF-8" << std::endl;
351
352 std::cout << "--- At start single" << std::endl;
353 test_error_to_utf<Char>("\xFFgrüßen", utf<Char>("grüßen"), "UTF-8");
354 std::cout << "--- At start multiple" << std::endl;
355 test_error_to_utf<Char>("\xFF\xFFgrüßen", utf<Char>("grüßen"), "UTF-8");
356
357 std::cout << "--- At middle single" << std::endl;
358 test_error_to_utf<Char>("g\xFFrüßen", utf<Char>("grüßen"), "UTF-8");
359 std::cout << "--- At middle multiple" << std::endl;
360 test_error_to_utf<Char>("g\xFF\xFF\xFFrüßen", utf<Char>("grüßen"), "UTF-8");
361
362 std::cout << "--- At end single" << std::endl;
363 test_error_to_utf<Char>("grüßen\xFF", utf<Char>("grüßen"), "UTF-8");
364 std::cout << "--- At end multiple" << std::endl;
365 test_error_to_utf<Char>("grüßen\xFF\xFF", utf<Char>("grüßen"), "UTF-8");
366
367 try {
368 std::cout << "-- ISO-8859-8" << std::endl;
369 test_error_to_utf<Char>("\xFB", utf<Char>(""), "ISO-8859-8");
370 test_error_to_utf<Char>("\xFB-", utf<Char>("-"), "ISO-8859-8");
371 test_error_to_utf<Char>("test \xE0\xE1\xFB", utf<Char>("test \xd7\x90\xd7\x91"), "ISO-8859-8");
372 test_error_to_utf<Char>("test \xE0\xE1\xFB-", utf<Char>("test \xd7\x90\xd7\x91-"), "ISO-8859-8");
373 } catch(const invalid_charset_error&) { // LCOV_EXCL_LINE
374 std::cout << "--- not supported\n"; // LCOV_EXCL_LINE
375 }
376 try {
377 std::cout << "-- cp932" << std::endl;
378 test_error_to_utf<Char>("\x83\xF8", utf<Char>(""), "cp932");
379 test_error_to_utf<Char>("\x83\xF8-", utf<Char>("-"), "cp932");
380 test_error_to_utf<Char>("test\xE0\xA0 \x83\xF8", utf<Char>("test\xe7\x87\xbf "), "cp932");
381 test_error_to_utf<Char>("test\xE0\xA0 \x83\xF8-", utf<Char>("test\xe7\x87\xbf -"), "cp932");
382 } catch(const invalid_charset_error&) { // LCOV_EXCL_LINE
383 std::cout << "--- not supported\n"; // LCOV_EXCL_LINE
384 }
385 std::cout << "-- Error for encoding at start" << std::endl;
386 test_error_from_utf<Char>(utf<Char>("שלום hello"), " hello", "ISO8859-1");
387 std::cout << "-- Error for encoding at middle and end" << std::endl;
388 test_error_from_utf<Char>(utf<Char>("hello שלום world"), "hello world", "ISO8859-1");
389 std::cout << "-- Error for encoding at end" << std::endl;
390 test_error_from_utf<Char>(utf<Char>("hello שלום"), "hello ", "ISO8859-1");
391 std::cout << "-- Error for decoding to UTF-8" << std::endl;
392 test_error_from_utf<Char>(utfutf<Char>::bad(), utfutf<char>::ok(), "UTF-8");
393 std::cout << "-- Error for decoding to Latin1" << std::endl;
394 test_error_from_utf<Char>(utfutf<Char>::bad(), to<char>(utf8: utfutf<char>::ok()), "Latin1");
395
396 const std::basic_string<Char> onlyInvalidUtf(2, utfutf<Char>::bad_char());
397 std::cout << "-- Error decoding string of only invalid chars to UTF-8" << std::endl;
398 test_error_from_utf<Char>(onlyInvalidUtf, "", "UTF-8");
399 std::cout << "-- Error decoding string of only invalid chars to Latin1" << std::endl;
400 test_error_from_utf<Char>(onlyInvalidUtf, "", "Latin1");
401 }
402
403 test_with_0<Char>();
404}
405
406template<typename Char1, typename Char2>
407void test_utf_to_utf_for(const std::string& utf8_string)
408{
409 const auto utf_string1 = utf<Char1>(utf8_string);
410 const auto utf_string2 = utf<Char2>(utf8_string);
411 using boost::locale::conv::utf_to_utf;
412 TEST_EQ(utf_to_utf<Char1>(utf_string2), utf_string1);
413 TEST_EQ(utf_to_utf<Char2>(utf_string1), utf_string2);
414 TEST_EQ(utf_to_utf<Char1>(utf_string1), utf_string1);
415 TEST_EQ(utf_to_utf<Char2>(utf_string2), utf_string2);
416}
417
418template<typename Char>
419void test_utf_to_utf_for()
420{
421 const std::string& utf8_string = "A-Za-z0-9grüße'\xf0\xa0\x82\x8a'\xf4\x8f\xbf\xbf";
422 std::cout << "---- char\n";
423 test_utf_to_utf_for<Char, char>(utf8_string);
424 test_to_utf_for_impls(utf8_string, utf<Char>(utf8_string), "UTF-8");
425 test_from_utf_for_impls(utf<Char>(utf8_string), utf8_string, "UTF-8");
426 std::cout << "---- wchar_t\n";
427 test_utf_to_utf_for<Char, wchar_t>(utf8_string);
428#ifndef BOOST_LOCALE_NO_CXX20_STRING8
429 std::cout << "---- char8_t\n";
430 test_utf_to_utf_for<Char, char8_t>(utf8_string);
431#endif
432#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
433 std::cout << "---- char16_t\n";
434 test_utf_to_utf_for<Char, char16_t>(utf8_string);
435#endif
436#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
437 std::cout << "---- char32_t\n";
438 test_utf_to_utf_for<Char, char32_t>(utf8_string);
439#endif
440}
441
442void test_utf_to_utf()
443{
444 std::cout << "- Testing UTF to UTF conversion\n";
445 std::cout << "-- char\n";
446 test_utf_to_utf_for<char>();
447 std::cout << "-- wchar_t\n";
448 test_utf_to_utf_for<wchar_t>();
449#ifndef BOOST_LOCALE_NO_CXX20_STRING8
450 std::cout << "-- char8_t\n";
451 test_utf_to_utf_for<char8_t>();
452#endif
453#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
454 std::cout << "-- char16_t\n";
455 test_utf_to_utf_for<char16_t>();
456#endif
457#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
458 std::cout << "-- char32_t\n";
459 test_utf_to_utf_for<char32_t>();
460#endif
461}
462
463/// Test all overloads of to_utf/from_utf templated by Char
464template<typename Char>
465void test_latin1_conversions_for()
466{
467 const std::string utf8_string = "A-Za-z0-9grüße";
468 const std::string sLatin1 = to<char>(utf8: utf8_string);
469 // Sanity check that utf8_string is UTF-8 encoded (using multiple bytes for the special chars)
470 // and sLatin1 is not encoded (1 byte per char)
471 TEST_GT(utf8_string.length(), sLatin1.length());
472 const std::basic_string<Char> sWide = utf<Char>(utf8_string);
473 const std::string encoding = "Latin1";
474
475 using boost::locale::conv::to_utf;
476 using boost::locale::conv::utf_encoder;
477 // 3 variants for source: string, C-string, range
478 TEST_EQ(to_utf<Char>(sLatin1, encoding), sWide);
479 TEST_EQ(to_utf<Char>(sLatin1.c_str(), encoding), sWide);
480 TEST_EQ(to_utf<Char>(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), encoding), sWide);
481 TEST_EQ(utf_encoder<Char>(encoding)(sLatin1), sWide);
482 TEST_EQ(utf_encoder<Char>(encoding).convert(sLatin1), sWide);
483 TEST_EQ(utf_encoder<Char>(encoding).convert(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size()), sWide);
484 // Same but encoding given via locale
485 const std::locale l = boost::locale::generator{}("en_US.Latin1");
486 TEST_EQ(to_utf<Char>(sLatin1, l), sWide);
487 TEST_EQ(to_utf<Char>(sLatin1.c_str(), l), sWide);
488 TEST_EQ(to_utf<Char>(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), l), sWide);
489
490 using boost::locale::conv::from_utf;
491 using boost::locale::conv::utf_decoder;
492 // 3 variants for source: string, C-string, range
493 TEST_EQ(from_utf<Char>(sWide, encoding), sLatin1);
494 TEST_EQ(from_utf<Char>(sWide.c_str(), encoding), sLatin1);
495 TEST_EQ(from_utf<Char>(sWide.c_str(), sWide.c_str() + sWide.size(), encoding), sLatin1);
496 TEST_EQ(utf_decoder<Char>(encoding)(sWide), sLatin1);
497 TEST_EQ(utf_decoder<Char>(encoding).convert(sWide), sLatin1);
498 TEST_EQ(utf_decoder<Char>(encoding).convert(sWide.c_str(), sWide.c_str() + sWide.size()), sLatin1);
499 // Same but encoding given via locale
500 TEST_EQ(from_utf<Char>(sWide, l), sLatin1);
501 TEST_EQ(from_utf<Char>(sWide.c_str(), l), sLatin1);
502 TEST_EQ(from_utf<Char>(sWide.c_str(), sWide.c_str() + sWide.size(), l), sLatin1);
503
504 // Empty string doesn't error/assert
505 TEST_EQ(to_utf<Char>("", encoding), utf<Char>(""));
506 TEST_EQ(from_utf<Char>(utf<Char>(""), encoding), std::string());
507 test_to_utf_for_impls("", utf<Char>(""), encoding);
508 test_from_utf_for_impls(utf<Char>(""), "", encoding);
509}
510
511/// Quick check of to_utf/from_utf overloads using the simple Latin1 encoding
512void test_latin1_conversions()
513{
514 std::cout << "- Testing Latin1 conversion\n";
515 std::cout << "-- char\n";
516 test_latin1_conversions_for<char>();
517 std::cout << "-- wchar_t\n";
518 test_latin1_conversions_for<wchar_t>();
519#ifndef BOOST_LOCALE_NO_CXX20_STRING8
520 std::cout << "-- char8_t\n";
521 test_latin1_conversions_for<char8_t>();
522#endif
523#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
524 std::cout << "-- char16_t\n";
525 test_latin1_conversions_for<char16_t>();
526#endif
527#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
528 std::cout << "-- char32_t\n";
529 test_latin1_conversions_for<char32_t>();
530#endif
531}
532
533void test_between_for_impls(const std::string& source,
534 const std::string& target,
535 const std::string& to_encoding,
536 const std::string& from_encoding,
537 const bool expectSuccess = true)
538{
539 boost::locale::conv::narrow_converter conv(from_encoding, to_encoding);
540 TEST_EQ(conv(source), target);
541 for(const auto impl : all_conv_backends) {
542 using boost::locale::conv::detail::make_narrow_converter;
543 std::cout << "----- " << impl << '\n';
544 using boost::locale::conv::invalid_charset_error;
545 try {
546 auto convPtr = make_narrow_converter(src_encoding: from_encoding, target_encoding: to_encoding, how: boost::locale::conv::skip, impl);
547 TEST_EQ(convPtr->convert(source), target);
548 } catch(invalid_charset_error&) {
549 continue; // LCOV_EXCL_LINE
550 }
551 if(!expectSuccess) {
552 auto convPtr = make_narrow_converter(src_encoding: from_encoding, target_encoding: to_encoding, how: boost::locale::conv::stop, impl);
553 TEST_FAIL_CONVERSION(convPtr->convert(source));
554 }
555 }
556 if(to_encoding == "UTF-8" && from_encoding == "UTF-8") {
557 using boost::locale::conv::utf_to_utf;
558 TEST_EQ(utf_to_utf<char>(source), target);
559 if(!expectSuccess)
560 TEST_FAIL_CONVERSION(utf_to_utf<char>(source, boost::locale::conv::stop));
561 }
562}
563
564void test_error_between(const std::string& source,
565 const std::string& target,
566 const std::string& to_encoding,
567 const std::string& from_encoding)
568{
569 using boost::locale::conv::between;
570 TEST_EQ(between(source, to_encoding, from_encoding), target);
571 using boost::locale::conv::stop;
572 TEST_FAIL_CONVERSION(between(source, to_encoding, from_encoding, stop));
573 TEST_FAIL_CONVERSION(between(source.c_str(), to_encoding, from_encoding, stop));
574 TEST_FAIL_CONVERSION(between(source.c_str(), source.c_str() + source.size(), to_encoding, from_encoding, stop));
575 test_between_for_impls(source, target, to_encoding, from_encoding, expectSuccess: false);
576}
577
578void test_between()
579{
580 using boost::locale::conv::between;
581 const std::string utf8_string = "A-Za-z0-9grüße";
582 const std::string sLatin1 = to<char>(utf8: utf8_string);
583 TEST_GT(utf8_string.length(), sLatin1.length()); // Assert UTF encoding -> multi byte
584 TEST_EQ(between(sLatin1, "UTF-8", "Latin1"), utf8_string);
585 TEST_EQ(between(sLatin1.c_str(), "UTF-8", "Latin1"), utf8_string);
586 TEST_EQ(between(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), "UTF-8", "Latin1"), utf8_string);
587 test_between_for_impls(source: sLatin1, target: utf8_string, to_encoding: "UTF-8", from_encoding: "Latin1");
588 TEST_EQ(between(utf8_string, "Latin1", "UTF-8"), sLatin1);
589 TEST_EQ(between(utf8_string.c_str(), "Latin1", "UTF-8"), sLatin1);
590 TEST_EQ(between(utf8_string.c_str(), utf8_string.c_str() + utf8_string.size(), "Latin1", "UTF-8"), sLatin1);
591 test_between_for_impls(source: utf8_string, target: sLatin1, to_encoding: "Latin1", from_encoding: "UTF-8");
592 // Same encoding
593 TEST_EQ(between(utf8_string, "UTF-8", "UTF-8"), utf8_string);
594 test_between_for_impls(source: utf8_string, target: utf8_string, to_encoding: "UTF-8", from_encoding: "UTF-8");
595 TEST_EQ(between(sLatin1, "Latin1", "Latin1"), sLatin1);
596 test_between_for_impls(source: sLatin1, target: sLatin1, to_encoding: "Latin1", from_encoding: "Latin1");
597 // Wrong encoding throws
598 {
599 using boost::locale::conv::invalid_charset_error;
600 TEST_THROWS(between(sLatin1, "Invalid-Encoding", "Latin1"), invalid_charset_error);
601 TEST_THROWS(between(sLatin1, "UTF-8", "Invalid-Encoding"), invalid_charset_error);
602 TEST_THROWS(between(sLatin1, "Invalid-Encoding", "Invalid-Encoding"), invalid_charset_error);
603 for(const auto impl : all_conv_backends) {
604 std::cout << "----- " << impl << '\n';
605 using boost::locale::conv::invalid_charset_error;
606 using boost::locale::conv::skip;
607 using boost::locale::conv::detail::make_narrow_converter;
608 TEST_THROWS(make_narrow_converter("Invalid-Encoding", "Latin1", skip, impl), invalid_charset_error);
609 TEST_THROWS(make_narrow_converter("UTF-8", "Invalid-Encoding", skip, impl), invalid_charset_error);
610 TEST_THROWS(make_narrow_converter("Invalid-Encoding", "Invalid-Encoding", skip, impl),
611 invalid_charset_error);
612 }
613 }
614 // Error handling
615 // Unencodable char at start, middle, end
616 test_error_between(source: "שלום hello", target: " hello", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
617 test_error_between(source: "hello שלום world", target: "hello world", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
618 test_error_between(source: "hello שלום", target: "hello ", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
619 // Undecodable char(s) at start, middle, end
620 test_error_between(source: "\xFFxfoo", target: "xfoo", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
621 test_error_between(source: "\xFF\xFFyfoo", target: "yfoo", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
622 test_error_between(source: "f\xFFoo2", target: "foo2", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
623 test_error_between(source: "f\xFF\xFF\xFFoo3", target: "foo3", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
624 test_error_between(source: "foo4\xFF", target: "foo4", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
625 test_error_between(source: "foo5\xFF\xFF", target: "foo5", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
626 // Same but UTF-8 to UTF-8
627 test_error_between(source: "\xFFzfoo", target: "zfoo", to_encoding: "UTF-8", from_encoding: "UTF-8");
628 test_error_between(source: "f\xFFoo6", target: "foo6", to_encoding: "UTF-8", from_encoding: "UTF-8");
629 test_error_between(source: "f\xFF\xFF\xFFoo7", target: "foo7", to_encoding: "UTF-8", from_encoding: "UTF-8");
630}
631
// Tests of internal details, defined at the end of this file
void test_utf_name();
void test_simple_encodings();
void test_win_codepages();
635
636void test_main(int /*argc*/, char** /*argv*/)
637{
638 // Sanity check to<char>
639 TEST_EQ(to<char>("grüßen"),
640 "gr\xFC\xDF"
641 "en");
642 TEST_THROWS(to<char>("€"), std::logic_error);
643 // Sanity check internal details
644 test_utf_name();
645 test_simple_encodings();
646 test_win_codepages();
647
648 test_latin1_conversions();
649 test_utf_to_utf();
650
651 std::cout << "Testing charset to/from UTF conversion functions\n";
652 std::cout << " char" << std::endl;
653 test_utf_for<char>();
654 std::cout << " wchar_t" << std::endl;
655 test_utf_for<wchar_t>();
656#ifndef BOOST_LOCALE_NO_CXX20_STRING8
657 std::cout << " char8_t" << std::endl;
658 test_utf_for<char8_t>();
659#endif
660#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
661 std::cout << " char16_t" << std::endl;
662 test_utf_for<char16_t>();
663#endif
664#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
665 std::cout << " char32_t" << std::endl;
666 test_utf_for<char32_t>();
667#endif
668
669 test_all_combinations();
670 test_between();
671}
672
673// Internal tests, keep those out of the above scope
674
/// Tell whether the target stores the least significant byte first.
/// Compiler-provided macros are used when available, otherwise the first byte of an int is inspected.
bool isLittleEndian()
{
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
    return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
#elif defined(__LITTLE_ENDIAN__)
    return true;
#elif defined(__BIG_ENDIAN__)
    return false;
#else
    const int probe = 1;
    const char* const firstByte = reinterpret_cast<const char*>(&probe);
    return *firstByte == 1;
#endif
}
687
688#include "../src/boost/locale/util/encoding.hpp"
689#include "../src/boost/locale/util/win_codepages.hpp"
690
691void test_utf_name()
692{
693 TEST_EQ(boost::locale::util::utf_name<char>(), std::string("UTF-8"));
694#ifdef __cpp_char8_t
695 TEST_EQ(boost::locale::util::utf_name<char8_t>(), std::string("UTF-8"));
696#endif
697 TEST_EQ(boost::locale::util::utf_name<char16_t>(), std::string(isLittleEndian() ? "UTF-16LE" : "UTF-16BE"));
698 TEST_EQ(boost::locale::util::utf_name<char32_t>(), std::string(isLittleEndian() ? "UTF-32LE" : "UTF-32BE"));
699}
700
701void test_simple_encodings()
702{
703 using namespace boost::locale::util;
704 const auto encodings = get_simple_encodings();
705 for(auto it = encodings.begin(), end = encodings.end(); it != end; ++it) {
706 TEST_EQ(normalize_encoding(*it), *it); // Must be normalized
707 const auto it2 = std::find(first: it + 1, last: end, val: *it);
708 TEST(it2 == end);
709 if(it2 != end)
710 std::cerr << "Duplicate entry: " << *it << '\n'; // LCOV_EXCL_LINE
711 }
712 const auto it = std::is_sorted_until(first: encodings.begin(), last: encodings.end());
713 TEST(it == encodings.end());
714 if(it != encodings.end())
715 std::cerr << "First wrongly sorted element: " << *it << '\n'; // LCOV_EXCL_LINE
716}
717
718void test_win_codepages()
719{
720 using namespace boost::locale::util;
721
722 for(const windows_encoding *it = all_windows_encodings, *end = std::end(arr&: all_windows_encodings); it != end; ++it) {
723 TEST_EQ(normalize_encoding(it->name), it->name); // Must be normalized
724 auto is_same_win_codepage = [&it](const windows_encoding& rhs) -> bool {
725 return it->codepage == rhs.codepage && std::strcmp(s1: it->name, s2: rhs.name) == 0;
726 };
727 const auto* it2 = std::find_if(first: it + 1, last: end, pred: is_same_win_codepage);
728 TEST(it2 == end);
729 if(it2 != end)
730 std::cerr << "Duplicate entry: " << it->name << ':' << it->codepage << '\n'; // LCOV_EXCL_LINE
731 }
732 const auto cmp = [](const windows_encoding& rhs, const windows_encoding& lhs) -> bool { return rhs < lhs.name; };
733 const auto* it = std::is_sorted_until(first: all_windows_encodings, last: std::end(arr&: all_windows_encodings), comp: cmp);
734 TEST(it == std::end(all_windows_encodings));
735 if(it != std::end(arr&: all_windows_encodings))
736 std::cerr << "First wrongly sorted element: " << it->name << '\n'; // LCOV_EXCL_LINE
737}
738
739// boostinspect:noascii
740

// Source code of boost/libs/locale/test/test_encoding.cpp