test_encoding.cpp source code [boost/libs/locale/test/test_encoding.cpp]

1	//
2	// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
3	// Copyright (c) 2022-2023 Alexander Grund
4	//
5	// Distributed under the Boost Software License, Version 1.0.
6	// https://www.boost.org/LICENSE_1_0.txt
7
8	#include <boost/locale/encoding.hpp>
9	#include <boost/locale/generator.hpp>
10	#include <algorithm>
11
12	#include "boostLocale/test/tools.hpp"
13	#include "boostLocale/test/unit_test.hpp"
14
15	const bool test_iso_8859_8 =
16	#if defined(BOOST_LOCALE_WITH_ICU) \|\| defined(BOOST_LOCALE_WITH_ICONV)
17	true;
18	#else
19	hasWinCodepage(`28598`);
20	#endif
21
#if defined(BOOST_LOCALE_WITH_ICONV)
// Reproduce issue #206 to detect faulty IConv
/// Return true if converting to ISO-2022-CN through the IConv backend fails
/// with a runtime_error whose message contains "IConv is faulty".
static bool isFaultyIconv()
{
    namespace blc = boost::locale::conv;
    auto from_utf = blc::detail::make_utf_decoder<char>("ISO-2022-CN", blc::skip, blc::detail::conv_backend::IConv);
    try {
        from_utf->convert("实");
    } catch(const std::runtime_error& e) { // LCOV_EXCL_LINE
        return std::string(e.what()).find("IConv is faulty") != std::string::npos; // LCOV_EXCL_LINE
    }
    return false;
}
#else
/// Without the IConv backend there is nothing that could be faulty.
constexpr bool isFaultyIconv()
{
    return false;
}
#endif
41
42	constexpr boost::locale::conv::detail::conv_backend all_conv_backends[] = {
43	#ifdef BOOST_LOCALE_WITH_ICONV
44	boost::locale::conv::detail::conv_backend::IConv,
45	#endif
46	#ifdef BOOST_LOCALE_WITH_ICU
47	boost::locale::conv::detail::conv_backend::ICU,
48	#endif
49	#if BOOST_LOCALE_USE_WIN32_API
50	boost::locale::conv::detail::conv_backend::WinAPI,
51	#endif
52	};
53
54	std::ostream& operator<<(std::ostream& s, boost::locale::conv::detail::conv_backend impl)
55	{
56	using boost::locale::conv::detail::conv_backend;
57	switch(impl) {
58	case conv_backend::Default: return s << "[Default]"; // LCOV_EXCL_LINE
59	case conv_backend::IConv: return s << "[IConv]";
60	case conv_backend::ICU: return s << "[ICU]";
61	case conv_backend::WinAPI: return s << "[WinAPI]";
62	}
63	return s; // LCOV_EXCL_LINE
64	}
65
// Shorthand for TEST_THROWS with boost::locale::conv::conversion_error as the expected exception type
#define TEST_FAIL_CONVERSION(X) TEST_THROWS(X, boost::locale::conv::conversion_error)
67
68	template<typename Char>
69	void test_to_utf_for_impls(const std::string& source,
70	const std::basic_string<Char>& target,
71	const std::string& encoding,
72	const bool expectSuccess = true,
73	const bool test_default = true)
74	{
75	if(test_default) {
76	boost::locale::conv::utf_encoder<Char> conv(encoding);
77	TEST_EQ(conv(source), target);
78	}
79	for(const auto impl : all_conv_backends) {
80	std::cout << "----- " << impl << `'\n'`;
81	using boost::locale::conv::invalid_charset_error;
82	try {
83	auto convPtr =
84	boost::locale::conv::detail::make_utf_encoder<Char>(encoding, boost::locale::conv::skip, impl);
85	TEST_EQ(convPtr->convert(source), target);
86	} catch(invalid_charset_error&) {
87	std::cout << "--- Charset not supported\n"; // LCOV_EXCL_LINE
88	continue; // LCOV_EXCL_LINE
89	}
90	if(!expectSuccess) {
91	auto convPtr =
92	boost::locale::conv::detail::make_utf_encoder<Char>(encoding, boost::locale::conv::stop, impl);
93	TEST_FAIL_CONVERSION(convPtr->convert(source));
94	}
95	}
96	if(encoding == "UTF-8") {
97	using boost::locale::conv::utf_to_utf;
98	TEST_EQ(utf_to_utf<Char>(source), target);
99	if(expectSuccess)
100	TEST_EQ(utf_to_utf<char>(source), source);
101	else
102	TEST_FAIL_CONVERSION(utf_to_utf<Char>(source, boost::locale::conv::stop));
103	}
104	}
105
106	template<typename Char>
107	void test_from_utf_for_impls(const std::basic_string<Char>& source,
108	const std::string& target,
109	const std::string& encoding,
110	const bool expectSuccess = true,
111	const bool test_default = true)
112	{
113	if(test_default) {
114	boost::locale::conv::utf_decoder<Char> conv(encoding);
115	TEST_EQ(conv(source), target);
116	}
117	for(const auto impl : all_conv_backends) {
118	std::cout << "----- " << impl << `'\n'`;
119	using boost::locale::conv::invalid_charset_error;
120	try {
121	auto convPtr =
122	boost::locale::conv::detail::make_utf_decoder<Char>(encoding, boost::locale::conv::skip, impl);
123	TEST_EQ(convPtr->convert(source), target);
124	} catch(invalid_charset_error&) {
125	std::cout << "--- Charset not supported\n"; // LCOV_EXCL_LINE
126	continue; // LCOV_EXCL_LINE
127	}
128	if(!expectSuccess) {
129	auto convPtr =
130	boost::locale::conv::detail::make_utf_decoder<Char>(encoding, boost::locale::conv::stop, impl);
131	TEST_FAIL_CONVERSION(convPtr->convert(source));
132	}
133	}
134	if(encoding == "UTF-8") {
135	using boost::locale::conv::utf_to_utf;
136	TEST_EQ(utf_to_utf<char>(source), target);
137	if(expectSuccess)
138	TEST_EQ(utf_to_utf<Char>(source), source);
139	else
140	TEST_FAIL_CONVERSION(utf_to_utf<char>(source, boost::locale::conv::stop));
141	}
142	}
143
144	template<typename Char>
145	void test_to_from_utf(const std::string& source,
146	const std::basic_string<Char>& target,
147	const std::string& encoding,
148	const bool test_default = true)
149	{
150	std::cout << "-- " << encoding << std::endl;
151
152	if(test_default) {
153	TEST_EQ(boost::locale::conv::to_utf<Char>(source, encoding), target);
154	TEST_EQ(boost::locale::conv::from_utf<Char>(target, encoding), source);
155	}
156	test_to_utf_for_impls(source, target, encoding, true, test_default);
157	test_from_utf_for_impls(target, source, encoding, true, test_default);
158	}
159
160	template<typename Char>
161	void test_error_to_utf(const std::string& source, const std::basic_string<Char>& target, const std::string& encoding)
162	{
163	using boost::locale::conv::to_utf;
164	using boost::locale::conv::stop;
165
166	// Default: Replace, no error
167	TEST_EQ(to_utf<Char>(source, encoding), target);
168	// Test all overloads with method=stop -> error
169	// source as string, C-String, range
170	TEST_FAIL_CONVERSION(to_utf<Char>(source, encoding, stop));
171	TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), encoding, stop));
172	TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), source.c_str() + source.size(), encoding, stop));
173	// Same but encoding via locale
174	const std::locale l = boost::locale::generator {}("en_US." + encoding);
175	TEST_FAIL_CONVERSION(to_utf<Char>(source, l, stop));
176	TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), l, stop));
177	TEST_FAIL_CONVERSION(to_utf<Char>(source.c_str(), source.c_str() + source.size(), l, stop));
178	test_to_utf_for_impls(source, target, encoding, false);
179	}
180
181	template<typename Char>
182	void test_error_from_utf(const std::basic_string<Char>& source, const std::string& target, const std::string& encoding)
183	{
184	using boost::locale::conv::from_utf;
185	using boost::locale::conv::stop;
186
187	// Default: Replace, no error
188	TEST_EQ(from_utf<Char>(source, encoding), target);
189	// Test all overloads with method=stop -> error
190	// source as string, C-String, range
191	TEST_FAIL_CONVERSION(from_utf<Char>(source, encoding, stop));
192	TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), encoding, stop));
193	TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), source.c_str() + source.size(), encoding, stop));
194	// Same but encoding via locale
195	const std::locale l = boost::locale::generator {}("en_US." + encoding);
196	TEST_FAIL_CONVERSION(from_utf<Char>(source, l, stop));
197	TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), l, stop));
198	TEST_FAIL_CONVERSION(from_utf<Char>(source.c_str(), source.c_str() + source.size(), l, stop));
199	test_from_utf_for_impls(source, target, encoding, false);
200	}
201
202	template<typename Char>
203	std::basic_string<Char> utf(const std::string& s)
204	{
205	return to<Char>(s);
206	}
207
208	template<>
209	std::basic_string<char> utf(const std::string& s)
210	{
211	return s;
212	}
213
214	template<typename Char>
215	void test_with_0()
216	{
217	std::cout << "-- Test string containing NULL chars" << std::endl;
218	const char with_null[] = "foo\0\0 of\0";
219	const std::string s_with_null(with_null, sizeof(with_null) - `1`);
220	const std::basic_string<Char> s_with_null2 = ascii_to<Char>(with_null);
221	for(const std::string charset : {"UTF-8", "ISO8859-1"}) {
222	for(const auto impl : all_conv_backends) {
223	std::cout << "--- " << charset << " to UTF with Impl " << impl << std::endl;
224	auto to_utf =
225	boost::locale::conv::detail::make_utf_encoder<Char>(charset, boost::locale::conv::default_method, impl);
226	TEST_EQ(to_utf->convert(s_with_null), s_with_null2);
227	std::cout << "--- " << charset << " from UTF with Impl " << impl << std::endl;
228	auto from_utf =
229	boost::locale::conv::detail::make_utf_decoder<Char>(charset, boost::locale::conv::default_method, impl);
230	TEST_EQ(from_utf->convert(s_with_null2), s_with_null);
231	}
232	}
233	using boost::locale::conv::utf_to_utf;
234	TEST_EQ(utf_to_utf<Char>(s_with_null), s_with_null2);
235	TEST_EQ(utf_to_utf<Char>(s_with_null2), s_with_null2);
236	TEST_EQ(utf_to_utf<char>(s_with_null2), s_with_null);
237	TEST_EQ(utf_to_utf<char>(s_with_null), s_with_null);
238	}
239
/// Test data for utf_to_utf, selected by the size of the character type:
/// - ok():       a valid string ("grüßen")
/// - bad():      the same string with invalid code units inserted
/// - bad_char(): a single code unit that is invalid on its own
template<typename Char, int n = sizeof(Char)>
struct utfutf;

#ifdef BOOST_MSVC
#    pragma warning(push)
#    pragma warning(disable : 4309) // narrowing static_cast warning
#endif
/// 1 byte code units (UTF-8): 0xFF is the invalid byte
template<typename U8Char>
struct utfutf<U8Char, 1> {
    static const U8Char* ok() { return reinterpret_cast<const U8Char*>("grüßen"); }
    static const U8Char* bad()
    {
        return reinterpret_cast<const U8Char*>("gr\xFF"
                                               "üßen");
        // split into 2 to make SunCC happy
    }
    static U8Char bad_char() { return static_cast<U8Char>(0xFF); }
};

/// 2 byte wchar_t (UTF-16): misplaced surrogates are the invalid code units
template<>
struct utfutf<wchar_t, 2> {
    static const wchar_t* ok() { return L"\x67\x72\xfc\xdf\x65\x6e"; }
    static const wchar_t* bad()
    {
        static wchar_t buf[256] = L"\x67\x72\xFF\xfc\xFE\xFD\xdf\x65\x6e";
        buf[2] = 0xDC01; // second surrogate must not be
        buf[4] = 0xD801; // First
        buf[5] = 0xD801; // Must be surrogate trail
        return buf;
    }
    static wchar_t bad_char() { return static_cast<wchar_t>(0xDC01); }
};

/// 4 byte wchar_t (UTF-32): a value above 0x10FFFF is the invalid code unit
template<>
struct utfutf<wchar_t, 4> {
    static const wchar_t* ok() { return L"\x67\x72\xfc\xdf\x65\x6e"; }
    static const wchar_t* bad()
    {
        static wchar_t buf[256] = L"\x67\x72\xFF\xfc\xdf\x65\x6e";
        buf[2] = static_cast<wchar_t>(0x1000000); // > 10FFFF
        return buf;
    }
    static wchar_t bad_char() { return static_cast<wchar_t>(0x1000000); }
};
#ifdef BOOST_MSVC
#    pragma warning(pop)
#endif
287
288	template<typename CharOut, typename CharIn>
289	void test_combinations()
290	{
291	using boost::locale::conv::utf_to_utf;
292	typedef utfutf<CharOut> out;
293	typedef utfutf<CharIn> in;
294	const CharIn* inOk = in::ok();
295	// Both overloads: C-string and string. Both call the range overload
296	TEST((utf_to_utf<CharOut>(inOk) == out::ok()));
297	TEST((utf_to_utf<CharOut>(std::basic_string<CharIn>(inOk)) == out::ok()));
298	const CharIn* inBad = in::bad();
299	// Again both overloads
300	TEST_FAIL_CONVERSION((utf_to_utf<CharOut>(inBad, boost::locale::conv::stop)));
301	TEST_FAIL_CONVERSION((utf_to_utf<CharOut>(std::basic_string<CharIn>(inBad), boost::locale::conv::stop)));
302	TEST((utf_to_utf<CharOut>(in::bad()) == out::ok()));
303	}
304
305	void test_all_combinations()
306	{
307	std::cout << "Testing utf_to_utf\n";
308	std::cout << " char<-char" << std::endl;
309	test_combinations<char, char>();
310	std::cout << " char<-wchar" << std::endl;
311	test_combinations<char, wchar_t>();
312	std::cout << " wchar<-char" << std::endl;
313	test_combinations<wchar_t, char>();
314	std::cout << " wchar<-wchar" << std::endl;
315	test_combinations<wchar_t, wchar_t>();
316	}
317
318	template<typename Char>
319	void test_utf_for()
320	{
321	using boost::locale::conv::invalid_charset_error;
322
323	{
324	using boost::locale::conv::to_utf;
325	using boost::locale::conv::from_utf;
326	TEST_THROWS(to_utf<Char>("Hello", "invalid-charset"), invalid_charset_error);
327	TEST_THROWS(from_utf<Char>(ascii_to<Char>("Hello"), "invalid-charset"), invalid_charset_error);
328	}
329
330	test_to_from_utf<Char>(to<char>(utf8: "grüßen"), utf<Char>("grüßen"), "ISO8859-1");
331	if(test_iso_8859_8)
332	test_to_from_utf<Char>("\xf9\xec\xe5\xed", utf<Char>("שלום"), "ISO8859-8");
333	test_to_from_utf<Char>("grüßen", utf<Char>("grüßen"), "UTF-8");
334	test_to_from_utf<Char>("abc\"\xf0\xa0\x82\x8a\"", utf<Char>("abc\"\xf0\xa0\x82\x8a\""), "UTF-8");
335	// Testing a codepage which may be an issue on Windows, see issue #121
336	try {
337	test_to_from_utf<Char>("\x1b$BE_5(\x1b(B", utf<Char>("冬季"), "iso-2022-jp");
338	} catch(const invalid_charset_error&) { // LCOV_EXCL_LINE
339	std::cout << "--- not supported\n"; // LCOV_EXCL_LINE
340	}
341	if(!isFaultyIconv()) {
342	// Testing a codepage which may crash with IConv on macOS, see issue #196
343	test_to_from_utf<Char>("\xa1\xad\xa1\xad", utf<Char>("……"), "gbk", false);
344	// This might cause a bogus E2BIG on macOS, see issue #206
345	test_to_from_utf<Char>("\x1b\x24\x29\x41\x0e\x4a\x35\xf", utf<Char>("实"), "ISO-2022-CN", false);
346	}
347
348	std::cout << "- Testing correct invalid bytes skipping\n";
349	{
350	std::cout << "-- UTF-8" << std::endl;
351
352	std::cout << "--- At start single" << std::endl;
353	test_error_to_utf<Char>("\xFFgrüßen", utf<Char>("grüßen"), "UTF-8");
354	std::cout << "--- At start multiple" << std::endl;
355	test_error_to_utf<Char>("\xFF\xFFgrüßen", utf<Char>("grüßen"), "UTF-8");
356
357	std::cout << "--- At middle single" << std::endl;
358	test_error_to_utf<Char>("g\xFFrüßen", utf<Char>("grüßen"), "UTF-8");
359	std::cout << "--- At middle multiple" << std::endl;
360	test_error_to_utf<Char>("g\xFF\xFF\xFFrüßen", utf<Char>("grüßen"), "UTF-8");
361
362	std::cout << "--- At end single" << std::endl;
363	test_error_to_utf<Char>("grüßen\xFF", utf<Char>("grüßen"), "UTF-8");
364	std::cout << "--- At end multiple" << std::endl;
365	test_error_to_utf<Char>("grüßen\xFF\xFF", utf<Char>("grüßen"), "UTF-8");
366
367	try {
368	std::cout << "-- ISO-8859-8" << std::endl;
369	test_error_to_utf<Char>("\xFB", utf<Char>(""), "ISO-8859-8");
370	test_error_to_utf<Char>("\xFB-", utf<Char>("-"), "ISO-8859-8");
371	test_error_to_utf<Char>("test \xE0\xE1\xFB", utf<Char>("test \xd7\x90\xd7\x91"), "ISO-8859-8");
372	test_error_to_utf<Char>("test \xE0\xE1\xFB-", utf<Char>("test \xd7\x90\xd7\x91-"), "ISO-8859-8");
373	} catch(const invalid_charset_error&) { // LCOV_EXCL_LINE
374	std::cout << "--- not supported\n"; // LCOV_EXCL_LINE
375	}
376	try {
377	std::cout << "-- cp932" << std::endl;
378	test_error_to_utf<Char>("\x83\xF8", utf<Char>(""), "cp932");
379	test_error_to_utf<Char>("\x83\xF8-", utf<Char>("-"), "cp932");
380	test_error_to_utf<Char>("test\xE0\xA0 \x83\xF8", utf<Char>("test\xe7\x87\xbf "), "cp932");
381	test_error_to_utf<Char>("test\xE0\xA0 \x83\xF8-", utf<Char>("test\xe7\x87\xbf -"), "cp932");
382	} catch(const invalid_charset_error&) { // LCOV_EXCL_LINE
383	std::cout << "--- not supported\n"; // LCOV_EXCL_LINE
384	}
385	std::cout << "-- Error for encoding at start" << std::endl;
386	test_error_from_utf<Char>(utf<Char>("שלום hello"), " hello", "ISO8859-1");
387	std::cout << "-- Error for encoding at middle and end" << std::endl;
388	test_error_from_utf<Char>(utf<Char>("hello שלום world"), "hello world", "ISO8859-1");
389	std::cout << "-- Error for encoding at end" << std::endl;
390	test_error_from_utf<Char>(utf<Char>("hello שלום"), "hello ", "ISO8859-1");
391	std::cout << "-- Error for decoding to UTF-8" << std::endl;
392	test_error_from_utf<Char>(utfutf<Char>::bad(), utfutf<char>::ok(), "UTF-8");
393	std::cout << "-- Error for decoding to Latin1" << std::endl;
394	test_error_from_utf<Char>(utfutf<Char>::bad(), to<char>(utf8: utfutf<char>::ok()), "Latin1");
395
396	const std::basic_string<Char> onlyInvalidUtf(`2`, utfutf<Char>::bad_char());
397	std::cout << "-- Error decoding string of only invalid chars to UTF-8" << std::endl;
398	test_error_from_utf<Char>(onlyInvalidUtf, "", "UTF-8");
399	std::cout << "-- Error decoding string of only invalid chars to Latin1" << std::endl;
400	test_error_from_utf<Char>(onlyInvalidUtf, "", "Latin1");
401	}
402
403	test_with_0<Char>();
404	}
405
406	template<typename Char1, typename Char2>
407	void test_utf_to_utf_for(const std::string& utf8_string)
408	{
409	const auto utf_string1 = utf<Char1>(utf8_string);
410	const auto utf_string2 = utf<Char2>(utf8_string);
411	using boost::locale::conv::utf_to_utf;
412	TEST_EQ(utf_to_utf<Char1>(utf_string2), utf_string1);
413	TEST_EQ(utf_to_utf<Char2>(utf_string1), utf_string2);
414	TEST_EQ(utf_to_utf<Char1>(utf_string1), utf_string1);
415	TEST_EQ(utf_to_utf<Char2>(utf_string2), utf_string2);
416	}
417
418	template<typename Char>
419	void test_utf_to_utf_for()
420	{
421	const std::string& utf8_string = "A-Za-z0-9grüße'\xf0\xa0\x82\x8a'\xf4\x8f\xbf\xbf";
422	std::cout << "---- char\n";
423	test_utf_to_utf_for<Char, char>(utf8_string);
424	test_to_utf_for_impls(utf8_string, utf<Char>(utf8_string), "UTF-8");
425	test_from_utf_for_impls(utf<Char>(utf8_string), utf8_string, "UTF-8");
426	std::cout << "---- wchar_t\n";
427	test_utf_to_utf_for<Char, wchar_t>(utf8_string);
428	#ifndef BOOST_LOCALE_NO_CXX20_STRING8
429	std::cout << "---- char8_t\n";
430	test_utf_to_utf_for<Char, char8_t>(utf8_string);
431	#endif
432	#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
433	std::cout << "---- char16_t\n";
434	test_utf_to_utf_for<Char, char16_t>(utf8_string);
435	#endif
436	#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
437	std::cout << "---- char32_t\n";
438	test_utf_to_utf_for<Char, char32_t>(utf8_string);
439	#endif
440	}
441
442	void test_utf_to_utf()
443	{
444	std::cout << "- Testing UTF to UTF conversion\n";
445	std::cout << "-- char\n";
446	test_utf_to_utf_for<char>();
447	std::cout << "-- wchar_t\n";
448	test_utf_to_utf_for<wchar_t>();
449	#ifndef BOOST_LOCALE_NO_CXX20_STRING8
450	std::cout << "-- char8_t\n";
451	test_utf_to_utf_for<char8_t>();
452	#endif
453	#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
454	std::cout << "-- char16_t\n";
455	test_utf_to_utf_for<char16_t>();
456	#endif
457	#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
458	std::cout << "-- char32_t\n";
459	test_utf_to_utf_for<char32_t>();
460	#endif
461	}
462
463	/// Test all overloads of to_utf/from_utf templated by Char
464	template<typename Char>
465	void test_latin1_conversions_for()
466	{
467	const std::string utf8_string = "A-Za-z0-9grüße";
468	const std::string sLatin1 = to<char>(utf8: utf8_string);
469	// Sanity check that utf8_string is UTF-8 encoded (using multiple bytes for the special chars)
470	// and sLatin1 is not encoded (1 byte per char)
471	TEST_GT(utf8_string.length(), sLatin1.length());
472	const std::basic_string<Char> sWide = utf<Char>(utf8_string);
473	const std::string encoding = "Latin1";
474
475	using boost::locale::conv::to_utf;
476	using boost::locale::conv::utf_encoder;
477	// 3 variants for source: string, C-string, range
478	TEST_EQ(to_utf<Char>(sLatin1, encoding), sWide);
479	TEST_EQ(to_utf<Char>(sLatin1.c_str(), encoding), sWide);
480	TEST_EQ(to_utf<Char>(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), encoding), sWide);
481	TEST_EQ(utf_encoder<Char>(encoding)(sLatin1), sWide);
482	TEST_EQ(utf_encoder<Char>(encoding).convert(sLatin1), sWide);
483	TEST_EQ(utf_encoder<Char>(encoding).convert(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size()), sWide);
484	// Same but encoding given via locale
485	const std::locale l = boost::locale::generator {}("en_US.Latin1");
486	TEST_EQ(to_utf<Char>(sLatin1, l), sWide);
487	TEST_EQ(to_utf<Char>(sLatin1.c_str(), l), sWide);
488	TEST_EQ(to_utf<Char>(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), l), sWide);
489
490	using boost::locale::conv::from_utf;
491	using boost::locale::conv::utf_decoder;
492	// 3 variants for source: string, C-string, range
493	TEST_EQ(from_utf<Char>(sWide, encoding), sLatin1);
494	TEST_EQ(from_utf<Char>(sWide.c_str(), encoding), sLatin1);
495	TEST_EQ(from_utf<Char>(sWide.c_str(), sWide.c_str() + sWide.size(), encoding), sLatin1);
496	TEST_EQ(utf_decoder<Char>(encoding)(sWide), sLatin1);
497	TEST_EQ(utf_decoder<Char>(encoding).convert(sWide), sLatin1);
498	TEST_EQ(utf_decoder<Char>(encoding).convert(sWide.c_str(), sWide.c_str() + sWide.size()), sLatin1);
499	// Same but encoding given via locale
500	TEST_EQ(from_utf<Char>(sWide, l), sLatin1);
501	TEST_EQ(from_utf<Char>(sWide.c_str(), l), sLatin1);
502	TEST_EQ(from_utf<Char>(sWide.c_str(), sWide.c_str() + sWide.size(), l), sLatin1);
503
504	// Empty string doesn't error/assert
505	TEST_EQ(to_utf<Char>("", encoding), utf<Char>(""));
506	TEST_EQ(from_utf<Char>(utf<Char>(""), encoding), std::string ());
507	test_to_utf_for_impls("", utf<Char>(""), encoding);
508	test_from_utf_for_impls(utf<Char>(""), "", encoding);
509	}
510
511	/// Quick check of to_utf/from_utf overloads using the simple Latin1 encoding
512	void test_latin1_conversions()
513	{
514	std::cout << "- Testing Latin1 conversion\n";
515	std::cout << "-- char\n";
516	test_latin1_conversions_for<char>();
517	std::cout << "-- wchar_t\n";
518	test_latin1_conversions_for<wchar_t>();
519	#ifndef BOOST_LOCALE_NO_CXX20_STRING8
520	std::cout << "-- char8_t\n";
521	test_latin1_conversions_for<char8_t>();
522	#endif
523	#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
524	std::cout << "-- char16_t\n";
525	test_latin1_conversions_for<char16_t>();
526	#endif
527	#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
528	std::cout << "-- char32_t\n";
529	test_latin1_conversions_for<char32_t>();
530	#endif
531	}
532
533	void test_between_for_impls(const std::string& source,
534	const std::string& target,
535	const std::string& to_encoding,
536	const std::string& from_encoding,
537	const bool expectSuccess = true)
538	{
539	boost::locale::conv::narrow_converter conv(from_encoding, to_encoding);
540	TEST_EQ(conv(source), target);
541	for(const auto impl : all_conv_backends) {
542	using boost::locale::conv::detail::make_narrow_converter;
543	std::cout << "----- " << impl << `'\n'`;
544	using boost::locale::conv::invalid_charset_error;
545	try {
546	auto convPtr = make_narrow_converter(src_encoding: from_encoding, target_encoding: to_encoding, how: boost::locale::conv::skip, impl);
547	TEST_EQ(convPtr ->convert(source), target);
548	} catch(invalid_charset_error&) {
549	continue; // LCOV_EXCL_LINE
550	}
551	if(!expectSuccess) {
552	auto convPtr = make_narrow_converter(src_encoding: from_encoding, target_encoding: to_encoding, how: boost::locale::conv::stop, impl);
553	TEST_FAIL_CONVERSION(convPtr ->convert(source));
554	}
555	}
556	if(to_encoding == "UTF-8" && from_encoding == "UTF-8") {
557	using boost::locale::conv::utf_to_utf;
558	TEST_EQ(utf_to_utf<char>(source), target);
559	if(!expectSuccess)
560	TEST_FAIL_CONVERSION(utf_to_utf<char>(source, boost::locale::conv::stop));
561	}
562	}
563
564	void test_error_between(const std::string& source,
565	const std::string& target,
566	const std::string& to_encoding,
567	const std::string& from_encoding)
568	{
569	using boost::locale::conv::between;
570	TEST_EQ(between(source, to_encoding, from_encoding), target);
571	using boost::locale::conv::stop;
572	TEST_FAIL_CONVERSION(between(source, to_encoding, from_encoding, stop));
573	TEST_FAIL_CONVERSION(between(source.c_str(), to_encoding, from_encoding, stop));
574	TEST_FAIL_CONVERSION(between(source.c_str(), source.c_str() + source.size(), to_encoding, from_encoding, stop));
575	test_between_for_impls(source, target, to_encoding, from_encoding, expectSuccess: false);
576	}
577
578	void test_between()
579	{
580	using boost::locale::conv::between;
581	const std::string utf8_string = "A-Za-z0-9grüße";
582	const std::string sLatin1 = to<char>(utf8: utf8_string);
583	TEST_GT(utf8_string.length(), sLatin1.length()); // Assert UTF encoding -> multi byte
584	TEST_EQ(between(sLatin1, "UTF-8", "Latin1"), utf8_string);
585	TEST_EQ(between(sLatin1.c_str(), "UTF-8", "Latin1"), utf8_string);
586	TEST_EQ(between(sLatin1.c_str(), sLatin1.c_str() + sLatin1.size(), "UTF-8", "Latin1"), utf8_string);
587	test_between_for_impls(source: sLatin1, target: utf8_string, to_encoding: "UTF-8", from_encoding: "Latin1");
588	TEST_EQ(between(utf8_string, "Latin1", "UTF-8"), sLatin1);
589	TEST_EQ(between(utf8_string.c_str(), "Latin1", "UTF-8"), sLatin1);
590	TEST_EQ(between(utf8_string.c_str(), utf8_string.c_str() + utf8_string.size(), "Latin1", "UTF-8"), sLatin1);
591	test_between_for_impls(source: utf8_string, target: sLatin1, to_encoding: "Latin1", from_encoding: "UTF-8");
592	// Same encoding
593	TEST_EQ(between(utf8_string, "UTF-8", "UTF-8"), utf8_string);
594	test_between_for_impls(source: utf8_string, target: utf8_string, to_encoding: "UTF-8", from_encoding: "UTF-8");
595	TEST_EQ(between(sLatin1, "Latin1", "Latin1"), sLatin1);
596	test_between_for_impls(source: sLatin1, target: sLatin1, to_encoding: "Latin1", from_encoding: "Latin1");
597	// Wrong encoding throws
598	{
599	using boost::locale::conv::invalid_charset_error;
600	TEST_THROWS(between(sLatin1, "Invalid-Encoding", "Latin1"), invalid_charset_error);
601	TEST_THROWS(between(sLatin1, "UTF-8", "Invalid-Encoding"), invalid_charset_error);
602	TEST_THROWS(between(sLatin1, "Invalid-Encoding", "Invalid-Encoding"), invalid_charset_error);
603	for(const auto impl : all_conv_backends) {
604	std::cout << "----- " << impl << `'\n'`;
605	using boost::locale::conv::invalid_charset_error;
606	using boost::locale::conv::skip;
607	using boost::locale::conv::detail::make_narrow_converter;
608	TEST_THROWS(make_narrow_converter("Invalid-Encoding", "Latin1", skip, impl), invalid_charset_error);
609	TEST_THROWS(make_narrow_converter("UTF-8", "Invalid-Encoding", skip, impl), invalid_charset_error);
610	TEST_THROWS(make_narrow_converter("Invalid-Encoding", "Invalid-Encoding", skip, impl),
611	invalid_charset_error);
612	}
613	}
614	// Error handling
615	// Unencodable char at start, middle, end
616	test_error_between(source: "שלום hello", target: " hello", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
617	test_error_between(source: "hello שלום world", target: "hello world", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
618	test_error_between(source: "hello שלום", target: "hello ", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
619	// Undecodable char(s) at start, middle, end
620	test_error_between(source: "\xFFxfoo", target: "xfoo", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
621	test_error_between(source: "\xFF\xFFyfoo", target: "yfoo", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
622	test_error_between(source: "f\xFFoo2", target: "foo2", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
623	test_error_between(source: "f\xFF\xFF\xFFoo3", target: "foo3", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
624	test_error_between(source: "foo4\xFF", target: "foo4", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
625	test_error_between(source: "foo5\xFF\xFF", target: "foo5", to_encoding: "ISO8859-1", from_encoding: "UTF-8");
626	// Same but UTF-8 to UTF-8
627	test_error_between(source: "\xFFzfoo", target: "zfoo", to_encoding: "UTF-8", from_encoding: "UTF-8");
628	test_error_between(source: "f\xFFoo6", target: "foo6", to_encoding: "UTF-8", from_encoding: "UTF-8");
629	test_error_between(source: "f\xFF\xFF\xFFoo7", target: "foo7", to_encoding: "UTF-8", from_encoding: "UTF-8");
630	}
631
// Tests of internal details; only declared here, the definitions follow further down in this file
void test_utf_name();
void test_simple_encodings();
void test_win_codepages();
635
636	void test_main(int /argc/, char** /argv/)
637	{
638	// Sanity check to<char>
639	TEST_EQ(to<char>("grüßen"),
640	"gr\xFC\xDF"
641	"en");
642	TEST_THROWS(to<char>("€"), std::logic_error);
643	// Sanity check internal details
644	test_utf_name();
645	test_simple_encodings();
646	test_win_codepages();
647
648	test_latin1_conversions();
649	test_utf_to_utf();
650
651	std::cout << "Testing charset to/from UTF conversion functions\n";
652	std::cout << " char" << std::endl;
653	test_utf_for<char>();
654	std::cout << " wchar_t" << std::endl;
655	test_utf_for<wchar_t>();
656	#ifndef BOOST_LOCALE_NO_CXX20_STRING8
657	std::cout << " char8_t" << std::endl;
658	test_utf_for<char8_t>();
659	#endif
660	#ifdef BOOST_LOCALE_ENABLE_CHAR16_T
661	std::cout << " char16_t" << std::endl;
662	test_utf_for<char16_t>();
663	#endif
664	#ifdef BOOST_LOCALE_ENABLE_CHAR32_T
665	std::cout << " char32_t" << std::endl;
666	test_utf_for<char32_t>();
667	#endif
668
669	test_all_combinations();
670	test_between();
671	}
672
673	// Internal tests, keep those out of the above scope
674
/// Return true if the least significant byte of an integer is stored first.
/// The compiler provided byte order macros are used when available,
/// otherwise the first byte of an int holding the value 1 is inspected.
bool isLittleEndian()
{
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
    return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
#elif defined(__LITTLE_ENDIAN__)
    return true;
#elif defined(__BIG_ENDIAN__)
    return false;
#else
    // Runtime fallback, kept in the #else branch so no unreachable code follows a return
    const int endianMark = 1;
    return reinterpret_cast<const char*>(&endianMark)[0] == 1;
#endif
}
687
688	#include "../src/boost/locale/util/encoding.hpp"
689	#include "../src/boost/locale/util/win_codepages.hpp"
690
691	void test_utf_name()
692	{
693	TEST_EQ(boost::locale::util::utf_name<char>(), std::string ("UTF-8"));
694	#ifdef __cpp_char8_t
695	TEST_EQ(boost::locale::util::utf_name<char8_t>(), std::string("UTF-8"));
696	#endif
697	TEST_EQ(boost::locale::util::utf_name<char16_t>(), std::string (isLittleEndian() ? "UTF-16LE" : "UTF-16BE"));
698	TEST_EQ(boost::locale::util::utf_name<char32_t>(), std::string (isLittleEndian() ? "UTF-32LE" : "UTF-32BE"));
699	}
700
701	void test_simple_encodings()
702	{
703	using namespace boost::locale::util;
704	const auto encodings = get_simple_encodings();
705	for(auto it = encodings.begin(), end = encodings.end(); it != end; ++it) {
706	TEST_EQ(normalize_encoding(it), it); // Must be normalized
707	const auto it2 = std::find(first: it + `1`, last: end, val: *it);
708	TEST(it2 == end);
709	if(it2 != end)
710	std::cerr << "Duplicate entry: " << it << `'\n'`; // LCOV_EXCL_LINE*
711	}
712	const auto it = std::is_sorted_until(first: encodings.begin(), last: encodings.end());
713	TEST(it == encodings.end());
714	if(it != encodings.end())
715	std::cerr << "First wrongly sorted element: " << it << `'\n'`; // LCOV_EXCL_LINE*
716	}
717
718	void test_win_codepages()
719	{
720	using namespace boost::locale::util;
721
722	for(const windows_encoding it = all_windows_encodings, end = std::end(arr&: all_windows_encodings); it != end; ++it) {
723	TEST_EQ(normalize_encoding(it->name), it->name); // Must be normalized
724	auto is_same_win_codepage = [&it](const windows_encoding& rhs) -> bool {
725	return it->codepage == rhs.codepage && std::strcmp(s1: it->name, s2: rhs.name) == `0`;
726	};
727	const auto* it2 = std::find_if(first: it + `1`, last: end, pred: is_same_win_codepage);
728	TEST(it2 == end);
729	if(it2 != end)
730	std::cerr << "Duplicate entry: " << it->name << `':'` << it->codepage << `'\n'`; // LCOV_EXCL_LINE
731	}
732	const auto cmp = [](const windows_encoding& rhs, const windows_encoding& lhs) -> bool { return rhs < lhs.name; };
733	const auto* it = std::is_sorted_until(first: all_windows_encodings, last: std::end(arr&: all_windows_encodings), comp: cmp);
734	TEST(it == std::end(all_windows_encodings));
735	if(it != std::end(arr&: all_windows_encodings))
736	std::cerr << "First wrongly sorted element: " << it->name << `'\n'`; // LCOV_EXCL_LINE
737	}
738
739	// boostinspect:noascii
740

source code of boost/libs/locale/test/test_encoding.cpp