1/*
2 * Copyright Andrey Semashev 2007 - 2021.
3 * Distributed under the Boost Software License, Version 1.0.
4 * (See accompanying file LICENSE_1_0.txt or copy at
5 * http://www.boost.org/LICENSE_1_0.txt)
6 */
/*!
 * \file   dump_avx2.cpp
 * \author Andrey Semashev
 * \date   05.05.2013
 *
 * \brief  This file is part of the Boost.Log library implementation, see the library documentation
 *         at http://www.boost.org/doc/libs/release/libs/log/doc/html/index.html.
 */
15
16// NOTE: You should generally avoid including headers as much as possible here, because this file
17// is compiled with special compiler options, and any included header may result in generation of
18// unintended code with these options and violation of ODR.
19#include <boost/log/detail/config.hpp>
20#include <ostream>
21#include <immintrin.h>
22#include <boost/cstdint.hpp>
23#include <boost/log/detail/header.hpp>
24
25#if defined(__x86_64) || defined(__x86_64__) || \
26 defined(__amd64__) || defined(__amd64) || \
27 defined(_M_X64)
28#define BOOST_LOG_AUX_X86_64
29#endif
30
31namespace boost {
32
33BOOST_LOG_OPEN_NAMESPACE
34
35namespace aux {
36
37extern const char g_hex_char_table[2][16];
38
39template< typename CharT >
40extern void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm);
41
42BOOST_LOG_ANONYMOUS_NAMESPACE {
43
//! Sizing of the work done between two consecutive writes to the output stream
enum
{
    //! Number of 32-byte input packs converted into the temporary buffer before it is written out
    packs_per_stride = 32,
    //! Number of input bytes consumed per full buffer (32 bytes in each pack)
    stride = 32 * packs_per_stride
};
49
50template< typename CharT >
51BOOST_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf)
52{
53 switch (sizeof(CharT))
54 {
55 case 1:
56 _mm_store_si128(p: reinterpret_cast< __m128i* >(buf), b: mm_chars);
57 break;
58
59 case 2:
60 _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf), a: _mm256_cvtepu8_epi16(V: mm_chars));
61 break;
62
63 case 4:
64 {
65 __m128i mm = _mm_unpackhi_epi64(a: mm_chars, b: mm_chars);
66 _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf), a: _mm256_cvtepu8_epi32(V: mm_chars));
67 _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf) + 1, a: _mm256_cvtepu8_epi32(V: mm));
68 }
69 break;
70 }
71}
72
73template< typename CharT >
74BOOST_FORCEINLINE void store_characters_x3(__m256i mm_chars1, __m256i mm_chars2, __m256i mm_chars3, CharT* buf)
75{
76 store_characters(_mm256_castsi256_si128(a: mm_chars1), buf);
77 store_characters(_mm256_castsi256_si128(a: mm_chars2), buf + 16);
78 store_characters(_mm256_castsi256_si128(a: mm_chars3), buf + 32);
79 store_characters(_mm256_extracti128_si256(mm_chars1, 1), buf + 48);
80 store_characters(_mm256_extracti128_si256(mm_chars2, 1), buf + 64);
81 store_characters(_mm256_extracti128_si256(mm_chars3, 1), buf + 80);
82}
83
84union ymm_constant
85{
86 uint8_t as_bytes[32];
87 __m256i as_mm;
88
89 BOOST_FORCEINLINE operator __m256i () const { return as_mm; }
90 BOOST_FORCEINLINE operator __m128i () const { return _mm256_castsi256_si128(a: as_mm); }
91};
92
93static const ymm_constant mm_shuffle_pattern1 = {.as_bytes: { 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }};
94static const ymm_constant mm_shuffle_pattern2 = {.as_bytes: { 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }};
95static const ymm_constant mm_shuffle_pattern3 = {.as_bytes: { 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
96static const ymm_constant mm_shuffle_pattern13 = {.as_bytes: { 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }};
97
98#if defined(BOOST_LOG_AUX_X86_64)
99
100// x86-64 architecture has more registers which we can utilize to pass constants
101#define BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL __m256i mm_15, __m256i mm_char_space,
102#define BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_15, mm_char_space,
103#define BOOST_LOG_AUX_MM256_CONSTANTS \
104 const __m256i mm_15 = _mm256_set1_epi32(0x0F0F0F0F);\
105 const __m256i mm_char_space = _mm256_set1_epi32(0x20202020);
106#define BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL __m128i mm_15, __m128i mm_char_space,
107#define BOOST_LOG_AUX_MM128_CONSTANT_ARGS mm_15, mm_char_space,
108#define BOOST_LOG_AUX_MM128_CONSTANTS \
109 const __m128i mm_15 = _mm_set1_epi32(0x0F0F0F0F);\
110 const __m128i mm_char_space = _mm_set1_epi32(0x20202020);
111
112#else
113
114// MSVC in 32-bit mode is not able to pass all constants to dump_pack, and is also not able to align them on the stack, so we have to fetch them from global constants
115static const ymm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }};
116static const ymm_constant mm_char_space = {{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }};
117#define BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL
118#define BOOST_LOG_AUX_MM256_CONSTANT_ARGS
119#define BOOST_LOG_AUX_MM256_CONSTANTS
120#define BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL
121#define BOOST_LOG_AUX_MM128_CONSTANT_ARGS
122#define BOOST_LOG_AUX_MM128_CONSTANTS
123
124#endif
125
126/*!
127 * \brief Dumps a pack of input data into a string of 8 bit ASCII characters.
128 *
129 * The composed string is placed as follows (in Intel notation): mm_output1[127:0], mm_output2[127:0], mm_output3[127:0], mm_output1[255:128], mm_output2[255:128], mm_output3[255:128].
130 */
131static BOOST_FORCEINLINE void dump_pack
132(
133 BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL
134 __m256i mm_char_table, __m256i mm_input,
135 __m256i& mm_output1, __m256i& mm_output2, __m256i& mm_output3
136)
137{
138 // Split half-bytes
139 __m256i mm_input_hi = _mm256_and_si256(a: _mm256_srli_epi16(a: mm_input, count: 4), b: mm_15);
140 __m256i mm_input_lo = _mm256_and_si256(a: mm_input, b: mm_15);
141
142 // Stringize each of the halves
143 mm_input_hi = _mm256_shuffle_epi8(a: mm_char_table, b: mm_input_hi);
144 mm_input_lo = _mm256_shuffle_epi8(a: mm_char_table, b: mm_input_lo);
145
146 // Join them back together
147 __m256i mm_1 = _mm256_unpacklo_epi8(a: mm_input_hi, b: mm_input_lo);
148 __m256i mm_2 = _mm256_unpackhi_epi8(a: mm_input_hi, b: mm_input_lo);
149
150 // Insert spaces between stringized bytes:
151 // |0123456789abcdef|0123456789abcdef|
152 // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
153 __m256i mm_out1 = _mm256_shuffle_epi8(a: mm_1, b: mm_shuffle_pattern1.as_mm);
154 __m256i mm_out3 = _mm256_shuffle_epi8(a: mm_2, b: mm_shuffle_pattern3.as_mm);
155 __m256i mm_out2 = _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2, mm_1, 10), b: mm_shuffle_pattern2.as_mm);
156
157 mm_output1 = _mm256_max_epu8(a: mm_out1, b: mm_char_space);
158 mm_output2 = _mm256_max_epu8(a: mm_out2, b: mm_char_space);
159 mm_output3 = _mm256_max_epu8(a: mm_out3, b: mm_char_space);
160}
161
162//! Dumps a pack of input data into a string of 8 bit ASCII characters
163static BOOST_FORCEINLINE void dump_pack
164(
165 BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL
166 __m128i mm_char_table, __m128i mm_input,
167 __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3
168)
169{
170 // Split half-bytes
171 __m128i mm_input_hi = _mm_and_si128(a: _mm_srli_epi16(a: mm_input, count: 4), b: mm_15);
172 __m128i mm_input_lo = _mm_and_si128(a: mm_input, b: mm_15);
173
174 // Stringize each of the halves
175 mm_input_hi = _mm_shuffle_epi8(a: mm_char_table, b: mm_input_hi);
176 mm_input_lo = _mm_shuffle_epi8(a: mm_char_table, b: mm_input_lo);
177
178 // Join them back together
179 __m128i mm_1 = _mm_unpacklo_epi8(a: mm_input_hi, b: mm_input_lo);
180 __m128i mm_2 = _mm_unpackhi_epi8(a: mm_input_hi, b: mm_input_lo);
181
182 // Insert spaces between stringized bytes:
183 // |0123456789abcdef|0123456789abcdef|
184 // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef|
185 mm_output1 = _mm_shuffle_epi8(a: mm_1, b: mm_shuffle_pattern1);
186 mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), b: mm_shuffle_pattern2);
187 mm_output3 = _mm_shuffle_epi8(a: mm_2, b: mm_shuffle_pattern3);
188
189 mm_output1 = _mm_max_epu8(a: mm_output1, b: mm_char_space);
190 mm_output2 = _mm_max_epu8(a: mm_output2, b: mm_char_space);
191 mm_output3 = _mm_max_epu8(a: mm_output3, b: mm_char_space);
192}
193
194template< typename CharT >
195BOOST_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, std::basic_ostream< CharT >& strm)
196{
197 typedef CharT char_type;
198
199 char_type buf_storage[stride * 3u + 32u];
200 // Align the temporary buffer at 32 bytes
201 char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (32u - (((uintptr_t)(char_type*)buf_storage) & 31u)));
202 char_type* buf_begin = buf + 1u; // skip the first space of the first chunk
203 char_type* buf_end = buf + stride * 3u;
204
205 const char* const char_table = g_hex_char_table[(strm.flags() & std::ios_base::uppercase) != 0];
206#if defined(__GNUC__) && ((defined(BOOST_GCC) && BOOST_GCC < 40900) || (defined(BOOST_CLANG) && BOOST_CLANG_VERSION < 40000))
207 // gcc 4.7 is missing _mm256_broadcastsi128_si256 declaration in immintrin.h.
208 // gcc 4.8 generates vmovdqu+vinserti128 instead of a single vbroadcasti128.
209 // clang up until 4.0 generates vmovdqu+vinserti128 or worse.
210 __m256i mm_char_table;
211 __asm__("vbroadcasti128 %1, %0" : "=x" (mm_char_table) : "m" (*reinterpret_cast< const __m128i* >(char_table)));
212#else
213 const __m256i mm_char_table = _mm256_broadcastsi128_si256(X: _mm_loadu_si128(p: reinterpret_cast< const __m128i* >(char_table)));
214#endif
215
216 // First, check the input alignment. Also, if we can dump the whole data in one go, do it right away. It turns out to be faster than splitting
217 // the work between prealign and tail part. It is also a fairly common case since on most platforms memory is not aligned to 32 bytes (i.e. prealign is often needed).
218 const uint8_t* p = static_cast< const uint8_t* >(data);
219 const std::size_t prealign_size = size == 32u ? static_cast< std::size_t >(32u) : static_cast< std::size_t >((32u - ((uintptr_t)p & 31u)) & 31u);
220 if (prealign_size)
221 {
222 __m256i mm_input = _mm256_loadu_si256(p: reinterpret_cast< const __m256i* >(p));
223 BOOST_LOG_AUX_MM256_CONSTANTS
224
225 __m256i mm_output1, mm_output2, mm_output3;
226 dump_pack(BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_char_table, mm_input, mm_output1, mm_output2, mm_output3);
227
228 store_characters_x3(mm_output1, mm_output2, mm_output3, buf);
229
230 _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call
231 strm.write(buf_begin, prealign_size * 3u - 1u);
232
233 buf_begin = buf;
234 size -= prealign_size;
235 p += prealign_size;
236 }
237
238 const std::size_t stride_count = size / stride;
239 std::size_t tail_size = size % stride;
240 for (std::size_t i = 0; i < stride_count; ++i)
241 {
242 char_type* b = buf;
243 BOOST_LOG_AUX_MM256_CONSTANTS
244
245 for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 32u, p += 32u)
246 {
247 __m256i mm_input = _mm256_load_si256(p: reinterpret_cast< const __m256i* >(p));
248 __m256i mm_output1, mm_output2, mm_output3;
249 dump_pack(BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_char_table, mm_input, mm_output1, mm_output2, mm_output3);
250
251 store_characters_x3(mm_output1, mm_output2, mm_output3, b);
252 }
253
254 _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call
255 strm.write(buf_begin, buf_end - buf_begin);
256 buf_begin = buf;
257 }
258
259 if (BOOST_UNLIKELY(tail_size > 0))
260 {
261 char_type* b = buf;
262 while (tail_size >= 16u)
263 {
264 __m128i mm_input = _mm_load_si128(p: reinterpret_cast< const __m128i* >(p));
265 BOOST_LOG_AUX_MM128_CONSTANTS
266
267 __m128i mm_output1, mm_output2, mm_output3;
268 dump_pack(BOOST_LOG_AUX_MM128_CONSTANT_ARGS mm_char_table: _mm256_castsi256_si128(a: mm_char_table), mm_input, mm_output1, mm_output2, mm_output3);
269
270 store_characters(mm_output1, b);
271 store_characters(mm_output2, b + 16u);
272 store_characters(mm_output3, b + 32u);
273
274 b += 3u * 16u;
275 p += 16u;
276 tail_size -= 16u;
277 }
278
279 _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call
280 for (unsigned int i = 0; i < tail_size; ++i, ++p, b += 3u)
281 {
282 uint32_t n = *p;
283 b[0] = static_cast< char_type >(' ');
284 b[1] = static_cast< char_type >(char_table[n >> 4]);
285 b[2] = static_cast< char_type >(char_table[n & 0x0F]);
286 }
287
288 strm.write(buf_begin, b - buf_begin);
289 }
290}
291
// The helper macros are only needed by dump_pack and dump_data_avx2; undefine all six of them so that none leaks further
#undef BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL
#undef BOOST_LOG_AUX_MM256_CONSTANT_ARGS
#undef BOOST_LOG_AUX_MM256_CONSTANTS
#undef BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL
#undef BOOST_LOG_AUX_MM128_CONSTANT_ARGS
#undef BOOST_LOG_AUX_MM128_CONSTANTS
297
298} // namespace
299
300void dump_data_char_avx2(const void* data, std::size_t size, std::basic_ostream< char >& strm)
301{
302 if (size >= 32)
303 {
304 dump_data_avx2(data, size, strm);
305 }
306 else
307 {
308 dump_data_generic(data, size, strm);
309 }
310}
311
312void dump_data_wchar_avx2(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm)
313{
314 if (size >= 32)
315 {
316 dump_data_avx2(data, size, strm);
317 }
318 else
319 {
320 dump_data_generic(data, size, strm);
321 }
322}
323
324#if !defined(BOOST_NO_CXX11_CHAR16_T)
325void dump_data_char16_avx2(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm)
326{
327 if (size >= 32)
328 {
329 dump_data_avx2(data, size, strm);
330 }
331 else
332 {
333 dump_data_generic(data, size, strm);
334 }
335}
336#endif
337
338#if !defined(BOOST_NO_CXX11_CHAR32_T)
339void dump_data_char32_avx2(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm)
340{
341 if (size >= 32)
342 {
343 dump_data_avx2(data, size, strm);
344 }
345 else
346 {
347 dump_data_generic(data, size, strm);
348 }
349}
350#endif
351
352} // namespace aux
353
354BOOST_LOG_CLOSE_NAMESPACE // namespace log
355
356} // namespace boost
357
358#include <boost/log/detail/footer.hpp>
359

// Source: boost/libs/log/src/dump_avx2.cpp