1 | /* |
2 | * Copyright Andrey Semashev 2007 - 2021. |
3 | * Distributed under the Boost Software License, Version 1.0. |
4 | * (See accompanying file LICENSE_1_0.txt or copy at |
5 | * http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | /*! |
8 | * \file dump_avx2.cpp |
9 | * \author Andrey Semashev |
10 | * \date 05.05.2013 |
11 | * |
 * \brief  This source file is part of the Boost.Log library implementation, see the library documentation
13 | * at http://www.boost.org/doc/libs/release/libs/log/doc/html/index.html. |
14 | */ |
15 | |
16 | // NOTE: You should generally avoid including headers as much as possible here, because this file |
17 | // is compiled with special compiler options, and any included header may result in generation of |
18 | // unintended code with these options and violation of ODR. |
19 | #include <boost/log/detail/config.hpp> |
20 | #include <ostream> |
21 | #include <immintrin.h> |
22 | #include <boost/cstdint.hpp> |
23 | #include <boost/log/detail/header.hpp> |
24 | |
// Detect 64-bit x86 targets by any of the compiler-specific architecture macros.
// The result selects how SIMD constants are handed to dump_pack further below.
#if defined(_M_X64) || \
    defined(__amd64) || defined(__amd64__) || \
    defined(__x86_64__) || defined(__x86_64)
#define BOOST_LOG_AUX_X86_64
#endif
30 | |
31 | namespace boost { |
32 | |
33 | BOOST_LOG_OPEN_NAMESPACE |
34 | |
35 | namespace aux { |
36 | |
//! Two tables of 16 hex digit characters each, defined elsewhere. The row is selected by the std::ios_base::uppercase flag of the stream.
extern const char g_hex_char_table[2][16];

//! Generic dump routine, defined elsewhere. Used by the entry points in this file for inputs shorter than 32 bytes.
template< typename CharT >
void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm);
41 | |
42 | BOOST_LOG_ANONYMOUS_NAMESPACE { |
43 | |
//! Chunking parameters of the bulk conversion loop
enum
{
    //! Number of 32-byte packs converted between two consecutive stream writes
    packs_per_stride = 32,
    //! Number of input bytes converted between two consecutive stream writes (32 bytes per pack)
    stride = 32 * packs_per_stride
};
49 | |
50 | template< typename CharT > |
51 | BOOST_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf) |
52 | { |
53 | switch (sizeof(CharT)) |
54 | { |
55 | case 1: |
56 | _mm_store_si128(p: reinterpret_cast< __m128i* >(buf), b: mm_chars); |
57 | break; |
58 | |
59 | case 2: |
60 | _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf), a: _mm256_cvtepu8_epi16(V: mm_chars)); |
61 | break; |
62 | |
63 | case 4: |
64 | { |
65 | __m128i mm = _mm_unpackhi_epi64(a: mm_chars, b: mm_chars); |
66 | _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf), a: _mm256_cvtepu8_epi32(V: mm_chars)); |
67 | _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf) + 1, a: _mm256_cvtepu8_epi32(V: mm)); |
68 | } |
69 | break; |
70 | } |
71 | } |
72 | |
73 | template< typename CharT > |
74 | BOOST_FORCEINLINE void store_characters_x3(__m256i mm_chars1, __m256i mm_chars2, __m256i mm_chars3, CharT* buf) |
75 | { |
76 | store_characters(_mm256_castsi256_si128(a: mm_chars1), buf); |
77 | store_characters(_mm256_castsi256_si128(a: mm_chars2), buf + 16); |
78 | store_characters(_mm256_castsi256_si128(a: mm_chars3), buf + 32); |
79 | store_characters(_mm256_extracti128_si256(mm_chars1, 1), buf + 48); |
80 | store_characters(_mm256_extracti128_si256(mm_chars2, 1), buf + 64); |
81 | store_characters(_mm256_extracti128_si256(mm_chars3, 1), buf + 80); |
82 | } |
83 | |
84 | union ymm_constant |
85 | { |
86 | uint8_t as_bytes[32]; |
87 | __m256i as_mm; |
88 | |
89 | BOOST_FORCEINLINE operator __m256i () const { return as_mm; } |
90 | BOOST_FORCEINLINE operator __m128i () const { return _mm256_castsi256_si128(a: as_mm); } |
91 | }; |
92 | |
93 | static const ymm_constant mm_shuffle_pattern1 = {.as_bytes: { 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }}; |
94 | static const ymm_constant mm_shuffle_pattern2 = {.as_bytes: { 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }}; |
95 | static const ymm_constant mm_shuffle_pattern3 = {.as_bytes: { 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; |
96 | static const ymm_constant mm_shuffle_pattern13 = {.as_bytes: { 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; |
97 | |
98 | #if defined(BOOST_LOG_AUX_X86_64) |
99 | |
100 | // x86-64 architecture has more registers which we can utilize to pass constants |
101 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL __m256i mm_15, __m256i mm_char_space, |
102 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_15, mm_char_space, |
103 | #define BOOST_LOG_AUX_MM256_CONSTANTS \ |
104 | const __m256i mm_15 = _mm256_set1_epi32(0x0F0F0F0F);\ |
105 | const __m256i mm_char_space = _mm256_set1_epi32(0x20202020); |
106 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL __m128i mm_15, __m128i mm_char_space, |
107 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS mm_15, mm_char_space, |
108 | #define BOOST_LOG_AUX_MM128_CONSTANTS \ |
109 | const __m128i mm_15 = _mm_set1_epi32(0x0F0F0F0F);\ |
110 | const __m128i mm_char_space = _mm_set1_epi32(0x20202020); |
111 | |
112 | #else |
113 | |
114 | // MSVC in 32-bit mode is not able to pass all constants to dump_pack, and is also not able to align them on the stack, so we have to fetch them from global constants |
115 | static const ymm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }}; |
116 | static const ymm_constant mm_char_space = {{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }}; |
117 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL |
118 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS |
119 | #define BOOST_LOG_AUX_MM256_CONSTANTS |
120 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL |
121 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS |
122 | #define BOOST_LOG_AUX_MM128_CONSTANTS |
123 | |
124 | #endif |
125 | |
126 | /*! |
127 | * \brief Dumps a pack of input data into a string of 8 bit ASCII characters. |
128 | * |
129 | * The composed string is placed as follows (in Intel notation): mm_output1[127:0], mm_output2[127:0], mm_output3[127:0], mm_output1[255:128], mm_output2[255:128], mm_output3[255:128]. |
130 | */ |
131 | static BOOST_FORCEINLINE void dump_pack |
132 | ( |
133 | BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL |
134 | __m256i mm_char_table, __m256i mm_input, |
135 | __m256i& mm_output1, __m256i& mm_output2, __m256i& mm_output3 |
136 | ) |
137 | { |
138 | // Split half-bytes |
139 | __m256i mm_input_hi = _mm256_and_si256(a: _mm256_srli_epi16(a: mm_input, count: 4), b: mm_15); |
140 | __m256i mm_input_lo = _mm256_and_si256(a: mm_input, b: mm_15); |
141 | |
142 | // Stringize each of the halves |
143 | mm_input_hi = _mm256_shuffle_epi8(a: mm_char_table, b: mm_input_hi); |
144 | mm_input_lo = _mm256_shuffle_epi8(a: mm_char_table, b: mm_input_lo); |
145 | |
146 | // Join them back together |
147 | __m256i mm_1 = _mm256_unpacklo_epi8(a: mm_input_hi, b: mm_input_lo); |
148 | __m256i mm_2 = _mm256_unpackhi_epi8(a: mm_input_hi, b: mm_input_lo); |
149 | |
150 | // Insert spaces between stringized bytes: |
151 | // |0123456789abcdef|0123456789abcdef| |
152 | // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef| |
153 | __m256i mm_out1 = _mm256_shuffle_epi8(a: mm_1, b: mm_shuffle_pattern1.as_mm); |
154 | __m256i mm_out3 = _mm256_shuffle_epi8(a: mm_2, b: mm_shuffle_pattern3.as_mm); |
155 | __m256i mm_out2 = _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2, mm_1, 10), b: mm_shuffle_pattern2.as_mm); |
156 | |
157 | mm_output1 = _mm256_max_epu8(a: mm_out1, b: mm_char_space); |
158 | mm_output2 = _mm256_max_epu8(a: mm_out2, b: mm_char_space); |
159 | mm_output3 = _mm256_max_epu8(a: mm_out3, b: mm_char_space); |
160 | } |
161 | |
162 | //! Dumps a pack of input data into a string of 8 bit ASCII characters |
163 | static BOOST_FORCEINLINE void dump_pack |
164 | ( |
165 | BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL |
166 | __m128i mm_char_table, __m128i mm_input, |
167 | __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3 |
168 | ) |
169 | { |
170 | // Split half-bytes |
171 | __m128i mm_input_hi = _mm_and_si128(a: _mm_srli_epi16(a: mm_input, count: 4), b: mm_15); |
172 | __m128i mm_input_lo = _mm_and_si128(a: mm_input, b: mm_15); |
173 | |
174 | // Stringize each of the halves |
175 | mm_input_hi = _mm_shuffle_epi8(a: mm_char_table, b: mm_input_hi); |
176 | mm_input_lo = _mm_shuffle_epi8(a: mm_char_table, b: mm_input_lo); |
177 | |
178 | // Join them back together |
179 | __m128i mm_1 = _mm_unpacklo_epi8(a: mm_input_hi, b: mm_input_lo); |
180 | __m128i mm_2 = _mm_unpackhi_epi8(a: mm_input_hi, b: mm_input_lo); |
181 | |
182 | // Insert spaces between stringized bytes: |
183 | // |0123456789abcdef|0123456789abcdef| |
184 | // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef| |
185 | mm_output1 = _mm_shuffle_epi8(a: mm_1, b: mm_shuffle_pattern1); |
186 | mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), b: mm_shuffle_pattern2); |
187 | mm_output3 = _mm_shuffle_epi8(a: mm_2, b: mm_shuffle_pattern3); |
188 | |
189 | mm_output1 = _mm_max_epu8(a: mm_output1, b: mm_char_space); |
190 | mm_output2 = _mm_max_epu8(a: mm_output2, b: mm_char_space); |
191 | mm_output3 = _mm_max_epu8(a: mm_output3, b: mm_char_space); |
192 | } |
193 | |
194 | template< typename CharT > |
195 | BOOST_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, std::basic_ostream< CharT >& strm) |
196 | { |
197 | typedef CharT char_type; |
198 | |
199 | char_type buf_storage[stride * 3u + 32u]; |
200 | // Align the temporary buffer at 32 bytes |
201 | char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (32u - (((uintptr_t)(char_type*)buf_storage) & 31u))); |
202 | char_type* buf_begin = buf + 1u; // skip the first space of the first chunk |
203 | char_type* buf_end = buf + stride * 3u; |
204 | |
205 | const char* const char_table = g_hex_char_table[(strm.flags() & std::ios_base::uppercase) != 0]; |
206 | #if defined(__GNUC__) && ((defined(BOOST_GCC) && BOOST_GCC < 40900) || (defined(BOOST_CLANG) && BOOST_CLANG_VERSION < 40000)) |
207 | // gcc 4.7 is missing _mm256_broadcastsi128_si256 declaration in immintrin.h. |
208 | // gcc 4.8 generates vmovdqu+vinserti128 instead of a single vbroadcasti128. |
209 | // clang up until 4.0 generates vmovdqu+vinserti128 or worse. |
210 | __m256i mm_char_table; |
211 | __asm__("vbroadcasti128 %1, %0" : "=x" (mm_char_table) : "m" (*reinterpret_cast< const __m128i* >(char_table))); |
212 | #else |
213 | const __m256i mm_char_table = _mm256_broadcastsi128_si256(X: _mm_loadu_si128(p: reinterpret_cast< const __m128i* >(char_table))); |
214 | #endif |
215 | |
216 | // First, check the input alignment. Also, if we can dump the whole data in one go, do it right away. It turns out to be faster than splitting |
217 | // the work between prealign and tail part. It is also a fairly common case since on most platforms memory is not aligned to 32 bytes (i.e. prealign is often needed). |
218 | const uint8_t* p = static_cast< const uint8_t* >(data); |
219 | const std::size_t prealign_size = size == 32u ? static_cast< std::size_t >(32u) : static_cast< std::size_t >((32u - ((uintptr_t)p & 31u)) & 31u); |
220 | if (prealign_size) |
221 | { |
222 | __m256i mm_input = _mm256_loadu_si256(p: reinterpret_cast< const __m256i* >(p)); |
223 | BOOST_LOG_AUX_MM256_CONSTANTS |
224 | |
225 | __m256i mm_output1, mm_output2, mm_output3; |
226 | dump_pack(BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_char_table, mm_input, mm_output1, mm_output2, mm_output3); |
227 | |
228 | store_characters_x3(mm_output1, mm_output2, mm_output3, buf); |
229 | |
230 | _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call |
231 | strm.write(buf_begin, prealign_size * 3u - 1u); |
232 | |
233 | buf_begin = buf; |
234 | size -= prealign_size; |
235 | p += prealign_size; |
236 | } |
237 | |
238 | const std::size_t stride_count = size / stride; |
239 | std::size_t tail_size = size % stride; |
240 | for (std::size_t i = 0; i < stride_count; ++i) |
241 | { |
242 | char_type* b = buf; |
243 | BOOST_LOG_AUX_MM256_CONSTANTS |
244 | |
245 | for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 32u, p += 32u) |
246 | { |
247 | __m256i mm_input = _mm256_load_si256(p: reinterpret_cast< const __m256i* >(p)); |
248 | __m256i mm_output1, mm_output2, mm_output3; |
249 | dump_pack(BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_char_table, mm_input, mm_output1, mm_output2, mm_output3); |
250 | |
251 | store_characters_x3(mm_output1, mm_output2, mm_output3, b); |
252 | } |
253 | |
254 | _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call |
255 | strm.write(buf_begin, buf_end - buf_begin); |
256 | buf_begin = buf; |
257 | } |
258 | |
259 | if (BOOST_UNLIKELY(tail_size > 0)) |
260 | { |
261 | char_type* b = buf; |
262 | while (tail_size >= 16u) |
263 | { |
264 | __m128i mm_input = _mm_load_si128(p: reinterpret_cast< const __m128i* >(p)); |
265 | BOOST_LOG_AUX_MM128_CONSTANTS |
266 | |
267 | __m128i mm_output1, mm_output2, mm_output3; |
268 | dump_pack(BOOST_LOG_AUX_MM128_CONSTANT_ARGS mm_char_table: _mm256_castsi256_si128(a: mm_char_table), mm_input, mm_output1, mm_output2, mm_output3); |
269 | |
270 | store_characters(mm_output1, b); |
271 | store_characters(mm_output2, b + 16u); |
272 | store_characters(mm_output3, b + 32u); |
273 | |
274 | b += 3u * 16u; |
275 | p += 16u; |
276 | tail_size -= 16u; |
277 | } |
278 | |
279 | _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call |
280 | for (unsigned int i = 0; i < tail_size; ++i, ++p, b += 3u) |
281 | { |
282 | uint32_t n = *p; |
283 | b[0] = static_cast< char_type >(' '); |
284 | b[1] = static_cast< char_type >(char_table[n >> 4]); |
285 | b[2] = static_cast< char_type >(char_table[n & 0x0F]); |
286 | } |
287 | |
288 | strm.write(buf_begin, b - buf_begin); |
289 | } |
290 | } |
291 | |
// The constant-passing helper macros are only meant for the implementation above, do not let them leak further
#undef BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL
#undef BOOST_LOG_AUX_MM256_CONSTANT_ARGS
#undef BOOST_LOG_AUX_MM256_CONSTANTS
#undef BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL
#undef BOOST_LOG_AUX_MM128_CONSTANT_ARGS
#undef BOOST_LOG_AUX_MM128_CONSTANTS
297 | |
298 | } // namespace |
299 | |
300 | void dump_data_char_avx2(const void* data, std::size_t size, std::basic_ostream< char >& strm) |
301 | { |
302 | if (size >= 32) |
303 | { |
304 | dump_data_avx2(data, size, strm); |
305 | } |
306 | else |
307 | { |
308 | dump_data_generic(data, size, strm); |
309 | } |
310 | } |
311 | |
312 | void dump_data_wchar_avx2(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm) |
313 | { |
314 | if (size >= 32) |
315 | { |
316 | dump_data_avx2(data, size, strm); |
317 | } |
318 | else |
319 | { |
320 | dump_data_generic(data, size, strm); |
321 | } |
322 | } |
323 | |
324 | #if !defined(BOOST_NO_CXX11_CHAR16_T) |
325 | void dump_data_char16_avx2(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm) |
326 | { |
327 | if (size >= 32) |
328 | { |
329 | dump_data_avx2(data, size, strm); |
330 | } |
331 | else |
332 | { |
333 | dump_data_generic(data, size, strm); |
334 | } |
335 | } |
336 | #endif |
337 | |
338 | #if !defined(BOOST_NO_CXX11_CHAR32_T) |
339 | void dump_data_char32_avx2(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm) |
340 | { |
341 | if (size >= 32) |
342 | { |
343 | dump_data_avx2(data, size, strm); |
344 | } |
345 | else |
346 | { |
347 | dump_data_generic(data, size, strm); |
348 | } |
349 | } |
350 | #endif |
351 | |
352 | } // namespace aux |
353 | |
354 | BOOST_LOG_CLOSE_NAMESPACE // namespace log |
355 | |
356 | } // namespace boost |
357 | |
358 | #include <boost/log/detail/footer.hpp> |
359 | |