1 | /* |
2 | * Copyright Andrey Semashev 2013. |
3 | * Distributed under the Boost Software License, Version 1.0. |
4 | * (See accompanying file LICENSE_1_0.txt or copy at |
5 | * http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | /*! |
8 | * \file uuid/detail/uuid_x86.hpp |
9 | * |
10 | * \brief This header contains optimized SSE implementation of \c boost::uuid operations. |
11 | */ |
12 | |
13 | #ifndef BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ |
14 | #define BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ |
15 | |
16 | // MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set |
17 | #if defined(BOOST_UUID_USE_SSE41) |
18 | #include <smmintrin.h> |
19 | #elif defined(BOOST_UUID_USE_SSE3) |
20 | #include <pmmintrin.h> |
21 | #else |
22 | #include <emmintrin.h> |
23 | #endif |
24 | |
25 | #if defined(BOOST_MSVC) && defined(_M_X64) && !defined(BOOST_UUID_USE_SSE3) && (BOOST_MSVC < 1900 /* Fixed in Visual Studio 2015 */ ) |
26 | // At least MSVC 9 (VS2008) and 12 (VS2013) have an optimizer bug that sometimes results in incorrect SIMD code |
27 | // generated in Release x64 mode. In particular, it affects operator==, where the compiler sometimes generates |
// pcmpeqd with a memory operand instead of movdqu followed by pcmpeqd. The problem is that uuid can be
29 | // not aligned to 16 bytes and pcmpeqd causes alignment violation in this case. We cannot be sure that other |
30 | // MSVC versions are not affected so we apply the workaround for all versions, except VS2015 on up where |
31 | // the bug has been fixed. |
32 | // |
33 | // https://svn.boost.org/trac/boost/ticket/8509#comment:3 |
34 | // https://connect.microsoft.com/VisualStudio/feedbackdetail/view/981648#tabs |
35 | #define BOOST_UUID_DETAIL_MSVC_BUG981648 |
36 | #if BOOST_MSVC >= 1600 |
37 | extern "C" void _ReadWriteBarrier(void); |
38 | #pragma intrinsic(_ReadWriteBarrier) |
39 | #endif |
40 | #endif |
41 | |
42 | namespace boost { |
43 | namespace uuids { |
44 | namespace detail { |
45 | |
//! Loads a 16-byte block from memory that may not be 16-byte aligned into an XMM register.
//! Selects the load strategy at compile time based on the available instruction set and
//! the MSVC optimizer bug workaround (BOOST_UUID_DETAIL_MSVC_BUG981648) detected above.
BOOST_FORCEINLINE __m128i load_unaligned_si128(const uint8_t* p) BOOST_NOEXCEPT
{
#if defined(BOOST_UUID_USE_SSE3)
    // SSE3 lddqu handles unaligned loads and is never worse than movdqu
    return _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
#elif !defined(BOOST_UUID_DETAIL_MSVC_BUG981648)
    // Plain unaligned load; safe on compilers without the MSVC codegen bug
    return _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
#elif BOOST_MSVC >= 1600
    __m128i mm = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
    // Make sure this load doesn't get merged with the subsequent instructions
    _ReadWriteBarrier();
    return mm;
#else
    // VS2008 x64 doesn't respect _ReadWriteBarrier above, so we have to generate this crippled code to load unaligned data
    return _mm_unpacklo_epi64(_mm_loadl_epi64(reinterpret_cast< const __m128i* >(p)), _mm_loadl_epi64(reinterpret_cast< const __m128i* >(p + 8)));
#endif
}
62 | |
63 | } // namespace detail |
64 | |
65 | inline bool uuid::is_nil() const BOOST_NOEXCEPT |
66 | { |
67 | __m128i mm = uuids::detail::load_unaligned_si128(data); |
68 | #if defined(BOOST_UUID_USE_SSE41) |
69 | return _mm_test_all_zeros(mm, mm) != 0; |
70 | #else |
71 | mm = _mm_cmpeq_epi8(mm, _mm_setzero_si128()); |
72 | return _mm_movemask_epi8(mm) == 0xFFFF; |
73 | #endif |
74 | } |
75 | |
76 | inline void uuid::swap(uuid& rhs) BOOST_NOEXCEPT |
77 | { |
78 | __m128i mm_this = uuids::detail::load_unaligned_si128(data); |
79 | __m128i mm_rhs = uuids::detail::load_unaligned_si128(rhs.data); |
80 | _mm_storeu_si128(reinterpret_cast< __m128i* >(rhs.data), mm_this); |
81 | _mm_storeu_si128(reinterpret_cast< __m128i* >(data), mm_rhs); |
82 | } |
83 | |
84 | inline bool operator== (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT |
85 | { |
86 | __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data); |
87 | __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data); |
88 | |
89 | __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right); |
90 | |
91 | #if defined(BOOST_UUID_USE_SSE41) |
92 | return _mm_test_all_ones(mm_cmp) != 0; |
93 | #else |
94 | return _mm_movemask_epi8(mm_cmp) == 0xFFFF; |
95 | #endif |
96 | } |
97 | |
//! Lexicographical (byte-wise, unsigned) less-than comparison of two uuids
inline bool operator< (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
{
    __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
    __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);

    // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and reverse one.
    // Then we know which bytes are equivalent and which ones are different, and for those different the comparison results
    // will be opposite. Then we'll be able to find the first differing comparison result (for both forward and reverse ways),
    // and depending on which way it is for, this will be the result of the operation. There are a few notes to consider:
    //
    // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers,
    // so the comparison results in the least significant bits will actually be the most significant for the final operation result.
    // This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with
    // the "(x - 1) ^ x" trick.
    // 2. Because there is only signed comparison in SSE/AVX, we have to invert byte comparison results whenever signs of the corresponding
    // bytes are different. I.e. in signed comparison it's -1 < 1, but in unsigned it is the opposite (255 > 1). To do that we XOR left and right,
    // making the most significant bit of each byte 1 if the signs are different, and later apply this mask with another XOR to the comparison results.
    // 3. pcmpgtb compares for "greater" relation, so we swap the arguments to get what we need.

    // Bytes whose signs differ get their most significant bit set in this mask (see note 2)
    const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right);

    // Forward (right > left) and reverse (left > right) signed per-byte comparisons (see note 3)
    __m128i mm_cmp = _mm_cmpgt_epi8(mm_right, mm_left), mm_rcmp = _mm_cmpgt_epi8(mm_left, mm_right);

    // Correct the signed results to unsigned ordering by flipping bytes with differing signs
    mm_cmp = _mm_xor_si128(mm_signs_mask, mm_cmp);
    mm_rcmp = _mm_xor_si128(mm_signs_mask, mm_rcmp);

    // Collect the most significant bit of every byte into 16-bit masks
    uint32_t cmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_cmp)), rcmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_rcmp));

    // Isolate the lowest set bit (i.e. the first differing byte, see note 1);
    // "(x - 1) ^ x" yields all-ones below and including the lowest set bit
    cmp = (cmp - 1u) ^ cmp;
    rcmp = (rcmp - 1u) ^ rcmp;

    // The comparison whose first difference occurs earlier produces the smaller isolated-bit value
    return cmp < rcmp;
}
131 | |
132 | } // namespace uuids |
133 | } // namespace boost |
134 | |
135 | #endif // BOOST_UUID_DETAIL_UUID_X86_HPP_INCLUDED_ |
136 | |