Warning: This file is not a C or C++ file. It does not have highlighting.
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 | |
10 | #if !defined X86GPRINTRIN_H_ |
11 | #error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." |
12 | #endif |
13 | |
14 | #ifndef BMI2INTRIN_H_ |
15 | #define BMI2INTRIN_H_ |
16 | |
/* Zero the bits of __X from bit position __Y upward (x86 BZHI, 32-bit).

   Only the low 8 bits of __Y are used as the index.  An index of 32 or
   more returns __X unchanged; an index of 0 returns 0.  The result is
   formed with a mask rather than a shift-left/shift-right pair so that no
   shift count ever reaches the operand width, which would be undefined
   behavior in C.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  const unsigned int __n = __Y & 0xFFU;

  if (__n >= 32U) {
    return __X;
  }
  /* __n is in [0, 31] here, so the shift is always well defined.  */
  return __X & ((1U << __n) - 1U);
}
22 | |
/* Unsigned 32 x 32 -> 64-bit multiply (x86 MULX, 32-bit).

   Returns the low 32 bits of __X * __Y and stores the high 32 bits
   through __P.  __P is written unconditionally, so it must point to
   writable storage.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  /* Widen one operand first so the multiplication is done in 64 bits.  */
  const unsigned long long __product = (unsigned long long)__X * __Y;

  *__P = (unsigned int)(__product >> 32);
  return (unsigned int)__product;
}
30 | |
31 | #ifdef __PPC64__ |
/* Zero the bits of __X from bit position __Y upward (x86 BZHI, 64-bit).

   Only the low 8 bits of __Y are used as the index.  An index of 64 or
   more returns __X unchanged; an index of 0 returns 0.  A mask is used
   instead of a shift-left/shift-right pair so that no shift count ever
   reaches the operand width, which would be undefined behavior in C.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  const unsigned long long __n = __Y & 0xFFULL;

  if (__n >= 64ULL) {
    return __X;
  }
  /* __n is in [0, 63] here, so the shift is always well defined.  */
  return __X & ((1ULL << __n) - 1ULL);
}
37 | |
/* __int128 requires base 64-bit. */
/* Unsigned 64 x 64 -> 128-bit multiply (x86 MULX, 64-bit).

   Returns the low 64 bits of __X * __Y and stores the high 64 bits
   through __P.  __P is written unconditionally, so it must point to
   writable storage.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  /* Widen one operand first so the multiplication is done in 128 bits.  */
  const unsigned __int128 __product = (unsigned __int128)__X * __Y;

  *__P = (unsigned long long)(__product >> 64);
  return (unsigned long long)__product;
}
47 | |
48 | #ifdef _ARCH_PWR7 |
/* popcount and bpermd require power7 minimum. */
/* Parallel bit deposit (x86 PDEP, 64-bit).

   Scatters the low-order popcount(__M) bits of __X to the positions of
   the set bits in __M, preserving their order: the lowest bit of __X
   lands on the lowest set bit of __M, and so on.  Every result bit whose
   mask bit is clear is 0.

   Bit positions below are counted from the most significant bit (the
   value __builtin_clzll returns), so position 0 is bit 63.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long long __msb = 0x8000000000000000ULL;
  unsigned long long __result = 0;
  unsigned long long __remaining = __M;
  /* Position of the source bit that pairs with the highest set mask bit:
     the source field is the low popcount(__M) bits of __X, so its top bit
     sits at 64 - popcount(__M).  */
  unsigned long long __src_pos = 64 - __builtin_popcountll(__M);

  /* One iteration per set mask bit, highest first; each processed mask
     bit is cleared so the loop ends when the mask is exhausted.  */
  while (__remaining != 0) {
    const unsigned long long __dst_pos = __builtin_clzll(__remaining);
    const unsigned long long __dst_bit = __msb >> __dst_pos;

    /* Slide the current source bit up onto the destination bit.  The
       distance is never negative: a mask bit with k set bits below it
       lies at or above bit k.  */
    __result |= (__X << (__src_pos - __dst_pos)) & __dst_bit;
    __remaining ^= __dst_bit;
    __src_pos++;
  }
  return __result;
}
75 | |
/* Parallel bit extract (x86 PEXT, 64-bit).

   Gathers the bits of __X selected by the set bits of __M and packs them,
   in order, into the low-order bits of the result; the remaining high
   bits are 0.

   Bit positions below are counted from the most significant bit (the
   value __builtin_clzll returns), so position 0 is bit 63.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long long __msb = 0x8000000000000000ULL;
  unsigned long long __remaining = __M;
  unsigned long long __result;

  /* If the mask is a compile-time constant selecting 8 bits or fewer, a
     single bit-permute (bpermd) instruction can do the whole job; bpermd
     is available from POWER7, matching the enclosing _ARCH_PWR7 guard.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountll(__M) <= 8)) {
    /* Each control byte holds the position of one bit to gather.  Every
       byte starts as 0x40 (64), an out-of-range index for which bpermd
       yields a 0 bit, so unused slots contribute nothing.  */
    unsigned long long __control = 0x4040404040404040ULL;
    long __i;

    /* With a constant mask the popcount is constant too, so this loop can
       be evaluated at compile time, leaving a constant control word.  */
    for (__i = 0; __i < __builtin_popcountll(__M); __i++) {
      const unsigned long long __pos = __builtin_clzll(__remaining);

      __control = (__control << 8) | __pos;
      __remaining ^= (__msb >> __pos);
    }
    __result = __builtin_bpermd(__control, __X);
  } else {
    /* Position where the highest selected bit must land: the packed field
       is the low popcount(__M) bits, so its top bit sits at
       64 - popcount(__M).  */
    unsigned long long __dst_pos = 64 - __builtin_popcountll(__M);

    __result = 0;
    /* A for loop would also work here, but combined with -funroll-loops
       it can expand to a lot of code.  The while loop avoids unrolling,
       and the compiler can share the xor that clears the mask bit with
       the loop test, giving a more compact loop setup and body.  */
    while (__remaining != 0) {
      const unsigned long long __src_pos = __builtin_clzll(__remaining);
      const unsigned long long __src_bit = __msb >> __src_pos;

      /* Isolate the selected bit and slide it down into its slot.  */
      __result |= (__X & __src_bit) >> (__dst_pos - __src_pos);
      __remaining ^= __src_bit;
      __dst_pos++;
    }
  }
  return __result;
}
117 | |
/* These 32-bit implementations depend on the 64-bit pdep/pext,
   which in turn depend on _ARCH_PWR7.  */
/* Parallel bit deposit (x86 PDEP, 32-bit).

   Both operands are zero-extended and handed to _pdep_u64.  The mask __Y
   then has no bits set above bit 31, so the 64-bit result has none
   either and the narrowing cast loses nothing.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return (unsigned int)_pdep_u64(__X, __Y);
}
125 | |
/* Parallel bit extract (x86 PEXT, 32-bit).

   Both operands are zero-extended and handed to _pext_u64.  The mask __Y
   selects at most 32 bits, so the packed result fits in the low 32 bits
   and the narrowing cast loses nothing.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return (unsigned int)_pext_u64(__X, __Y);
}
131 | #endif /* _ARCH_PWR7 */ |
132 | #endif /* __PPC64__ */ |
133 | |
134 | #endif /* BMI2INTRIN_H_ */ |
135 |
Warning: This file is not a C or C++ file. It does not have highlighting.