/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
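
/* Usage sketch (illustrative only, not part of the header): a rounding
   direction may be OR-ed with _MM_FROUND_NO_EXC to suppress the inexact
   exception, e.g.

     __m128d __t = _mm_round_pd(__x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);

   truncates both lanes of a caller-supplied __x without raising
   exceptions, unlike the _MM_FROUND_TRUNC shorthand, which permits the
   inexact exception. */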

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
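
/* Usage sketch for _mm_round_pd (illustrative only; __vals is a
   hypothetical caller-supplied vector):

     __m128d __vals = _mm_set_pd(2.5, -1.5);
     __m128d __near = _mm_round_pd(__vals, _MM_FROUND_TO_NEAREST_INT);

   Round-to-nearest-even yields {-2.0, 2.0}, since both halfway cases
   round to the even neighbor. */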

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
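
/* The macros above are plain shorthands (illustrative example):
   _mm_floor_ps(__x) expands to _mm_round_ps((__x), _MM_FROUND_FLOOR),
   i.e. round toward negative infinity in all four lanes. */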

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
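
/* Example (illustrative only): the element index is masked to the valid
   range rather than validated, so _mm_extract_epi32(__v, 5) reads lane
   5 & 3 == 1, and _mm_insert_epi8(__v, 0x7f, 17) writes byte
   17 & 0xf == 1. */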

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif
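
/* Example (illustrative only): each set bit of the 8-bit immediate
   selects the corresponding 16-bit lane from __B, so
   _mm_blend_epi16(__a, __b, 0x0f) takes halfwords 0-3 from __b and
   halfwords 4-7 from __a. */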

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}
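
/* Example (illustrative only): the immediate indexes the permute-control
   table above, so _mm_blend_ps(__a, __b, 0x5) uses row 5 and takes
   floats 0 and 2 from __b, floats 1 and 3 from __a. */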

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
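
/* Example (illustrative only): these tests return their results directly
   instead of setting x86 ZF/CF flags, so

     if (_mm_testz_si128(__a, __b)) ...

   branches exactly when (__a & __b) is all zeros, and
   _mm_test_all_ones(__v) is nonzero only when every bit of __v is set. */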

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}
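
/* Note on the widening conversions above (illustrative example): the
   unsigned forms zero-extend by merging with a zero vector, with the
   merge order flipped between endiannesses, so for a vector whose low
   two words are {1, 2}, _mm_cvtepu32_epi64 yields the unsigned 64-bit
   values {1, 2} on either endianness. */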

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
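
/* Usage sketch for _mm_minpos_epu16 (illustrative only):

     __m128i __v = _mm_set_epi16(7, 7, 7, 7, 7, 3, 7, 7);
     __m128i __m = _mm_minpos_epu16(__v);

   The minimum 3 lands in halfword 0 and its index 2 in halfword 1, so
   _mm_extract_epi16(__m, 0) == 3 and _mm_extract_epi16(__m, 1) == 2. */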

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */