1 | /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===-----------------------------------------------------------------------=== |
8 | */ |
9 | |
10 | /* Implemented from the specification included in the Intel C++ Compiler |
11 | User Guide and Reference, version 9.0. */ |
12 | |
13 | #ifndef NO_WARN_X86_INTRINSICS |
14 | /* This header file is intended to help port code that uses Intel |
15 | intrinsics explicitly from x86_64 to powerpc64/powerpc64le. |
16 | |
17 | Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC |
18 | VMX/VSX ISA is a good match for vector float SIMD operations. |
19 | However, scalar float operations in vector (XMM) registers require |
20 | the POWER8 VSX ISA (2.07) level. There are differences in the data |
21 | format and placement of float scalars in the vector register, which |
22 | require extra steps to match SSE scalar float semantics on POWER. |
23 | |
24 | Note that there are significant differences between the x86_64 MXCSR |
25 | and the PowerISA FPSCR/VSCR registers. It is recommended to use the |
26 | portable <fenv.h> interface instead of accessing the MXCSR directly. |
27 | |
28 | Most SSE scalar float intrinsic operations can be performed more |
29 | efficiently as C language float scalar operations or optimized to |
30 | use vector SIMD operations. We recommend this for new applications. */ |
31 | #error \ |
32 | "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." |
33 | #endif |
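/* For example (illustrative sketch, not part of the original header): code
   that would otherwise poke the x86 MXCSR rounding-control bits can usually
   be ported with the portable <fenv.h> interface instead:

     #include <fenv.h>

     fesetround(FE_TOWARDZERO);   switch to round-toward-zero
     fesetround(FE_TONEAREST);    restore the default rounding mode

   as recommended in the comment above. */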
34 | |
35 | #ifndef XMMINTRIN_H_ |
36 | #define XMMINTRIN_H_ |
37 | |
38 | #if defined(__powerpc64__) && \ |
39 | (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) |
40 | |
41 | /* Define four value permute mask */ |
42 | #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) |
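/* Worked example (illustrative note, not part of the original header):
   _MM_SHUFFLE packs four 2-bit element selectors into one 8-bit immediate.

     _MM_SHUFFLE(3, 2, 1, 0)   is (3<<6)|(2<<4)|(1<<2)|0 == 0xE4
     _MM_SHUFFLE(0, 1, 2, 3)   is (0<<6)|(1<<4)|(2<<2)|3 == 0x1B

   0xE4 selects every element from its own position (the identity shuffle)
   and 0x1B reverses the element order when passed to _mm_shuffle_ps. */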
43 | |
44 | #include <altivec.h> |
45 | |
46 | /* Avoid collisions between the altivec.h keyword macros and strict |
47 | adherence to the C++ and C11 standards. This should eventually be done |
48 | inside altivec.h itself, but only after testing a full distro build. */ |
49 | #if defined(__STRICT_ANSI__) && \ |
50 | (defined(__cplusplus) || \ |
51 | (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)) |
52 | #undef vector |
53 | #undef pixel |
54 | #undef bool |
55 | #endif |
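/* Consequence of the #undef block above (illustrative note): in a strict
   C++ or C11 translation unit the convenience spellings from altivec.h are
   removed, so portable user code should prefer the reserved forms, e.g.

     __vector float       __vf;
     __vector __bool int  __vb;

   rather than the bare "vector"/"bool" spellings that altivec.h otherwise
   provides. */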
56 | |
57 | /* We need type definitions from the MMX header file. */ |
58 | #include <mmintrin.h> |
59 | |
60 | /* Get _mm_malloc () and _mm_free (). */ |
61 | #if __STDC_HOSTED__ |
62 | #include <mm_malloc.h> |
63 | #endif |
64 | |
65 | /* The Intel API is flexible enough that we must allow aliasing with other |
66 | vector types, and their scalar components. */ |
67 | typedef vector float __m128 __attribute__((__may_alias__)); |
68 | |
69 | /* Unaligned version of the same type. */ |
70 | typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); |
71 | |
72 | /* Internal data types for implementing the intrinsics. */ |
73 | typedef vector float __v4sf; |
74 | |
75 | /* Create an undefined vector. */ |
76 | extern __inline __m128 |
77 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
78 | _mm_undefined_ps(void) { |
79 | __m128 __Y = __Y; |
80 | return __Y; |
81 | } |
82 | |
83 | /* Create a vector of zeros. */ |
84 | extern __inline __m128 |
85 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
86 | _mm_setzero_ps(void) { |
87 | return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f}; |
88 | } |
89 | |
90 | /* Load four SPFP values from P. The address must be 16-byte aligned. */ |
91 | extern __inline __m128 |
92 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
93 | _mm_load_ps(float const *__P) { |
94 | return ((__m128)vec_ld(0, (__v4sf *)__P)); |
95 | } |
96 | |
97 | /* Load four SPFP values from P. The address need not be 16-byte aligned. */ |
98 | extern __inline __m128 |
99 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
100 | _mm_loadu_ps(float const *__P) { |
101 | return (vec_vsx_ld(0, __P)); |
102 | } |
103 | |
104 | /* Load four SPFP values in reverse order. The address must be aligned. */ |
105 | extern __inline __m128 |
106 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
107 | _mm_loadr_ps(float const *__P) { |
108 | __v4sf __tmp; |
109 | __m128 __result; |
110 | static const __vector unsigned char __permute_vector = { |
111 | 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, |
112 | 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; |
113 | |
114 | __tmp = vec_ld(0, (__v4sf *)__P); |
115 | __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector); |
116 | return __result; |
117 | } |
118 | |
119 | /* Create a vector with all four elements equal to F. */ |
120 | extern __inline __m128 |
121 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
122 | _mm_set1_ps(float __F) { |
123 | return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; |
124 | } |
125 | |
126 | extern __inline __m128 |
127 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
128 | _mm_set_ps1(float __F) { |
129 | return _mm_set1_ps(__F); |
130 | } |
131 | |
132 | /* Create the vector [Z Y X W]. */ |
133 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, |
134 | __artificial__)) |
135 | _mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { |
136 | return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; |
137 | } |
138 | |
139 | /* Create the vector [W X Y Z]. */ |
140 | extern __inline __m128 |
141 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
142 | _mm_setr_ps(float __Z, float __Y, float __X, float __W) { |
143 | return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; |
144 | } |
145 | |
146 | /* Store four SPFP values. The address must be 16-byte aligned. */ |
147 | extern __inline void |
148 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
149 | _mm_store_ps(float *__P, __m128 __A) { |
150 | vec_st((__v4sf)__A, 0, (__v4sf *)__P); |
151 | } |
152 | |
153 | /* Store four SPFP values. The address need not be 16-byte aligned. */ |
154 | extern __inline void |
155 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
156 | _mm_storeu_ps(float *__P, __m128 __A) { |
157 | *(__m128_u *)__P = __A; |
158 | } |
159 | |
160 | /* Store four SPFP values in reverse order. The address must be aligned. */ |
161 | extern __inline void |
162 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
163 | _mm_storer_ps(float *__P, __m128 __A) { |
164 | __v4sf __tmp; |
165 | static const __vector unsigned char __permute_vector = { |
166 | 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, |
167 | 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; |
168 | |
169 | __tmp = (__m128)vec_perm(__A, __A, __permute_vector); |
170 | |
171 | _mm_store_ps(__P, __tmp); |
172 | } |
173 | |
174 | /* Store the lower SPFP value across four words. */ |
175 | extern __inline void |
176 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
177 | _mm_store1_ps(float *__P, __m128 __A) { |
178 | __v4sf __va = vec_splat((__v4sf)__A, 0); |
179 | _mm_store_ps(__P, __va); |
180 | } |
181 | |
182 | extern __inline void |
183 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
184 | _mm_store_ps1(float *__P, __m128 __A) { |
185 | _mm_store1_ps(__P, __A); |
186 | } |
187 | |
188 | /* Create a vector with element 0 as F and the rest zero. */ |
189 | extern __inline __m128 |
190 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
191 | _mm_set_ss(float __F) { |
192 | return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; |
193 | } |
194 | |
195 | /* Sets the low SPFP value of A from the low value of B. */ |
196 | extern __inline __m128 |
197 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
198 | _mm_move_ss(__m128 __A, __m128 __B) { |
199 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
200 | |
201 | return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask)); |
202 | } |
203 | |
204 | /* Create a vector with element 0 as *P and the rest zero. */ |
205 | extern __inline __m128 |
206 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
207 | _mm_load_ss(float const *__P) { |
208 | return _mm_set_ss(*__P); |
209 | } |
210 | |
211 | /* Stores the lower SPFP value. */ |
212 | extern __inline void |
213 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
214 | _mm_store_ss(float *__P, __m128 __A) { |
215 | *__P = ((__v4sf)__A)[0]; |
216 | } |
217 | |
218 | /* Perform the respective operation on the lower SPFP (single-precision |
219 | floating-point) values of A and B; the upper three SPFP values are |
220 | passed through from A. */ |
221 | |
222 | extern __inline __m128 |
223 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
224 | _mm_add_ss(__m128 __A, __m128 __B) { |
225 | #ifdef _ARCH_PWR7 |
226 | __m128 __a, __b, __c; |
227 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
228 | /* PowerISA VSX does not allow partial (for just lower float) |
229 | results. So to ensure we don't generate spurious exceptions |
230 | (from the upper float values) we splat the lower float |
231 | before we do the operation. */ |
232 | __a = vec_splat(__A, 0); |
233 | __b = vec_splat(__B, 0); |
234 | __c = __a + __b; |
235 | /* Then we merge the lower float result with the original upper |
236 | float elements from __A. */ |
237 | return (vec_sel(__A, __c, __mask)); |
238 | #else |
239 | __A[0] = __A[0] + __B[0]; |
240 | return (__A); |
241 | #endif |
242 | } |
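/* Usage sketch of the scalar pattern above (illustrative, values assumed
   for demonstration): only element 0 is operated on, elements 1-3 are
   passed through from the first argument.

     __m128 __x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);    x = {1, 2, 3, 4}
     __m128 __y = _mm_set_ss(10.0f);                      y = {10, 0, 0, 0}
     __m128 __z = _mm_add_ss(__x, __y);                   z = {11, 2, 3, 4}

   Splatting element 0 across all lanes before the add means the inactive
   lanes never operate on unrelated data, so they cannot raise spurious
   floating-point exceptions. */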
243 | |
244 | extern __inline __m128 |
245 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
246 | _mm_sub_ss(__m128 __A, __m128 __B) { |
247 | #ifdef _ARCH_PWR7 |
248 | __m128 __a, __b, __c; |
249 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
250 | /* PowerISA VSX does not allow partial (for just lower float) |
251 | results. So to ensure we don't generate spurious exceptions |
252 | (from the upper float values) we splat the lower float |
253 | before we do the operation. */ |
254 | __a = vec_splat(__A, 0); |
255 | __b = vec_splat(__B, 0); |
256 | __c = __a - __b; |
257 | /* Then we merge the lower float result with the original upper |
258 | float elements from __A. */ |
259 | return (vec_sel(__A, __c, __mask)); |
260 | #else |
261 | __A[0] = __A[0] - __B[0]; |
262 | return (__A); |
263 | #endif |
264 | } |
265 | |
266 | extern __inline __m128 |
267 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
268 | _mm_mul_ss(__m128 __A, __m128 __B) { |
269 | #ifdef _ARCH_PWR7 |
270 | __m128 __a, __b, __c; |
271 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
272 | /* PowerISA VSX does not allow partial (for just lower float) |
273 | results. So to ensure we don't generate spurious exceptions |
274 | (from the upper float values) we splat the lower float |
275 | before we do the operation. */ |
276 | __a = vec_splat(__A, 0); |
277 | __b = vec_splat(__B, 0); |
278 | __c = __a * __b; |
279 | /* Then we merge the lower float result with the original upper |
280 | float elements from __A. */ |
281 | return (vec_sel(__A, __c, __mask)); |
282 | #else |
283 | __A[0] = __A[0] * __B[0]; |
284 | return (__A); |
285 | #endif |
286 | } |
287 | |
288 | extern __inline __m128 |
289 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
290 | _mm_div_ss(__m128 __A, __m128 __B) { |
291 | #ifdef _ARCH_PWR7 |
292 | __m128 __a, __b, __c; |
293 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
294 | /* PowerISA VSX does not allow partial (for just lower float) |
295 | results. So to ensure we don't generate spurious exceptions |
296 | (from the upper float values) we splat the lower float |
297 | before we do the operation. */ |
298 | __a = vec_splat(__A, 0); |
299 | __b = vec_splat(__B, 0); |
300 | __c = __a / __b; |
301 | /* Then we merge the lower float result with the original upper |
302 | float elements from __A. */ |
303 | return (vec_sel(__A, __c, __mask)); |
304 | #else |
305 | __A[0] = __A[0] / __B[0]; |
306 | return (__A); |
307 | #endif |
308 | } |
309 | |
310 | extern __inline __m128 |
311 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
312 | _mm_sqrt_ss(__m128 __A) { |
313 | __m128 __a, __c; |
314 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
315 | /* PowerISA VSX does not allow partial (for just lower float) |
316 | * results. So to ensure we don't generate spurious exceptions |
317 | * (from the upper float values) we splat the lower float |
318 | * before we do the operation. */ |
319 | __a = vec_splat(__A, 0); |
320 | __c = vec_sqrt(__a); |
321 | /* Then we merge the lower float result with the original upper |
322 | * float elements from __A. */ |
323 | return (vec_sel(__A, __c, __mask)); |
324 | } |
325 | |
326 | /* Perform the respective operation on the four SPFP values in A and B. */ |
327 | extern __inline __m128 |
328 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
329 | _mm_add_ps(__m128 __A, __m128 __B) { |
330 | return (__m128)((__v4sf)__A + (__v4sf)__B); |
331 | } |
332 | |
333 | extern __inline __m128 |
334 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
335 | _mm_sub_ps(__m128 __A, __m128 __B) { |
336 | return (__m128)((__v4sf)__A - (__v4sf)__B); |
337 | } |
338 | |
339 | extern __inline __m128 |
340 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
341 | _mm_mul_ps(__m128 __A, __m128 __B) { |
342 | return (__m128)((__v4sf)__A * (__v4sf)__B); |
343 | } |
344 | |
345 | extern __inline __m128 |
346 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
347 | _mm_div_ps(__m128 __A, __m128 __B) { |
348 | return (__m128)((__v4sf)__A / (__v4sf)__B); |
349 | } |
350 | |
351 | extern __inline __m128 |
352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
353 | _mm_sqrt_ps(__m128 __A) { |
354 | return (vec_sqrt((__v4sf)__A)); |
355 | } |
356 | |
357 | extern __inline __m128 |
358 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
359 | _mm_rcp_ps(__m128 __A) { |
360 | return (vec_re((__v4sf)__A)); |
361 | } |
362 | |
363 | extern __inline __m128 |
364 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
365 | _mm_rsqrt_ps(__m128 __A) { |
366 | return (vec_rsqrte(__A)); |
367 | } |
368 | |
369 | extern __inline __m128 |
370 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
371 | _mm_rcp_ss(__m128 __A) { |
372 | __m128 __a, __c; |
373 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
374 | /* PowerISA VSX does not allow partial (for just lower float) |
375 | * results. So to ensure we don't generate spurious exceptions |
376 | * (from the upper float values) we splat the lower float |
377 | * before we do the operation. */ |
378 | __a = vec_splat(__A, 0); |
379 | __c = _mm_rcp_ps(__a); |
380 | /* Then we merge the lower float result with the original upper |
381 | * float elements from __A. */ |
382 | return (vec_sel(__A, __c, __mask)); |
383 | } |
384 | |
385 | extern __inline __m128 |
386 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
387 | _mm_rsqrt_ss(__m128 __A) { |
388 | __m128 __a, __c; |
389 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
390 | /* PowerISA VSX does not allow partial (for just lower float) |
391 | * results. So to ensure we don't generate spurious exceptions |
392 | * (from the upper float values) we splat the lower float |
393 | * before we do the operation. */ |
394 | __a = vec_splat(__A, 0); |
395 | __c = vec_rsqrte(__a); |
396 | /* Then we merge the lower float result with the original upper |
397 | * float elements from __A. */ |
398 | return (vec_sel(__A, __c, __mask)); |
399 | } |
400 | |
401 | extern __inline __m128 |
402 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
403 | _mm_min_ss(__m128 __A, __m128 __B) { |
404 | __v4sf __a, __b, __c; |
405 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
406 | /* PowerISA VSX does not allow partial (for just lower float) |
407 | * results. So to ensure we don't generate spurious exceptions |
408 | * (from the upper float values) we splat the lower float |
409 | * before we do the operation. */ |
410 | __a = vec_splat((__v4sf)__A, 0); |
411 | __b = vec_splat((__v4sf)__B, 0); |
412 | __c = vec_min(__a, __b); |
413 | /* Then we merge the lower float result with the original upper |
414 | * float elements from __A. */ |
415 | return (vec_sel((__v4sf)__A, __c, __mask)); |
416 | } |
417 | |
418 | extern __inline __m128 |
419 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
420 | _mm_max_ss(__m128 __A, __m128 __B) { |
421 | __v4sf __a, __b, __c; |
422 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
423 | /* PowerISA VSX does not allow partial (for just lower float) |
424 | * results. So to ensure we don't generate spurious exceptions |
425 | * (from the upper float values) we splat the lower float |
426 | * before we do the operation. */ |
427 | __a = vec_splat(__A, 0); |
428 | __b = vec_splat(__B, 0); |
429 | __c = vec_max(__a, __b); |
430 | /* Then we merge the lower float result with the original upper |
431 | * float elements from __A. */ |
432 | return (vec_sel((__v4sf)__A, __c, __mask)); |
433 | } |
434 | |
435 | extern __inline __m128 |
436 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
437 | _mm_min_ps(__m128 __A, __m128 __B) { |
438 | __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A); |
439 | return vec_sel(__B, __A, __m); |
440 | } |
441 | |
442 | extern __inline __m128 |
443 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
444 | _mm_max_ps(__m128 __A, __m128 __B) { |
445 | __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B); |
446 | return vec_sel(__B, __A, __m); |
447 | } |
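/* Note on _mm_min_ps/_mm_max_ps above (a sketch of the apparent intent,
   not documented behaviour): the vec_cmpgt + vec_sel form, rather than
   vec_min/vec_max, reproduces the SSE rule that the second operand is
   returned whenever the comparison is false, including when an input is a
   NaN. For example, with lane 0 shown:

     _mm_max_ps({NaN, ...}, {1.0f, ...})    lane 0 becomes 1.0f
     _mm_max_ps({2.0f, ...}, {1.0f, ...})   lane 0 becomes 2.0f            */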
448 | |
449 | /* Perform logical bit-wise operations on 128-bit values. */ |
450 | extern __inline __m128 |
451 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
452 | _mm_and_ps(__m128 __A, __m128 __B) { |
453 | return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B)); |
454 | // return __builtin_ia32_andps (__A, __B); |
455 | } |
456 | |
457 | extern __inline __m128 |
458 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
459 | _mm_andnot_ps(__m128 __A, __m128 __B) { |
460 | return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A)); |
461 | } |
462 | |
463 | extern __inline __m128 |
464 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
465 | _mm_or_ps(__m128 __A, __m128 __B) { |
466 | return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B)); |
467 | } |
468 | |
469 | extern __inline __m128 |
470 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
471 | _mm_xor_ps(__m128 __A, __m128 __B) { |
472 | return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B)); |
473 | } |
474 | |
475 | /* Perform a comparison on the four SPFP values of A and B. For each |
476 | element, if the comparison is true, place a mask of all ones in the |
477 | result, otherwise a mask of zeros. */ |
478 | extern __inline __m128 |
479 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
480 | _mm_cmpeq_ps(__m128 __A, __m128 __B) { |
481 | return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B)); |
482 | } |
483 | |
484 | extern __inline __m128 |
485 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
486 | _mm_cmplt_ps(__m128 __A, __m128 __B) { |
487 | return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); |
488 | } |
489 | |
490 | extern __inline __m128 |
491 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
492 | _mm_cmple_ps(__m128 __A, __m128 __B) { |
493 | return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); |
494 | } |
495 | |
496 | extern __inline __m128 |
497 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
498 | _mm_cmpgt_ps(__m128 __A, __m128 __B) { |
499 | return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); |
500 | } |
501 | |
502 | extern __inline __m128 |
503 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
504 | _mm_cmpge_ps(__m128 __A, __m128 __B) { |
505 | return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); |
506 | } |
507 | |
508 | extern __inline __m128 |
509 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
510 | _mm_cmpneq_ps(__m128 __A, __m128 __B) { |
511 | __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B); |
512 | return ((__m128)vec_nor(__temp, __temp)); |
513 | } |
514 | |
515 | extern __inline __m128 |
516 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
517 | _mm_cmpnlt_ps(__m128 __A, __m128 __B) { |
518 | return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); |
519 | } |
520 | |
521 | extern __inline __m128 |
522 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
523 | _mm_cmpnle_ps(__m128 __A, __m128 __B) { |
524 | return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); |
525 | } |
526 | |
527 | extern __inline __m128 |
528 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
529 | _mm_cmpngt_ps(__m128 __A, __m128 __B) { |
530 | return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); |
531 | } |
532 | |
533 | extern __inline __m128 |
534 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
535 | _mm_cmpnge_ps(__m128 __A, __m128 __B) { |
536 | return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); |
537 | } |
538 | |
539 | extern __inline __m128 |
540 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
541 | _mm_cmpord_ps(__m128 __A, __m128 __B) { |
542 | __vector unsigned int __a, __b; |
543 | __vector unsigned int __c, __d; |
544 | static const __vector unsigned int __float_exp_mask = { |
545 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
546 | |
547 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
548 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
549 | __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); |
550 | __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); |
551 | return ((__m128)vec_and(__c, __d)); |
552 | } |
553 | |
554 | extern __inline __m128 |
555 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
556 | _mm_cmpunord_ps(__m128 __A, __m128 __B) { |
557 | __vector unsigned int __a, __b; |
558 | __vector unsigned int __c, __d; |
559 | static const __vector unsigned int __float_exp_mask = { |
560 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
561 | |
562 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
563 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
564 | __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); |
565 | __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); |
566 | return ((__m128)vec_or(__c, __d)); |
567 | } |
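/* How the ordered/unordered tests above work (illustrative sketch): after
   vec_abs clears the sign bit, an IEEE-754 single is a NaN exactly when its
   bit pattern exceeds 0x7F800000 (all-ones exponent, zero fraction, i.e.
   infinity). For example:

     |1.0f|  -> 0x3F800000
     |inf|   -> 0x7F800000
     |qNaN|  -> 0x7FC00000   (greater than 0x7F800000, hence unordered)

   _mm_cmpunord_ps ORs the per-operand "is NaN" tests; _mm_cmpord_ps instead
   requires both magnitudes to compare below the exponent mask. */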
568 | |
569 | /* Perform a comparison on the lower SPFP values of A and B. If the |
570 | comparison is true, place a mask of all ones in the result, otherwise a |
571 | mask of zeros. The upper three SPFP values are passed through from A. */ |
572 | extern __inline __m128 |
573 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
574 | _mm_cmpeq_ss(__m128 __A, __m128 __B) { |
575 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
576 | __v4sf __a, __b, __c; |
577 | /* PowerISA VMX does not allow partial (for just element 0) |
578 | * results. So to ensure we don't generate spurious exceptions |
579 | * (from the upper elements) we splat the lower float |
580 | * before we do the operation. */ |
581 | __a = vec_splat((__v4sf)__A, 0); |
582 | __b = vec_splat((__v4sf)__B, 0); |
583 | __c = (__v4sf)vec_cmpeq(__a, __b); |
584 | /* Then we merge the lower float result with the original upper |
585 | * float elements from __A. */ |
586 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
587 | } |
588 | |
589 | extern __inline __m128 |
590 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
591 | _mm_cmplt_ss(__m128 __A, __m128 __B) { |
592 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
593 | __v4sf __a, __b, __c; |
594 | /* PowerISA VMX does not allow partial (for just element 0) |
595 | * results. So to ensure we don't generate spurious exceptions |
596 | * (from the upper elements) we splat the lower float |
597 | * before we do the operation. */ |
598 | __a = vec_splat((__v4sf)__A, 0); |
599 | __b = vec_splat((__v4sf)__B, 0); |
600 | __c = (__v4sf)vec_cmplt(__a, __b); |
601 | /* Then we merge the lower float result with the original upper |
602 | * float elements from __A. */ |
603 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
604 | } |
605 | |
606 | extern __inline __m128 |
607 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
608 | _mm_cmple_ss(__m128 __A, __m128 __B) { |
609 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
610 | __v4sf __a, __b, __c; |
611 | /* PowerISA VMX does not allow partial (for just element 0) |
612 | * results. So to ensure we don't generate spurious exceptions |
613 | * (from the upper elements) we splat the lower float |
614 | * before we do the operation. */ |
615 | __a = vec_splat((__v4sf)__A, 0); |
616 | __b = vec_splat((__v4sf)__B, 0); |
617 | __c = (__v4sf)vec_cmple(__a, __b); |
618 | /* Then we merge the lower float result with the original upper |
619 | * float elements from __A. */ |
620 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
621 | } |
622 | |
623 | extern __inline __m128 |
624 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
625 | _mm_cmpgt_ss(__m128 __A, __m128 __B) { |
626 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
627 | __v4sf __a, __b, __c; |
628 | /* PowerISA VMX does not allow partial (for just element 0) |
629 | * results. So to ensure we don't generate spurious exceptions |
630 | * (from the upper elements) we splat the lower float |
631 | * before we do the operation. */ |
632 | __a = vec_splat((__v4sf)__A, 0); |
633 | __b = vec_splat((__v4sf)__B, 0); |
634 | __c = (__v4sf)vec_cmpgt(__a, __b); |
635 | /* Then we merge the lower float result with the original upper |
636 | * float elements from __A. */ |
637 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
638 | } |
639 | |
640 | extern __inline __m128 |
641 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
642 | _mm_cmpge_ss(__m128 __A, __m128 __B) { |
643 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
644 | __v4sf __a, __b, __c; |
645 | /* PowerISA VMX does not allow partial (for just element 0) |
646 | * results. So to ensure we don't generate spurious exceptions |
647 | * (from the upper elements) we splat the lower float |
648 | * before we do the operation. */ |
649 | __a = vec_splat((__v4sf)__A, 0); |
650 | __b = vec_splat((__v4sf)__B, 0); |
651 | __c = (__v4sf)vec_cmpge(__a, __b); |
652 | /* Then we merge the lower float result with the original upper |
653 | * float elements from __A. */ |
654 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
655 | } |
656 | |
657 | extern __inline __m128 |
658 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
659 | _mm_cmpneq_ss(__m128 __A, __m128 __B) { |
660 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
661 | __v4sf __a, __b, __c; |
662 | /* PowerISA VMX does not allow partial (for just element 0) |
663 | * results. So to ensure we don't generate spurious exceptions |
664 | * (from the upper elements) we splat the lower float |
665 | * before we do the operation. */ |
666 | __a = vec_splat((__v4sf)__A, 0); |
667 | __b = vec_splat((__v4sf)__B, 0); |
668 | __c = (__v4sf)vec_cmpeq(__a, __b); |
669 | __c = vec_nor(__c, __c); |
670 | /* Then we merge the lower float result with the original upper |
671 | * float elements from __A. */ |
672 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
673 | } |
674 | |
675 | extern __inline __m128 |
676 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
677 | _mm_cmpnlt_ss(__m128 __A, __m128 __B) { |
678 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
679 | __v4sf __a, __b, __c; |
680 | /* PowerISA VMX does not allow partial (for just element 0) |
681 | * results. So to ensure we don't generate spurious exceptions |
682 | * (from the upper elements) we splat the lower float |
683 | * before we do the operation. */ |
684 | __a = vec_splat((__v4sf)__A, 0); |
685 | __b = vec_splat((__v4sf)__B, 0); |
686 | __c = (__v4sf)vec_cmpge(__a, __b); |
687 | /* Then we merge the lower float result with the original upper |
688 | * float elements from __A. */ |
689 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
690 | } |
691 | |
692 | extern __inline __m128 |
693 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
694 | _mm_cmpnle_ss(__m128 __A, __m128 __B) { |
695 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
696 | __v4sf __a, __b, __c; |
697 | /* PowerISA VMX does not allow partial (for just element 0) |
698 | * results. So to ensure we don't generate spurious exceptions |
699 | * (from the upper elements) we splat the lower float |
700 | * before we do the operation. */ |
701 | __a = vec_splat((__v4sf)__A, 0); |
702 | __b = vec_splat((__v4sf)__B, 0); |
703 | __c = (__v4sf)vec_cmpgt(__a, __b); |
704 | /* Then we merge the lower float result with the original upper |
705 | * float elements from __A. */ |
706 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
707 | } |
708 | |
709 | extern __inline __m128 |
710 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
711 | _mm_cmpngt_ss(__m128 __A, __m128 __B) { |
712 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
713 | __v4sf __a, __b, __c; |
714 | /* PowerISA VMX does not allow partial (for just element 0) |
715 | * results. So to ensure we don't generate spurious exceptions |
716 | * (from the upper elements) we splat the lower float |
717 | * before we do the operation. */ |
718 | __a = vec_splat((__v4sf)__A, 0); |
719 | __b = vec_splat((__v4sf)__B, 0); |
720 | __c = (__v4sf)vec_cmple(__a, __b); |
721 | /* Then we merge the lower float result with the original upper |
722 | * float elements from __A. */ |
723 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
724 | } |
725 | |
726 | extern __inline __m128 |
727 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
728 | _mm_cmpnge_ss(__m128 __A, __m128 __B) { |
729 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
730 | __v4sf __a, __b, __c; |
731 | /* PowerISA VMX does not allow partial (for just element 0) |
732 | * results. So to ensure we don't generate spurious exceptions |
733 | * (from the upper elements) we splat the lower float |
734 | * before we do the operation. */ |
735 | __a = vec_splat((__v4sf)__A, 0); |
736 | __b = vec_splat((__v4sf)__B, 0); |
737 | __c = (__v4sf)vec_cmplt(__a, __b); |
738 | /* Then we merge the lower float result with the original upper |
739 | * float elements from __A. */ |
740 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
741 | } |
742 | |
743 | extern __inline __m128 |
744 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
745 | _mm_cmpord_ss(__m128 __A, __m128 __B) { |
746 | __vector unsigned int __a, __b; |
747 | __vector unsigned int __c, __d; |
748 | static const __vector unsigned int __float_exp_mask = { |
749 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
750 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
751 | |
752 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
753 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
754 | __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); |
755 | __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); |
756 | __c = vec_and(__c, __d); |
757 | /* Then we merge the lower float result with the original upper |
758 | * float elements from __A. */ |
759 | return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); |
760 | } |
761 | |
762 | extern __inline __m128 |
763 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
764 | _mm_cmpunord_ss(__m128 __A, __m128 __B) { |
765 | __vector unsigned int __a, __b; |
766 | __vector unsigned int __c, __d; |
767 | static const __vector unsigned int __float_exp_mask = { |
768 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
769 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
770 | |
771 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
772 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
773 | __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); |
774 | __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); |
775 | __c = vec_or(__c, __d); |
776 | /* Then we merge the lower float result with the original upper |
777 | * float elements from __A. */ |
778 | return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); |
779 | } |
780 | |
781 | /* Compare the lower SPFP values of A and B and return 1 if the |
782 | comparison is true and 0 if it is false. */ |
783 | extern __inline int |
784 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
785 | _mm_comieq_ss(__m128 __A, __m128 __B) { |
786 | return (__A[0] == __B[0]); |
787 | } |
788 | |
789 | extern __inline int |
790 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
791 | _mm_comilt_ss(__m128 __A, __m128 __B) { |
792 | return (__A[0] < __B[0]); |
793 | } |
794 | |
795 | extern __inline int |
796 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
797 | _mm_comile_ss(__m128 __A, __m128 __B) { |
798 | return (__A[0] <= __B[0]); |
799 | } |
800 | |
801 | extern __inline int |
802 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
803 | _mm_comigt_ss(__m128 __A, __m128 __B) { |
804 | return (__A[0] > __B[0]); |
805 | } |
806 | |
807 | extern __inline int |
808 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
809 | _mm_comige_ss(__m128 __A, __m128 __B) { |
810 | return (__A[0] >= __B[0]); |
811 | } |
812 | |
813 | extern __inline int |
814 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
815 | _mm_comineq_ss(__m128 __A, __m128 __B) { |
816 | return (__A[0] != __B[0]); |
817 | } |
818 | |
819 | /* FIXME |
820 | * The _mm_ucomi??_ss implementations below are exactly the same as |
821 | * the _mm_comi??_ss ones because GCC for PowerPC only generates |
822 | * unordered compares (scalar and vector). |
823 | * Technically _mm_comieq_ss et al. should be using the ordered |
824 | * compare and signal on QNaNs. |
825 | * The _mm_ucomieq_ss et al. should be OK as is. |
826 | */ |
827 | extern __inline int |
828 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
829 | _mm_ucomieq_ss(__m128 __A, __m128 __B) { |
830 | return (__A[0] == __B[0]); |
831 | } |
832 | |
833 | extern __inline int |
834 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
835 | _mm_ucomilt_ss(__m128 __A, __m128 __B) { |
836 | return (__A[0] < __B[0]); |
837 | } |
838 | |
839 | extern __inline int |
840 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
841 | _mm_ucomile_ss(__m128 __A, __m128 __B) { |
842 | return (__A[0] <= __B[0]); |
843 | } |
844 | |
845 | extern __inline int |
846 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
847 | _mm_ucomigt_ss(__m128 __A, __m128 __B) { |
848 | return (__A[0] > __B[0]); |
849 | } |
850 | |
851 | extern __inline int |
852 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
853 | _mm_ucomige_ss(__m128 __A, __m128 __B) { |
854 | return (__A[0] >= __B[0]); |
855 | } |
856 | |
857 | extern __inline int |
858 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
859 | _mm_ucomineq_ss(__m128 __A, __m128 __B) { |
860 | return (__A[0] != __B[0]); |
861 | } |
862 | |
863 | extern __inline float |
864 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
865 | _mm_cvtss_f32(__m128 __A) { |
866 | return ((__v4sf)__A)[0]; |
867 | } |
868 | |
869 | /* Convert the lower SPFP value to a 32-bit integer according to the current |
870 | rounding mode. */ |
871 | extern __inline int |
872 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
873 | _mm_cvtss_si32(__m128 __A) { |
874 | int __res; |
875 | #ifdef _ARCH_PWR8 |
876 | double __dtmp; |
877 | __asm__( |
878 | #ifdef __LITTLE_ENDIAN__ |
879 | "xxsldwi %x0,%x0,%x0,3;\n" |
880 | #endif |
881 | "xscvspdp %x2,%x0;\n" |
882 | "fctiw %2,%2;\n" |
883 | "mfvsrd %1,%x2;\n" |
884 | : "+wa"(__A), "=r"(__res), "=f"(__dtmp) |
885 | :); |
886 | #else |
887 | __res = __builtin_rint(__A[0]); |
888 | #endif |
889 | return __res; |
890 | } |
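/* Sketch of the POWER8 inline asm above (descriptive comments only):
     xxsldwi %x0,%x0,%x0,3   rotate the VSR so element 0 sits in the word
                             read by the scalar convert (little-endian only)
     xscvspdp                convert the single-precision value to double
     fctiw                   convert to a 32-bit integer using the current
                             rounding mode (unlike the truncating fctiwz)
     mfvsrd                  move the result from the VSR to a GPR
   On older targets the portable __builtin_rint fallback is used instead. */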
891 | |
892 | extern __inline int |
893 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
894 | _mm_cvt_ss2si(__m128 __A) { |
895 | return _mm_cvtss_si32(__A); |
896 | } |
897 | |
898 | /* Convert the lower SPFP value to a 64-bit integer according to the |
899 | current rounding mode. */ |
900 | |
901 | /* Intel intrinsic. */ |
902 | extern __inline long long |
903 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
904 | _mm_cvtss_si64(__m128 __A) { |
905 | long long __res; |
906 | #if defined(_ARCH_PWR8) && defined(__powerpc64__) |
907 | double __dtmp; |
908 | __asm__( |
909 | #ifdef __LITTLE_ENDIAN__ |
910 | "xxsldwi %x0,%x0,%x0,3;\n" |
911 | #endif |
912 | "xscvspdp %x2,%x0;\n" |
913 | "fctid %2,%2;\n" |
914 | "mfvsrd %1,%x2;\n" |
915 | : "+wa"(__A), "=r"(__res), "=f"(__dtmp) |
916 | :); |
917 | #else |
918 | __res = __builtin_llrint(__A[0]); |
919 | #endif |
920 | return __res; |
921 | } |
922 | |
923 | /* Microsoft intrinsic. */ |
924 | extern __inline long long |
925 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
926 | _mm_cvtss_si64x(__m128 __A) { |
927 | return _mm_cvtss_si64((__v4sf)__A); |
928 | } |
929 | |
930 | /* Constants for use with _mm_prefetch. */ |
931 | enum _mm_hint { |
932 | /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */ |
933 | _MM_HINT_ET0 = 7, |
934 | _MM_HINT_ET1 = 6, |
935 | _MM_HINT_T0 = 3, |
936 | _MM_HINT_T1 = 2, |
937 | _MM_HINT_T2 = 1, |
938 | _MM_HINT_NTA = 0 |
939 | }; |
940 | |
941 | /* Loads one cache line from address P to a location "closer" to the |
942 | processor. The selector I specifies the type of prefetch operation. */ |
943 | extern __inline void |
944 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
945 | _mm_prefetch(const void *__P, enum _mm_hint __I) { |
946 | /* Current PowerPC implementations ignore the hint parameter. */ |
947 | __builtin_prefetch(__P); |
948 | } |
949 | |
950 | /* Convert the two lower SPFP values to 32-bit integers according to the |
951 | current rounding mode. Return the integers in packed form. */ |
952 | extern __inline __m64 |
953 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
954 | _mm_cvtps_pi32(__m128 __A) { |
956 | __v4sf __temp, __rounded; |
957 | __vector unsigned long long __result; |
958 | |
959 | /* Splat two lower SPFP values to both halves. */ |
960 | __temp = (__v4sf)vec_splat((__vector long long)__A, 0); |
961 | __rounded = vec_rint(__temp); |
962 | __result = (__vector unsigned long long)vec_cts(__rounded, 0); |
963 | |
964 | return (__m64)((__vector long long)__result)[0]; |
965 | } |
966 | |
967 | extern __inline __m64 |
968 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
969 | _mm_cvt_ps2pi(__m128 __A) { |
970 | return _mm_cvtps_pi32(__A); |
971 | } |
972 | |
973 | /* Truncate the lower SPFP value to a 32-bit integer. */ |
974 | extern __inline int |
975 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
976 | _mm_cvttss_si32(__m128 __A) { |
977 | /* Extract the lower float element. */ |
978 | float __temp = __A[0]; |
979 | /* Truncate to a 32-bit integer and return. */ |
980 | return __temp; |
981 | } |
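/* The truncating conversions above and below rely on C's float-to-integer
   conversion, which always truncates toward zero, e.g. (int)2.9f == 2 and
   (int)-2.9f == -2. That matches the _mm_cvtt* (truncate) semantics, while
   _mm_cvtss_si32/_mm_cvtss_si64 honour the current rounding mode. */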
982 | |
983 | extern __inline int |
984 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
985 | _mm_cvtt_ss2si(__m128 __A) { |
986 | return _mm_cvttss_si32(__A); |
987 | } |
988 | |
989 | /* Intel intrinsic. */ |
990 | extern __inline long long |
991 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
992 | _mm_cvttss_si64(__m128 __A) { |
993 | /* Extract the lower float element. */ |
994 | float __temp = __A[0]; |
995 | /* Truncate to a 64-bit integer and return. */ |
996 | return __temp; |
997 | } |
998 | |
999 | /* Microsoft intrinsic. */ |
1000 | extern __inline long long |
1001 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1002 | _mm_cvttss_si64x(__m128 __A) { |
1003 | /* Extract the lower float element. */ |
1004 | float __temp = __A[0]; |
1005 | /* Truncate to a 64-bit integer and return. */ |
1006 | return __temp; |
1007 | } |
1008 | |
1009 | /* Truncate the two lower SPFP values to 32-bit integers. Return the |
1010 | integers in packed form. */ |
1011 | extern __inline __m64 |
1012 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1013 | _mm_cvttps_pi32(__m128 __A) { |
1014 | __v4sf __temp; |
1015 | __vector unsigned long long __result; |
1016 | |
1017 | /* Splat two lower SPFP values to both halves. */ |
1018 | __temp = (__v4sf)vec_splat((__vector long long)__A, 0); |
1019 | __result = (__vector unsigned long long)vec_cts(__temp, 0); |
1020 | |
1021 | return (__m64)((__vector long long)__result)[0]; |
1022 | } |
1023 | |
1024 | extern __inline __m64 |
1025 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1026 | _mm_cvtt_ps2pi(__m128 __A) { |
1027 | return _mm_cvttps_pi32(__A); |
1028 | } |
1029 | |
1030 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
1031 | extern __inline __m128 |
1032 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1033 | _mm_cvtsi32_ss(__m128 __A, int __B) { |
1034 | float __temp = __B; |
1035 | __A[0] = __temp; |
1036 | |
1037 | return __A; |
1038 | } |
1039 | |
1040 | extern __inline __m128 |
1041 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1042 | _mm_cvt_si2ss(__m128 __A, int __B) { |
1043 | return _mm_cvtsi32_ss(__A, __B); |
1044 | } |
1045 | |
1046 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
1047 | /* Intel intrinsic. */ |
1048 | extern __inline __m128 |
1049 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1050 | _mm_cvtsi64_ss(__m128 __A, long long __B) { |
1051 | float __temp = __B; |
1052 | __A[0] = __temp; |
1053 | |
1054 | return __A; |
1055 | } |
1056 | |
1057 | /* Microsoft intrinsic. */ |
1058 | extern __inline __m128 |
1059 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1060 | _mm_cvtsi64x_ss(__m128 __A, long long __B) { |
1061 | return _mm_cvtsi64_ss(__A, __B); |
1062 | } |
1063 | |
1064 | /* Convert the two 32-bit values in B to SPFP form and insert them |
1065 | as the two lower elements in A. */ |
1066 | extern __inline __m128 |
1067 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1068 | _mm_cvtpi32_ps(__m128 __A, __m64 __B) { |
1069 | __vector signed int __vm1; |
1070 | __vector float __vf1; |
1071 | |
1072 | __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B}; |
1073 | __vf1 = (__vector float)vec_ctf(__vm1, 0); |
1074 | |
1075 | return ((__m128)(__vector unsigned long long){ |
1076 | ((__vector unsigned long long)__vf1)[0], |
1077 | ((__vector unsigned long long)__A)[1]}); |
1078 | } |
1079 | |
1080 | extern __inline __m128 |
1081 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1082 | _mm_cvt_pi2ps(__m128 __A, __m64 __B) { |
1083 | return _mm_cvtpi32_ps(__A, __B); |
1084 | } |
1085 | |
1086 | /* Convert the four signed 16-bit values in A to SPFP form. */ |
1087 | extern __inline __m128 |
1088 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1089 | _mm_cvtpi16_ps(__m64 __A) { |
1090 | __vector signed short __vs8; |
1091 | __vector signed int __vi4; |
1092 | __vector float __vf1; |
1093 | |
1094 | __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A}; |
1095 | __vi4 = vec_vupklsh(__vs8); |
1096 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1097 | |
1098 | return (__m128)__vf1; |
1099 | } |
1100 | |
1101 | /* Convert the four unsigned 16-bit values in A to SPFP form. */ |
1102 | extern __inline __m128 |
1103 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1104 | _mm_cvtpu16_ps(__m64 __A) { |
1105 | const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0}; |
1106 | __vector unsigned short __vs8; |
1107 | __vector unsigned int __vi4; |
1108 | __vector float __vf1; |
1109 | |
1110 | __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A}; |
1111 | __vi4 = (__vector unsigned int)vec_mergel |
1112 | #ifdef __LITTLE_ENDIAN__ |
1113 | (__vs8, __zero); |
1114 | #else |
1115 | (__zero, __vs8); |
1116 | #endif |
1117 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1118 | |
1119 | return (__m128)__vf1; |
1120 | } |
1121 | |
1122 | /* Convert the low four signed 8-bit values in A to SPFP form. */ |
1123 | extern __inline __m128 |
1124 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1125 | _mm_cvtpi8_ps(__m64 __A) { |
1126 | __vector signed char __vc16; |
1127 | __vector signed short __vs8; |
1128 | __vector signed int __vi4; |
1129 | __vector float __vf1; |
1130 | |
1131 | __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A}; |
1132 | __vs8 = vec_vupkhsb(__vc16); |
1133 | __vi4 = vec_vupkhsh(__vs8); |
1134 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1135 | |
1136 | return (__m128)__vf1; |
1137 | } |
1138 | |
1139 | /* Convert the low four unsigned 8-bit values in A to SPFP form. */ |
1140 | extern __inline __m128 |
1141 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1142 | |
1143 | _mm_cvtpu8_ps(__m64 __A) { |
1144 | const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0}; |
1145 | __vector unsigned char __vc16; |
1146 | __vector unsigned short __vs8; |
1147 | __vector unsigned int __vi4; |
1148 | __vector float __vf1; |
1149 | |
1150 | __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A}; |
1151 | #ifdef __LITTLE_ENDIAN__ |
1152 | __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero); |
1153 | __vi4 = |
1154 | (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero); |
1155 | #else |
1156 | __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16); |
1157 | __vi4 = |
1158 | (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8); |
1159 | #endif |
1160 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1161 | |
1162 | return (__m128)__vf1; |
1163 | } |
1164 | |
1165 | /* Convert the four signed 32-bit values in A and B to SPFP form. */ |
1166 | extern __inline __m128 |
1167 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1168 | _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { |
1169 | __vector signed int __vi4; |
1170 | __vector float __vf4; |
1171 | |
1172 | __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B}; |
1173 | __vf4 = (__vector float)vec_ctf(__vi4, 0); |
1174 | return (__m128)__vf4; |
1175 | } |
1176 | |
1177 | /* Convert the four SPFP values in A to four signed 16-bit integers. */ |
1178 | extern __inline __m64 |
1179 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1180 | _mm_cvtps_pi16(__m128 __A) { |
1181 | __v4sf __rounded; |
1182 | __vector signed int __temp; |
1183 | __vector unsigned long long __result; |
1184 | |
1185 | __rounded = vec_rint(__A); |
1186 | __temp = vec_cts(__rounded, 0); |
1187 | __result = (__vector unsigned long long)vec_pack(__temp, __temp); |
1188 | |
1189 | return (__m64)((__vector long long)__result)[0]; |
1190 | } |
1191 | |
1192 | /* Convert the four SPFP values in A to four signed 8-bit integers. */ |
1193 | extern __inline __m64 |
1194 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1195 | _mm_cvtps_pi8(__m128 __A) { |
1196 | __v4sf __rounded; |
1197 | __vector signed int __tmp_i; |
1198 | static const __vector signed int __zero = {0, 0, 0, 0}; |
1199 | __vector signed short __tmp_s; |
1200 | __vector signed char __res_v; |
1201 | |
1202 | __rounded = vec_rint(__A); |
1203 | __tmp_i = vec_cts(__rounded, 0); |
1204 | __tmp_s = vec_pack(__tmp_i, __zero); |
1205 | __res_v = vec_pack(__tmp_s, __tmp_s); |
1206 | return (__m64)((__vector long long)__res_v)[0]; |
1207 | } |
1208 | |
1209 | /* Selects four specific SPFP values from A and B based on MASK. */ |
1210 | extern __inline __m128 |
1211 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1212 | |
1213 | _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { |
1214 | unsigned long __element_selector_10 = __mask & 0x03; |
1215 | unsigned long __element_selector_32 = (__mask >> 2) & 0x03; |
1216 | unsigned long __element_selector_54 = (__mask >> 4) & 0x03; |
1217 | unsigned long __element_selector_76 = (__mask >> 6) & 0x03; |
1218 | static const unsigned int __permute_selectors[4] = { |
1219 | #ifdef __LITTLE_ENDIAN__ |
1220 | 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C |
1221 | #else |
1222 | 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F |
1223 | #endif |
1224 | }; |
1225 | __vector unsigned int __t; |
1226 | |
1227 | __t[0] = __permute_selectors[__element_selector_10]; |
1228 | __t[1] = __permute_selectors[__element_selector_32]; |
1229 | __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; |
1230 | __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; |
1231 | return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t); |
1232 | } |
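/* Worked example (illustrative): __mask = _MM_SHUFFLE(1, 0, 3, 2) == 0x4E
   yields the 2-bit selectors 2, 3, 0, 1, so with

     __m128 __a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);    a = {0, 1, 2, 3}
     __m128 __b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);    b = {4, 5, 6, 7}

   _mm_shuffle_ps(__a, __b, 0x4E) produces {2, 3, 4, 5}: the low two results
   come from __a and the high two from __b. The selectors index
   __permute_selectors to build a byte-level vec_perm control; adding
   0x10101010 redirects the upper two selections to the bytes of __B. */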
1233 | |
1234 | /* Selects and interleaves the upper two SPFP values from A and B. */ |
1235 | extern __inline __m128 |
1236 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1237 | _mm_unpackhi_ps(__m128 __A, __m128 __B) { |
1238 | return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B); |
1239 | } |
1240 | |
1241 | /* Selects and interleaves the lower two SPFP values from A and B. */ |
1242 | extern __inline __m128 |
1243 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1244 | _mm_unpacklo_ps(__m128 __A, __m128 __B) { |
1245 | return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B); |
1246 | } |
1247 | |
1248 | /* Sets the upper two SPFP values with 64-bits of data loaded from P; |
1249 | the lower two values are passed through from A. */ |
1250 | extern __inline __m128 |
1251 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1252 | _mm_loadh_pi(__m128 __A, __m64 const *__P) { |
1253 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1254 | __vector unsigned long long __p = vec_splats(*__P); |
1255 | __a[1] = __p[1]; |
1256 | |
1257 | return (__m128)__a; |
1258 | } |
1259 | |
1260 | /* Stores the upper two SPFP values of A into P. */ |
1261 | extern __inline void |
1262 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1263 | _mm_storeh_pi(__m64 *__P, __m128 __A) { |
1264 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1265 | |
1266 | *__P = __a[1]; |
1267 | } |
1268 | |
1269 | /* Moves the upper two values of B into the lower two values of A. */ |
1270 | extern __inline __m128 |
1271 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1272 | _mm_movehl_ps(__m128 __A, __m128 __B) { |
1273 | return (__m128)vec_mergel((__vector unsigned long long)__B, |
1274 | (__vector unsigned long long)__A); |
1275 | } |
1276 | |
1277 | /* Moves the lower two values of B into the upper two values of A. */ |
1278 | extern __inline __m128 |
1279 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1280 | _mm_movelh_ps(__m128 __A, __m128 __B) { |
1281 | return (__m128)vec_mergeh((__vector unsigned long long)__A, |
1282 | (__vector unsigned long long)__B); |
1283 | } |
1284 | |
1285 | /* Sets the lower two SPFP values with 64-bits of data loaded from P; |
1286 | the upper two values are passed through from A. */ |
1287 | extern __inline __m128 |
1288 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1289 | _mm_loadl_pi(__m128 __A, __m64 const *__P) { |
1290 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1291 | __vector unsigned long long __p = vec_splats(*__P); |
1292 | __a[0] = __p[0]; |
1293 | |
1294 | return (__m128)__a; |
1295 | } |
1296 | |
1297 | /* Stores the lower two SPFP values of A into P. */ |
1298 | extern __inline void |
1299 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1300 | _mm_storel_pi(__m64 *__P, __m128 __A) { |
1301 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1302 | |
1303 | *__P = __a[0]; |
1304 | } |
1305 | |
1306 | #ifdef _ARCH_PWR8 |
1307 | /* Intrinsic functions that require PowerISA 2.07 minimum. */ |
1308 | |
1309 | /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ |
1310 | extern __inline int |
1311 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1312 | _mm_movemask_ps(__m128 __A) { |
1313 | #ifdef _ARCH_PWR10 |
1314 | return vec_extractm((__vector unsigned int)__A); |
1315 | #else |
1316 | __vector unsigned long long __result; |
1317 | static const __vector unsigned int __perm_mask = { |
1318 | #ifdef __LITTLE_ENDIAN__ |
1319 | 0x00204060, 0x80808080, 0x80808080, 0x80808080 |
1320 | #else |
1321 | 0x80808080, 0x80808080, 0x80808080, 0x00204060 |
1322 | #endif |
1323 | }; |
1324 | |
1325 | __result = ((__vector unsigned long long)vec_vbpermq( |
1326 | (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); |
1327 | |
1328 | #ifdef __LITTLE_ENDIAN__ |
1329 | return __result[1]; |
1330 | #else |
1331 | return __result[0]; |
1332 | #endif |
1333 | #endif /* !_ARCH_PWR10 */ |
1334 | } |
1335 | #endif /* _ARCH_PWR8 */ |
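/* Sketch of the vec_vbpermq path above (descriptive, not normative): each
   byte of __perm_mask names a bit position (0..127, numbered from the MSB)
   to gather from __A, and 0x80 selects a constant zero. The four positions
   0x00, 0x20, 0x40 and 0x60 are the sign bits of the four 32-bit lanes, so
   the gathered 4-bit field is exactly the SSE movemask. vbpermq deposits
   the gathered bits in one halfword of the 128-bit result, which is why the
   doubleword extracted differs between little- and big-endian. */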
1336 | |
1337 | /* Create a vector with all four elements equal to *P. */ |
1338 | extern __inline __m128 |
1339 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1340 | _mm_load1_ps(float const *__P) { |
1341 | return _mm_set1_ps(*__P); |
1342 | } |
1343 | |
1344 | extern __inline __m128 |
1345 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1346 | _mm_load_ps1(float const *__P) { |
1347 | return _mm_load1_ps(__P); |
1348 | } |
1349 | |
1350 | /* Extracts one of the four words of A. The selector N must be an immediate. */ |
1351 | extern __inline int |
1352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1353 | _mm_extract_pi16(__m64 const __A, int const __N) { |
1354 | unsigned int __shiftr = __N & 3; |
1355 | #ifdef __BIG_ENDIAN__ |
1356 | __shiftr = 3 - __shiftr; |
1357 | #endif |
1358 | |
1359 | return ((__A >> (__shiftr * 16)) & 0xffff); |
1360 | } |
1361 | |
1362 | extern __inline int |
1363 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1364 | _m_pextrw(__m64 const __A, int const __N) { |
1365 | return _mm_extract_pi16(__A, __N); |
1366 | } |
1367 | |
1368 | /* Inserts word D into one of the four words of A. The selector N must be |
1369 | an immediate. */ |
1370 | extern __inline __m64 |
1371 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1372 | _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { |
1373 | const int __shiftl = (__N & 3) * 16; |
1374 | const __m64 __shiftD = (const __m64)__D << __shiftl; |
1375 | const __m64 __mask = 0xffffUL << __shiftl; |
1376 | __m64 __result = (__A & (~__mask)) | (__shiftD & __mask); |
1377 | |
1378 | return __result; |
1379 | } |
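/* Worked example (illustrative): _mm_insert_pi16(__A, 0x1234, 2) computes
   __shiftl = 32, __mask = 0x0000ffff00000000ULL and __shiftD =
   0x0000123400000000ULL, so only 16-bit word 2 of __A is replaced.
   _mm_extract_pi16 reverses the operation by shifting the selected word
   down and masking with 0xffff (with the index mirrored on big-endian). */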
1380 | |
1381 | extern __inline __m64 |
1382 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1383 | _m_pinsrw(__m64 const __A, int const __D, int const __N) { |
1384 | return _mm_insert_pi16(__A, __D, __N); |
1385 | } |
1386 | |
1387 | /* Compute the element-wise maximum of signed 16-bit values. */ |
1388 | extern __inline __m64 |
1389 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1390 | |
1391 | _mm_max_pi16(__m64 __A, __m64 __B) { |
1392 | #if _ARCH_PWR8 |
1393 | __vector signed short __a, __b, __r; |
1394 | __vector __bool short __c; |
1395 | |
1396 | __a = (__vector signed short)vec_splats(__A); |
1397 | __b = (__vector signed short)vec_splats(__B); |
1398 | __c = (__vector __bool short)vec_cmpgt(__a, __b); |
1399 | __r = vec_sel(__b, __a, __c); |
1400 | return (__m64)((__vector long long)__r)[0]; |
1401 | #else |
1402 | __m64_union __m1, __m2, __res; |
1403 | |
1404 | __m1.as_m64 = __A; |
1405 | __m2.as_m64 = __B; |
1406 | |
1407 | __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] |
1408 | : __m2.as_short[0]; |
1409 | __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] |
1410 | : __m2.as_short[1]; |
1411 | __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] |
1412 | : __m2.as_short[2]; |
1413 | __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] |
1414 | : __m2.as_short[3]; |
1415 | |
1416 | return (__m64)__res.as_m64; |
1417 | #endif |
1418 | } |
1419 | |
1420 | extern __inline __m64 |
1421 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1422 | _m_pmaxsw(__m64 __A, __m64 __B) { |
1423 | return _mm_max_pi16(__A, __B); |
1424 | } |
1425 | |
1426 | /* Compute the element-wise maximum of unsigned 8-bit values. */ |
1427 | extern __inline __m64 |
1428 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1429 | _mm_max_pu8(__m64 __A, __m64 __B) { |
1430 | #if _ARCH_PWR8 |
1431 | __vector unsigned char __a, __b, __r; |
1432 | __vector __bool char __c; |
1433 | |
1434 | __a = (__vector unsigned char)vec_splats(__A); |
1435 | __b = (__vector unsigned char)vec_splats(__B); |
1436 | __c = (__vector __bool char)vec_cmpgt(__a, __b); |
1437 | __r = vec_sel(__b, __a, __c); |
1438 | return (__m64)((__vector long long)__r)[0]; |
1439 | #else |
1440 | __m64_union __m1, __m2, __res; |
1441 | long __i; |
1442 | |
1443 | __m1.as_m64 = __A; |
1444 | __m2.as_m64 = __B; |
1445 | |
1446 | for (__i = 0; __i < 8; __i++) |
1447 | __res.as_char[__i] = |
1448 | ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i]) |
1449 | ? __m1.as_char[__i] |
1450 | : __m2.as_char[__i]; |
1451 | |
1452 | return (__m64)__res.as_m64; |
1453 | #endif |
1454 | } |
1455 | |
1456 | extern __inline __m64 |
1457 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1458 | _m_pmaxub(__m64 __A, __m64 __B) { |
1459 | return _mm_max_pu8(__A, __B); |
1460 | } |
1461 | |
1462 | /* Compute the element-wise minimum of signed 16-bit values. */ |
1463 | extern __inline __m64 |
1464 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1465 | _mm_min_pi16(__m64 __A, __m64 __B) { |
1466 | #if _ARCH_PWR8 |
1467 | __vector signed short __a, __b, __r; |
1468 | __vector __bool short __c; |
1469 | |
1470 | __a = (__vector signed short)vec_splats(__A); |
1471 | __b = (__vector signed short)vec_splats(__B); |
1472 | __c = (__vector __bool short)vec_cmplt(__a, __b); |
1473 | __r = vec_sel(__b, __a, __c); |
1474 | return (__m64)((__vector long long)__r)[0]; |
1475 | #else |
1476 | __m64_union __m1, __m2, __res; |
1477 | |
1478 | __m1.as_m64 = __A; |
1479 | __m2.as_m64 = __B; |
1480 | |
1481 | __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] |
1482 | : __m2.as_short[0]; |
1483 | __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] |
1484 | : __m2.as_short[1]; |
1485 | __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] |
1486 | : __m2.as_short[2]; |
1487 | __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] |
1488 | : __m2.as_short[3]; |
1489 | |
1490 | return (__m64)__res.as_m64; |
1491 | #endif |
1492 | } |
1493 | |
1494 | extern __inline __m64 |
1495 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1496 | _m_pminsw(__m64 __A, __m64 __B) { |
1497 | return _mm_min_pi16(__A, __B); |
1498 | } |
1499 | |
1500 | /* Compute the element-wise minimum of unsigned 8-bit values. */ |
1501 | extern __inline __m64 |
1502 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1503 | _mm_min_pu8(__m64 __A, __m64 __B) { |
1504 | #if _ARCH_PWR8 |
1505 | __vector unsigned char __a, __b, __r; |
1506 | __vector __bool char __c; |
1507 | |
1508 | __a = (__vector unsigned char)vec_splats(__A); |
1509 | __b = (__vector unsigned char)vec_splats(__B); |
1510 | __c = (__vector __bool char)vec_cmplt(__a, __b); |
1511 | __r = vec_sel(__b, __a, __c); |
1512 | return (__m64)((__vector long long)__r)[0]; |
1513 | #else |
1514 | __m64_union __m1, __m2, __res; |
1515 | long __i; |
1516 | |
1517 | __m1.as_m64 = __A; |
1518 | __m2.as_m64 = __B; |
1519 | |
1520 | for (__i = 0; __i < 8; __i++) |
1521 | __res.as_char[__i] = |
1522 | ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i]) |
1523 | ? __m1.as_char[__i] |
1524 | : __m2.as_char[__i]; |
1525 | |
1526 | return (__m64)__res.as_m64; |
1527 | #endif |
1528 | } |
1529 | |
1530 | extern __inline __m64 |
1531 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1532 | _m_pminub(__m64 __A, __m64 __B) { |
1533 | return _mm_min_pu8(__A, __B); |
1534 | } |
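/* Editorial note and usage sketch (not part of the original header): the
   max/min intrinsics above all use the same POWER8+ idiom: vec_splats()
   copies the 64-bit __m64 into both halves of a 128-bit vector,
   vec_cmpgt/vec_cmplt builds a per-lane mask, vec_sel picks the winning
   lanes, and doubleword 0 of the result is moved back to a GPR. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi16(-5, 7, 0, 300);
  __m64 b = _mm_setr_pi16(4, -7, 1, 200);
  __m64 mx = _mm_max_pi16(a, b);                    /* 4, 7, 1, 300 */
  __m64 mn = _mm_min_pi16(a, b);                    /* -5, -7, 0, 200 */
  printf("%d %d\n", _mm_extract_pi16(mx, 0), _mm_extract_pi16(mn, 3));
  /* prints "4 200" */
  return 0;
}
#endif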
1535 | |
1536 | /* Create an 8-bit mask of the signs of 8-bit values. */ |
1537 | extern __inline int |
1538 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1539 | _mm_movemask_pi8(__m64 __A) { |
1540 | #ifdef __powerpc64__ |
1541 | unsigned long long __p = |
1542 | #ifdef __LITTLE_ENDIAN__ |
1543 | 0x0008101820283038UL; // permute control for sign bits |
1544 | #else |
1545 | 0x3830282018100800UL; // permute control for sign bits |
1546 | #endif |
1547 | return __builtin_bpermd(__p, __A); |
1548 | #else |
1549 | #ifdef __LITTLE_ENDIAN__ |
1550 | unsigned int __mask = 0x20283038UL; |
1551 | unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf; |
1552 | unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf; |
1553 | #else |
1554 | unsigned int __mask = 0x38302820UL; |
1555 | unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf; |
1556 | unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf; |
1557 | #endif |
1558 | return (__r2 << 4) | __r1; |
1559 | #endif |
1560 | } |
1561 | |
1562 | extern __inline int |
1563 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1564 | _m_pmovmskb(__m64 __A) { |
1565 | return _mm_movemask_pi8(__A); |
1566 | } |
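/* Editorial usage sketch (not part of the original header): the 8-bit sign
   mask produced by _mm_movemask_pi8.  The bpermd control doubleword above
   simply lists the (MSB-relative) positions of the eight byte sign bits to
   gather. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 v = _mm_setr_pi8(-1, 2, -3, 4, 5, 6, 7, -8);
  /* Sign bits, byte 0 first: 1,0,1,0,0,0,0,1 -> 0b10000101.  */
  printf("0x%x\n", _mm_movemask_pi8(v)); /* expected: 0x85 */
  return 0;
}
#endif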
1567 | |
1568 | /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values |
1569 | in B and produce the high 16 bits of the 32-bit results. */ |
1570 | extern __inline __m64 |
1571 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1572 | _mm_mulhi_pu16(__m64 __A, __m64 __B) { |
1573 | __vector unsigned short __a, __b; |
1574 | __vector unsigned short __c; |
1575 | __vector unsigned int __w0, __w1; |
1576 | __vector unsigned char __xform1 = { |
1577 | #ifdef __LITTLE_ENDIAN__ |
1578 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, |
1579 | 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F |
1580 | #else |
1581 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00, |
1582 | 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 |
1583 | #endif |
1584 | }; |
1585 | |
1586 | __a = (__vector unsigned short)vec_splats(__A); |
1587 | __b = (__vector unsigned short)vec_splats(__B); |
1588 | |
1589 | __w0 = vec_vmuleuh(__a, __b); |
1590 | __w1 = vec_vmulouh(__a, __b); |
1591 | __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1); |
1592 | |
1593 | return (__m64)((__vector long long)__c)[0]; |
1594 | } |
1595 | |
1596 | extern __inline __m64 |
1597 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1598 | _m_pmulhuw(__m64 __A, __m64 __B) { |
1599 | return _mm_mulhi_pu16(__A, __B); |
1600 | } |
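/* Editorial usage sketch (not part of the original header): _mm_mulhi_pu16
   keeps only the upper 16 bits of each unsigned 32-bit product, e.g.
   50000 * 50000 = 2500000000 = 0x9502F900, whose high halfword is 0x9502. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi16((short)50000, 2, 3, 4);
  __m64 b = _mm_setr_pi16((short)50000, 1000, 1000, 1000);
  printf("0x%x\n", _mm_extract_pi16(_mm_mulhi_pu16(a, b), 0));
  /* expected: 0x9502 */
  return 0;
}
#endif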
1601 | |
1602 | /* Return a combination of the four 16-bit values in A. The selector |
1603 | must be an immediate. */ |
1604 | extern __inline __m64 |
1605 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1606 | _mm_shuffle_pi16(__m64 __A, int const __N) { |
1607 | unsigned long __element_selector_10 = __N & 0x03; |
1608 | unsigned long __element_selector_32 = (__N >> 2) & 0x03; |
1609 | unsigned long __element_selector_54 = (__N >> 4) & 0x03; |
1610 | unsigned long __element_selector_76 = (__N >> 6) & 0x03; |
1611 | static const unsigned short __permute_selectors[4] = { |
1612 | #ifdef __LITTLE_ENDIAN__ |
1613 | 0x0908, 0x0B0A, 0x0D0C, 0x0F0E |
1614 | #else |
1615 | 0x0607, 0x0405, 0x0203, 0x0001 |
1616 | #endif |
1617 | }; |
1618 | __m64_union __t; |
1619 | __vector unsigned long long __a, __p, __r; |
1620 | |
1621 | #ifdef __LITTLE_ENDIAN__ |
1622 | __t.as_short[0] = __permute_selectors[__element_selector_10]; |
1623 | __t.as_short[1] = __permute_selectors[__element_selector_32]; |
1624 | __t.as_short[2] = __permute_selectors[__element_selector_54]; |
1625 | __t.as_short[3] = __permute_selectors[__element_selector_76]; |
1626 | #else |
1627 | __t.as_short[3] = __permute_selectors[__element_selector_10]; |
1628 | __t.as_short[2] = __permute_selectors[__element_selector_32]; |
1629 | __t.as_short[1] = __permute_selectors[__element_selector_54]; |
1630 | __t.as_short[0] = __permute_selectors[__element_selector_76]; |
1631 | #endif |
1632 | __p = vec_splats(__t.as_m64); |
1633 | __a = vec_splats(__A); |
1634 | __r = vec_perm(__a, __a, (__vector unsigned char)__p); |
1635 | return (__m64)((__vector long long)__r)[0]; |
1636 | } |
1637 | |
1638 | extern __inline __m64 |
1639 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1640 | _m_pshufw(__m64 __A, int const __N) { |
1641 | return _mm_shuffle_pi16(__A, __N); |
1642 | } |
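/* Editorial usage sketch (not part of the original header): the selector is
   four 2-bit lane indices, exactly as built by _MM_SHUFFLE(w,x,y,z); e.g.
   _MM_SHUFFLE(0,1,2,3) == 0x1B reverses the four halfwords. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 v = _mm_setr_pi16(10, 20, 30, 40);
  __m64 r = _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3)); /* 40, 30, 20, 10 */
  printf("%d %d\n", _mm_extract_pi16(r, 0), _mm_extract_pi16(r, 3));
  /* prints "40 10" */
  return 0;
}
#endif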
1643 | |
1644 | /* Conditionally store byte elements of A into P. The high bit of each |
1645 | byte in the selector N determines whether the corresponding byte from |
1646 | A is stored. */ |
1647 | extern __inline void |
1648 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1649 | _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { |
1650 | __m64 __hibit = 0x8080808080808080UL; |
1651 | __m64 __mask, __tmp; |
1652 | __m64 *__p = (__m64 *)__P; |
1653 | |
1654 | __tmp = *__p; |
1655 | __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit); |
1656 | __tmp = (__tmp & (~__mask)) | (__A & __mask); |
1657 | *__p = __tmp; |
1658 | } |
1659 | |
1660 | extern __inline void |
1661 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1662 | _m_maskmovq(__m64 __A, __m64 __N, char *__P) { |
1663 | _mm_maskmove_si64(__A, __N, __P); |
1664 | } |
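/* Editorial usage sketch (not part of the original header): only bytes whose
   mask byte has its high bit set are written; the remaining destination
   bytes are left untouched. */
#if 0
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

int main(void) {
  char dst[8];
  memset(dst, '.', sizeof(dst));
  __m64 data = _mm_setr_pi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H');
  __m64 mask = _mm_setr_pi8(-1, 0, -1, 0, 0, 0, 0, -1); /* write bytes 0, 2, 7 */
  _mm_maskmove_si64(data, mask, dst);
  printf("%.8s\n", dst); /* expected: "A.C....H" */
  return 0;
}
#endif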
1665 | |
1666 | /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ |
1667 | extern __inline __m64 |
1668 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1669 | _mm_avg_pu8(__m64 __A, __m64 __B) { |
1670 | __vector unsigned char __a, __b, __c; |
1671 | |
1672 | __a = (__vector unsigned char)vec_splats(__A); |
1673 | __b = (__vector unsigned char)vec_splats(__B); |
1674 | __c = vec_avg(__a, __b); |
1675 | return (__m64)((__vector long long)__c)[0]; |
1676 | } |
1677 | |
1678 | extern __inline __m64 |
1679 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1680 | _m_pavgb(__m64 __A, __m64 __B) { |
1681 | return _mm_avg_pu8(__A, __B); |
1682 | } |
1683 | |
1684 | /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ |
1685 | extern __inline __m64 |
1686 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1687 | _mm_avg_pu16(__m64 __A, __m64 __B) { |
1688 | __vector unsigned short __a, __b, __c; |
1689 | |
1690 | __a = (__vector unsigned short)vec_splats(__A); |
1691 | __b = (__vector unsigned short)vec_splats(__B); |
1692 | __c = vec_avg(__a, __b); |
1693 | return (__m64)((__vector long long)__c)[0]; |
1694 | } |
1695 | |
1696 | extern __inline __m64 |
1697 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1698 | _m_pavgw(__m64 __A, __m64 __B) { |
1699 | return _mm_avg_pu16(__A, __B); |
1700 | } |
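/* Editorial usage sketch (not part of the original header): the "rounded
   average" is (a + b + 1) >> 1, so averaging 4 and 7 yields 6, not 5. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi8(4, 10, 0, 0, 0, 0, 0, 0);
  __m64 b = _mm_setr_pi8(7, 10, 1, 0, 0, 0, 0, 0);
  __m64 r = _mm_avg_pu8(a, b); /* bytes: 6, 10, 1, 0, ... */
  printf("%d\n", _mm_extract_pi16(r, 0) & 0xff); /* expected: 6 */
  return 0;
}
#endif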
1701 | |
1702 | /* Compute the sum of the absolute differences of the unsigned 8-bit |
1703 | values in A and B. Return the value in the lower 16-bit word; the |
1704 | upper words are cleared. */ |
1705 | extern __inline __m64 |
1706 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1707 | _mm_sad_pu8(__m64 __A, __m64 __B) { |
1708 | __vector unsigned char __a, __b; |
1709 | __vector unsigned char __vmin, __vmax, __vabsdiff; |
1710 | __vector signed int __vsum; |
1711 | const __vector unsigned int __zero = {0, 0, 0, 0}; |
1712 | __m64_union __result = {0}; |
1713 | |
1714 | __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A}; |
1715 | __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B}; |
1716 | __vmin = vec_min(__a, __b); |
1717 | __vmax = vec_max(__a, __b); |
1718 | __vabsdiff = vec_sub(__vmax, __vmin); |
1719 | /* Sum four groups of bytes into integers. */ |
1720 | __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero); |
1721 | /* Sum across four integers with integer result. */ |
1722 | __vsum = vec_sums(__vsum, (__vector signed int)__zero); |
1723 | /* The sum is in the rightmost 32 bits of the vector result. |
1724 | Transfer to a GPR and truncate to 16 bits. */ |
1725 | __result.as_short[0] = __vsum[3]; |
1726 | return __result.as_m64; |
1727 | } |
1728 | |
1729 | extern __inline __m64 |
1730 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1731 | _m_psadbw(__m64 __A, __m64 __B) { |
1732 | return _mm_sad_pu8(__A, __B); |
1733 | } |
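/* Editorial usage sketch (not part of the original header): the sum of
   absolute byte differences, e.g. |1-4| + |2-2| + |7-1| = 9, lands in the
   low 16-bit lane with the remaining lanes cleared. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi8(1, 2, 7, 0, 0, 0, 0, 0);
  __m64 b = _mm_setr_pi8(4, 2, 1, 0, 0, 0, 0, 0);
  printf("%d\n", _mm_extract_pi16(_mm_sad_pu8(a, b), 0)); /* expected: 9 */
  return 0;
}
#endif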
1734 | |
1735 | /* Stores the data in A to the address P without polluting the caches. */ |
1736 | extern __inline void |
1737 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1738 | _mm_stream_pi(__m64 *__P, __m64 __A) { |
1739 | /* Use the data cache block touch for store transient. */ |
1740 | __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); |
1741 | *__P = __A; |
1742 | } |
1743 | |
1744 | /* Likewise. The address must be 16-byte aligned. */ |
1745 | extern __inline void |
1746 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1747 | _mm_stream_ps(float *__P, __m128 __A) { |
1748 | /* Use the data cache block touch for store transient. */ |
1749 | __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); |
1750 | _mm_store_ps(__P, __A); |
1751 | } |
1752 | |
1753 | /* Guarantees that every preceding store is globally visible before |
1754 | any subsequent store. */ |
1755 | extern __inline void |
1756 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1757 | _mm_sfence(void) { |
1758 | /* Generate a lightweight sync. */ |
1759 | __atomic_thread_fence(__ATOMIC_RELEASE); |
1760 | } |
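/* Editorial usage sketch (not part of the original header): a typical
   non-temporal store pattern -- fill a buffer with _mm_stream_ps, then
   publish it with _mm_sfence before another thread reads it.  The buffer
   and flag names are illustrative only; a real program would use a proper
   atomic for the flag. */
#if 0
#include <xmmintrin.h>

/* 16-byte aligned destination, as _mm_stream_ps requires. */
static float __attribute__((aligned(16))) out_buf[64];

void fill_and_publish(volatile int *ready_flag) {
  __m128 v = _mm_set1_ps(1.0f);
  for (int i = 0; i < 64; i += 4)
    _mm_stream_ps(&out_buf[i], v);
  _mm_sfence();    /* make the stores visible before raising the flag */
  *ready_flag = 1;
}
#endif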
1761 | |
1762 | /* The execution of the next instruction is delayed by an implementation- |
1763 | specific amount of time.  The instruction does not modify the |
1764 | architectural state.  (In the x86 header this intrinsic sits outside the |
1765 | SSE target pragma because it does not require SSE support; the encoding |
1766 | is simply a NOP on processors that do not support it.) */ |
1767 | extern __inline void |
1768 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1769 | _mm_pause(void) { |
1770 | /* There is no exact match with this construct, but the following is |
1771 | close to the desired effect. */ |
1772 | #if _ARCH_PWR8 |
1773 | /* On POWER8 and later processors we can depend on Program Priority |
1774 | (PRI) and its associated "very low" setting.  Since we don't know |
1775 | what PRI this thread is running at, we: 1) save the current PRI |
1776 | from the PPR SPR into a local GPR, 2) set the PRI to "very low" |
1777 | via the special or 31,31,31 encoding, 3) issue an "isync" to |
1778 | ensure the PRI change takes effect before we execute any more |
1779 | instructions. |
1780 | Now we can execute an lwsync (release barrier) while this thread |
1781 | runs at "very low" PRI.  Finally we restore the original PRI and |
1782 | continue execution.  */ |
1783 | unsigned long __PPR; |
1784 | |
1785 | __asm__ volatile(" mfppr %0;" |
1786 | " or 31,31,31;" |
1787 | " isync;" |
1788 | " lwsync;" |
1789 | " isync;" |
1790 | " mtppr %0;" |
1791 | : "=r"(__PPR) |
1792 | : |
1793 | : "memory"); |
1794 | #else |
1795 | /* For older processors, where we may not even have Program Priority |
1796 | controls, we can only depend on a heavyweight sync. */ |
1797 | __atomic_thread_fence(__ATOMIC_SEQ_CST); |
1798 | #endif |
1799 | } |
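/* Editorial usage sketch (not part of the original header): the usual home
   of _mm_pause is a spin-wait loop, where dropping the thread priority (or
   issuing a heavier sync on older cores) makes the busy wait cheaper for
   the rest of the system.  The flag name is illustrative only. */
#if 0
#include <xmmintrin.h>

void spin_until_ready(volatile int *ready_flag) {
  while (!*ready_flag)
    _mm_pause(); /* back off politely while waiting */
}
#endif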
1800 | |
1801 | /* Transpose the 4x4 matrix composed of row[0-3]. */ |
1802 | #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ |
1803 | do { \ |
1804 | __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ |
1805 | __v4sf __t0 = vec_vmrghw(__r0, __r1); \ |
1806 | __v4sf __t1 = vec_vmrghw(__r2, __r3); \ |
1807 | __v4sf __t2 = vec_vmrglw(__r0, __r1); \ |
1808 | __v4sf __t3 = vec_vmrglw(__r2, __r3); \ |
1809 | (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \ |
1810 | (__vector long long)__t1); \ |
1811 | (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \ |
1812 | (__vector long long)__t1); \ |
1813 | (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \ |
1814 | (__vector long long)__t3); \ |
1815 | (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \ |
1816 | (__vector long long)__t3); \ |
1817 | } while (0) |
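/* Editorial usage sketch (not part of the original header): transposing a
   4x4 matrix held in four __m128 rows in place. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m128 m0 = _mm_setr_ps(1, 2, 3, 4);
  __m128 m1 = _mm_setr_ps(5, 6, 7, 8);
  __m128 m2 = _mm_setr_ps(9, 10, 11, 12);
  __m128 m3 = _mm_setr_ps(13, 14, 15, 16);
  _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
  float out[4];
  _mm_storeu_ps(out, m0); /* the first row becomes the first column */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  /* prints "1 5 9 13" */
  return 0;
}
#endif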
1818 | |
1819 | /* For backward source compatibility. */ |
1820 | //# include <emmintrin.h> |
1821 | |
1822 | #else |
1823 | #include_next <xmmintrin.h> |
1824 | #endif /* defined(__powerpc64__) && \ |
1825 | * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ |
1826 | |
1827 | #endif /* XMMINTRIN_H_ */ |
1828 |