/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help in porting code that uses Intel
   intrinsics from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to a 64-bit unsigned long long in these MMX intrinsics,
   which works well for the _si64 and some _pi32 operations.

   For the _pi16 and _pi8 operations, it's better to transfer the __m64
   into a 128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make such implementations more efficient.

   It's the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed. Please note
   that much code using Intel intrinsics can be rewritten using more
   portable and efficient standard C or GNU C extensions with 64-bit scalar
   operations or 128-bit SSE/Altivec operations, which is the recommended
   approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
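
/* For example (an illustrative sketch only, not part of this header's API):
   a byte-wise add written with _mm_add_pi8 can usually be expressed
   directly with GNU C vector extensions, which compile to plain
   VMX/VSX instructions on PowerPC:

     typedef signed char __v16qi __attribute__((vector_size(16)));

     __v16qi add_bytes(__v16qi __a, __v16qi __b) {
       return __a + __b;  // element-wise add, no MMX emulation needed
     }
*/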

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
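
/* Example (illustrative only): __m64_union gives lane-wise access to the
   64-bit scalar that backs an __m64 value:

     __m64_union __u;
     __u.as_m64 = __some_m64;              // __some_m64 is hypothetical
     short __lane0 = __u.as_short[0];      // least-significant 16-bit lane
*/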

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}
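
/* Example (illustrative only): signed saturation clamps each 16-bit lane
   to the signed 8-bit range before packing, e.g.

     _mm_packs_pi16(_mm_set_pi16(300, -300, 5, -5),
                    _mm_set_pi16(0, 0, 127, -128));
     // lanes holding 300 and -300 saturate to 127 and -128 respectively
*/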

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}
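
/* Example (illustrative only): each byte lane compares independently, so

     _mm_cmpeq_pi8(_mm_set1_pi8(7), _mm_setr_pi8(7, 0, 7, 0, 7, 0, 7, 0));
     // yields 0xFF in lanes 0, 2, 4, 6 and 0x00 elsewhere

   On POWER6 and later the path above maps to the single cmpb instruction. */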

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}
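
/* Example (illustrative only):

     _mm_madd_pi16(_mm_setr_pi16(1, 2, 3, 4), _mm_setr_pi16(5, 6, 7, 8));
     // 32-bit lane 0: 1*5 + 2*6 = 17;  lane 1: 3*7 + 4*8 = 53
*/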

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}
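
/* Example (illustrative only): 0x4000 * 0x4000 = 0x10000000, whose high
   16 bits are 0x1000:

     _mm_mulhi_pi16(_mm_set1_pi16(0x4000), _mm_set1_pi16(0x4000));
     // every 16-bit lane of the result is 0x1000
*/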

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}
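
/* Example (illustrative only): _mm_set_* takes arguments most-significant
   lane first, while _mm_setr_* takes them least-significant lane first, so

     _mm_set_pi32(1, 0) == _mm_setr_pi32(0, 1);  // both have lane 0 == 0
*/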

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short __v;

  __v = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)__v)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */