/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help in porting code that uses Intel
   intrinsics from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to a 64-bit unsigned long long in these MMX intrinsics,
   which works well for the _si64 and some _pi32 operations.

   For the _pi16 and _pi8 operations, it's better to transfer the __m64
   into a 128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make such implementations more efficient.

   It's the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed. Please note
   that much code using Intel intrinsics can be rewritten using more
   portable and efficient standard C or GNU C extensions with 64-bit scalar
   operations or 128-bit SSE/Altivec operations, which is the recommended
   approach. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
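
/* For example (an illustrative sketch only, not part of this header's API):
   a byte-wise add written with _mm_add_pi8 can usually be expressed
   directly with GNU C vector extensions, which compile to plain
   VMX/VSX instructions on PowerPC:

     typedef signed char __v16qi __attribute__((vector_size(16)));

     __v16qi add_bytes(__v16qi __a, __v16qi __b) {
       return __a + __b;  // element-wise add, no MMX emulation needed
     }
*/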

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
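
/* Example (illustrative only): __m64_union gives lane-wise access to the
   64-bit scalar that backs an __m64 value:

     __m64_union __u;
     __u.as_m64 = __some_m64;              // __some_m64 is hypothetical
     short __lane0 = __u.as_short[0];      // least-significant 16-bit lane
*/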

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}
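
/* Example (illustrative only): signed saturation clamps each 16-bit lane
   to the signed 8-bit range before packing, e.g.

     _mm_packs_pi16(_mm_set_pi16(300, -300, 5, -5),
                    _mm_set_pi16(0, 0, 127, -128));
     // lanes holding 300 and -300 saturate to 127 and -128 respectively
*/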

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}
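
/* Example (illustrative only): each byte lane compares independently, so

     _mm_cmpeq_pi8(_mm_set1_pi8(7), _mm_setr_pi8(7, 0, 7, 0, 7, 0, 7, 0));
     // yields 0xFF in lanes 0, 2, 4, 6 and 0x00 elsewhere

   On POWER6 and later the path above maps to the single cmpb instruction. */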

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}
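
/* Example (illustrative only):

     _mm_madd_pi16(_mm_setr_pi16(1, 2, 3, 4), _mm_setr_pi16(5, 6, 7, 8));
     // 32-bit lane 0: 1*5 + 2*6 = 17;  lane 1: 3*7 + 4*8 = 53
*/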

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}
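
/* Example (illustrative only): 0x4000 * 0x4000 = 0x10000000, whose high
   16 bits are 0x1000:

     _mm_mulhi_pi16(_mm_set1_pi16(0x4000), _mm_set1_pi16(0x4000));
     // every 16-bit lane of the result is 0x1000
*/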

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}
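
/* Example (illustrative only): _mm_set_* takes arguments most-significant
   lane first, while _mm_setr_* takes them least-significant lane first, so

     _mm_set_pi32(1, 0) == _mm_setr_pi32(0, 1);  // both have lane 0 == 0
*/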

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short __v;

  __v = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)__v)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */