1 | /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===-----------------------------------------------------------------------=== |
8 | */ |
9 | |
10 | /* Implemented from the specification included in the Intel C++ Compiler |
11 | User Guide and Reference, version 9.0. */ |
12 | |
13 | #ifndef NO_WARN_X86_INTRINSICS |
14 | /* This header file is intended to help port code that uses Intel |
15 | intrinsics explicitly from x86_64 to powerpc64/powerpc64le. |
16 | |
17 | Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC |
18 | VMX/VSX ISA is a good match for vector float SIMD operations. |
19 | However, scalar float operations in vector (XMM) registers require |
20 | the POWER8 VSX ISA (2.07) level. There are differences in the data |
21 | format and placement of float scalars in the vector register, which |
22 | require extra steps to match SSE scalar float semantics on POWER. |
23 | |
24 | Note that there are significant differences between the x86_64 MXCSR |
25 | and the PowerISA FPSCR/VSCR registers. It is recommended to use the |
26 | portable <fenv.h> interface instead of accessing the MXCSR directly. |
27 | |
28 | Most SSE scalar float intrinsic operations can be performed more |
29 | efficiently as C language float scalar operations or optimized to |
30 | use vector SIMD operations. We recommend this for new applications. */ |
31 | #error \ |
32 | "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." |
33 | #endif |
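/* For example (illustrative sketch, not part of the original header): code
   that would otherwise poke the x86 MXCSR rounding-control bits can usually
   be ported with the portable <fenv.h> interface instead:

     #include <fenv.h>

     fesetround(FE_TOWARDZERO);   switch to round-toward-zero
     fesetround(FE_TONEAREST);    restore the default rounding mode

   as recommended in the comment above. */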
34 | |
35 | #ifndef XMMINTRIN_H_ |
36 | #define XMMINTRIN_H_ |
37 | |
38 | #if defined(__powerpc64__) && \ |
39 | (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) |
40 | |
41 | /* Define four value permute mask */ |
42 | #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) |
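/* Worked example (illustrative note, not part of the original header):
   _MM_SHUFFLE packs four 2-bit element selectors into one 8-bit immediate.

     _MM_SHUFFLE(3, 2, 1, 0)   is (3<<6)|(2<<4)|(1<<2)|0 == 0xE4
     _MM_SHUFFLE(0, 1, 2, 3)   is (0<<6)|(1<<4)|(2<<2)|3 == 0x1B

   0xE4 selects every element from its own position (the identity shuffle)
   and 0x1B reverses the element order when passed to _mm_shuffle_ps. */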
43 | |
44 | #include <altivec.h> |
45 | |
46 | /* Avoid collisions between the altivec.h keyword macros and strict |
47 | adherence to the C++ and C11 standards. This should eventually be done |
48 | inside altivec.h itself, but only after testing a full distro build. */ |
49 | #if defined(__STRICT_ANSI__) && \ |
50 | (defined(__cplusplus) || \ |
51 | (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)) |
52 | #undef vector |
53 | #undef pixel |
54 | #undef bool |
55 | #endif |
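/* Consequence of the #undef block above (illustrative note): in a strict
   C++ or C11 translation unit the convenience spellings from altivec.h are
   removed, so portable user code should prefer the reserved forms, e.g.

     __vector float       __vf;
     __vector __bool int  __vb;

   rather than the bare "vector"/"bool" spellings that altivec.h otherwise
   provides. */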
56 | |
57 | /* We need type definitions from the MMX header file. */ |
58 | #include <mmintrin.h> |
59 | |
60 | /* Get _mm_malloc () and _mm_free (). */ |
61 | #if __STDC_HOSTED__ |
62 | #include <mm_malloc.h> |
63 | #endif |
64 | |
65 | /* The Intel API is flexible enough that we must allow aliasing with other |
66 | vector types, and their scalar components. */ |
67 | typedef vector float __m128 __attribute__((__may_alias__)); |
68 | |
69 | /* Unaligned version of the same type. */ |
70 | typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); |
71 | |
72 | /* Internal data types for implementing the intrinsics. */ |
73 | typedef vector float __v4sf; |
74 | |
75 | /* Create an undefined vector. */ |
76 | extern __inline __m128 |
77 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
78 | _mm_undefined_ps(void) { |
79 | __m128 __Y = __Y; |
80 | return __Y; |
81 | } |
82 | |
83 | /* Create a vector of zeros. */ |
84 | extern __inline __m128 |
85 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
86 | _mm_setzero_ps(void) { |
87 | return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f}; |
88 | } |
89 | |
90 | /* Load four SPFP values from P. The address must be 16-byte aligned. */ |
91 | extern __inline __m128 |
92 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
93 | _mm_load_ps(float const *__P) { |
94 | return ((__m128)vec_ld(0, (__v4sf *)__P)); |
95 | } |
96 | |
97 | /* Load four SPFP values from P. The address need not be 16-byte aligned. */ |
98 | extern __inline __m128 |
99 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
100 | _mm_loadu_ps(float const *__P) { |
101 | return (vec_vsx_ld(0, __P)); |
102 | } |
103 | |
104 | /* Load four SPFP values in reverse order. The address must be aligned. */ |
105 | extern __inline __m128 |
106 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
107 | _mm_loadr_ps(float const *__P) { |
108 | __v4sf __tmp; |
109 | __m128 __result; |
110 | static const __vector unsigned char __permute_vector = { |
111 | 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, |
112 | 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; |
113 | |
114 | __tmp = vec_ld(0, (__v4sf *)__P); |
115 | __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector); |
116 | return __result; |
117 | } |
118 | |
119 | /* Create a vector with all four elements equal to F. */ |
120 | extern __inline __m128 |
121 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
122 | _mm_set1_ps(float __F) { |
123 | return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; |
124 | } |
125 | |
126 | extern __inline __m128 |
127 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
128 | _mm_set_ps1(float __F) { |
129 | return _mm_set1_ps(__F); |
130 | } |
131 | |
132 | /* Create the vector [Z Y X W]. */ |
133 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, |
134 | __artificial__)) |
135 | _mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { |
136 | return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; |
137 | } |
138 | |
139 | /* Create the vector [W X Y Z]. */ |
140 | extern __inline __m128 |
141 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
142 | _mm_setr_ps(float __Z, float __Y, float __X, float __W) { |
143 | return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; |
144 | } |
145 | |
146 | /* Store four SPFP values. The address must be 16-byte aligned. */ |
147 | extern __inline void |
148 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
149 | _mm_store_ps(float *__P, __m128 __A) { |
150 | vec_st((__v4sf)__A, 0, (__v4sf *)__P); |
151 | } |
152 | |
153 | /* Store four SPFP values. The address need not be 16-byte aligned. */ |
154 | extern __inline void |
155 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
156 | _mm_storeu_ps(float *__P, __m128 __A) { |
157 | *(__m128_u *)__P = __A; |
158 | } |
159 | |
160 | /* Store four SPFP values in reverse order. The address must be aligned. */ |
161 | extern __inline void |
162 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
163 | _mm_storer_ps(float *__P, __m128 __A) { |
164 | __v4sf __tmp; |
165 | static const __vector unsigned char __permute_vector = { |
166 | 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, |
167 | 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; |
168 | |
169 | __tmp = (__m128)vec_perm(__A, __A, __permute_vector); |
170 | |
171 | _mm_store_ps(__P, __tmp); |
172 | } |
173 | |
174 | /* Store the lower SPFP value across four words. */ |
175 | extern __inline void |
176 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
177 | _mm_store1_ps(float *__P, __m128 __A) { |
178 | __v4sf __va = vec_splat((__v4sf)__A, 0); |
179 | _mm_store_ps(__P, __va); |
180 | } |
181 | |
182 | extern __inline void |
183 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
184 | _mm_store_ps1(float *__P, __m128 __A) { |
185 | _mm_store1_ps(__P, __A); |
186 | } |
187 | |
188 | /* Create a vector with element 0 as F and the rest zero. */ |
189 | extern __inline __m128 |
190 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
191 | _mm_set_ss(float __F) { |
192 | return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; |
193 | } |
194 | |
195 | /* Sets the low SPFP value of A from the low value of B. */ |
196 | extern __inline __m128 |
197 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
198 | _mm_move_ss(__m128 __A, __m128 __B) { |
199 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
200 | |
201 | return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask)); |
202 | } |
203 | |
204 | /* Create a vector with element 0 as *P and the rest zero. */ |
205 | extern __inline __m128 |
206 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
207 | _mm_load_ss(float const *__P) { |
208 | return _mm_set_ss(*__P); |
209 | } |
210 | |
211 | /* Stores the lower SPFP value. */ |
212 | extern __inline void |
213 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
214 | _mm_store_ss(float *__P, __m128 __A) { |
215 | *__P = ((__v4sf)__A)[0]; |
216 | } |
217 | |
218 | /* Perform the respective operation on the lower SPFP (single-precision |
219 | floating-point) values of A and B; the upper three SPFP values are |
220 | passed through from A. */ |
221 | |
222 | extern __inline __m128 |
223 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
224 | _mm_add_ss(__m128 __A, __m128 __B) { |
225 | #ifdef _ARCH_PWR7 |
226 | __m128 __a, __b, __c; |
227 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
228 | /* PowerISA VSX does not allow partial (for just lower float) |
229 | results. So to ensure we don't generate spurious exceptions |
230 | (from the upper float values) we splat the lower float |
231 | before we do the operation. */ |
232 | __a = vec_splat(__A, 0); |
233 | __b = vec_splat(__B, 0); |
234 | __c = __a + __b; |
235 | /* Then we merge the lower float result with the original upper |
236 | float elements from __A. */ |
237 | return (vec_sel(__A, __c, __mask)); |
238 | #else |
239 | __A[0] = __A[0] + __B[0]; |
240 | return (__A); |
241 | #endif |
242 | } |
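/* Usage sketch of the scalar pattern above (illustrative, values assumed
   for demonstration): only element 0 is operated on, elements 1-3 are
   passed through from the first argument.

     __m128 __x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);    x = {1, 2, 3, 4}
     __m128 __y = _mm_set_ss(10.0f);                      y = {10, 0, 0, 0}
     __m128 __z = _mm_add_ss(__x, __y);                   z = {11, 2, 3, 4}

   Splatting element 0 across all lanes before the add means the inactive
   lanes never operate on unrelated data, so they cannot raise spurious
   floating-point exceptions. */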
243 | |
244 | extern __inline __m128 |
245 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
246 | _mm_sub_ss(__m128 __A, __m128 __B) { |
247 | #ifdef _ARCH_PWR7 |
248 | __m128 __a, __b, __c; |
249 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
250 | /* PowerISA VSX does not allow partial (for just lower float) |
251 | results. So to ensure we don't generate spurious exceptions |
252 | (from the upper float values) we splat the lower float |
253 | before we do the operation. */ |
254 | __a = vec_splat(__A, 0); |
255 | __b = vec_splat(__B, 0); |
256 | __c = __a - __b; |
257 | /* Then we merge the lower float result with the original upper |
258 | float elements from __A. */ |
259 | return (vec_sel(__A, __c, __mask)); |
260 | #else |
261 | __A[0] = __A[0] - __B[0]; |
262 | return (__A); |
263 | #endif |
264 | } |
265 | |
266 | extern __inline __m128 |
267 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
268 | _mm_mul_ss(__m128 __A, __m128 __B) { |
269 | #ifdef _ARCH_PWR7 |
270 | __m128 __a, __b, __c; |
271 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
272 | /* PowerISA VSX does not allow partial (for just lower float) |
273 | results. So to ensure we don't generate spurious exceptions |
274 | (from the upper float values) we splat the lower float |
275 | before we do the operation. */ |
276 | __a = vec_splat(__A, 0); |
277 | __b = vec_splat(__B, 0); |
278 | __c = __a * __b; |
279 | /* Then we merge the lower float result with the original upper |
280 | float elements from __A. */ |
281 | return (vec_sel(__A, __c, __mask)); |
282 | #else |
283 | __A[0] = __A[0] * __B[0]; |
284 | return (__A); |
285 | #endif |
286 | } |
287 | |
288 | extern __inline __m128 |
289 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
290 | _mm_div_ss(__m128 __A, __m128 __B) { |
291 | #ifdef _ARCH_PWR7 |
292 | __m128 __a, __b, __c; |
293 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
294 | /* PowerISA VSX does not allow partial (for just lower float) |
295 | results. So to ensure we don't generate spurious exceptions |
296 | (from the upper float values) we splat the lower float |
297 | before we do the operation. */ |
298 | __a = vec_splat(__A, 0); |
299 | __b = vec_splat(__B, 0); |
300 | __c = __a / __b; |
301 | /* Then we merge the lower float result with the original upper |
302 | float elements from __A. */ |
303 | return (vec_sel(__A, __c, __mask)); |
304 | #else |
305 | __A[0] = __A[0] / __B[0]; |
306 | return (__A); |
307 | #endif |
308 | } |
309 | |
310 | extern __inline __m128 |
311 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
312 | _mm_sqrt_ss(__m128 __A) { |
313 | __m128 __a, __c; |
314 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
315 | /* PowerISA VSX does not allow partial (for just lower float) |
316 | * results. So to ensure we don't generate spurious exceptions |
317 | * (from the upper float values) we splat the lower float |
318 | * before we do the operation. */ |
319 | __a = vec_splat(__A, 0); |
320 | __c = vec_sqrt(__a); |
321 | /* Then we merge the lower float result with the original upper |
322 | * float elements from __A. */ |
323 | return (vec_sel(__A, __c, __mask)); |
324 | } |
325 | |
326 | /* Perform the respective operation on the four SPFP values in A and B. */ |
327 | extern __inline __m128 |
328 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
329 | _mm_add_ps(__m128 __A, __m128 __B) { |
330 | return (__m128)((__v4sf)__A + (__v4sf)__B); |
331 | } |
332 | |
333 | extern __inline __m128 |
334 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
335 | _mm_sub_ps(__m128 __A, __m128 __B) { |
336 | return (__m128)((__v4sf)__A - (__v4sf)__B); |
337 | } |
338 | |
339 | extern __inline __m128 |
340 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
341 | _mm_mul_ps(__m128 __A, __m128 __B) { |
342 | return (__m128)((__v4sf)__A * (__v4sf)__B); |
343 | } |
344 | |
345 | extern __inline __m128 |
346 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
347 | _mm_div_ps(__m128 __A, __m128 __B) { |
348 | return (__m128)((__v4sf)__A / (__v4sf)__B); |
349 | } |
350 | |
351 | extern __inline __m128 |
352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
353 | _mm_sqrt_ps(__m128 __A) { |
354 | return (vec_sqrt((__v4sf)__A)); |
355 | } |
356 | |
357 | extern __inline __m128 |
358 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
359 | _mm_rcp_ps(__m128 __A) { |
360 | return (vec_re((__v4sf)__A)); |
361 | } |
362 | |
363 | extern __inline __m128 |
364 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
365 | _mm_rsqrt_ps(__m128 __A) { |
366 | return (vec_rsqrte(__A)); |
367 | } |
368 | |
369 | extern __inline __m128 |
370 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
371 | _mm_rcp_ss(__m128 __A) { |
372 | __m128 __a, __c; |
373 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
374 | /* PowerISA VSX does not allow partial (for just lower float) |
375 | * results. So to ensure we don't generate spurious exceptions |
376 | * (from the upper float values) we splat the lower float |
377 | * before we do the operation. */ |
378 | __a = vec_splat(__A, 0); |
379 | __c = _mm_rcp_ps(__a); |
380 | /* Then we merge the lower float result with the original upper |
381 | * float elements from __A. */ |
382 | return (vec_sel(__A, __c, __mask)); |
383 | } |
384 | |
385 | extern __inline __m128 |
386 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
387 | _mm_rsqrt_ss(__m128 __A) { |
388 | __m128 __a, __c; |
389 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
390 | /* PowerISA VSX does not allow partial (for just lower float) |
391 | * results. So to ensure we don't generate spurious exceptions |
392 | * (from the upper float values) we splat the lower float |
393 | * before we do the operation. */ |
394 | __a = vec_splat(__A, 0); |
395 | __c = vec_rsqrte(__a); |
396 | /* Then we merge the lower float result with the original upper |
397 | * float elements from __A. */ |
398 | return (vec_sel(__A, __c, __mask)); |
399 | } |
400 | |
401 | extern __inline __m128 |
402 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
403 | _mm_min_ss(__m128 __A, __m128 __B) { |
404 | __v4sf __a, __b, __c; |
405 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
406 | /* PowerISA VSX does not allow partial (for just lower float) |
407 | * results. So to ensure we don't generate spurious exceptions |
408 | * (from the upper float values) we splat the lower float |
409 | * before we do the operation. */ |
410 | __a = vec_splat((__v4sf)__A, 0); |
411 | __b = vec_splat((__v4sf)__B, 0); |
412 | __c = vec_min(__a, __b); |
413 | /* Then we merge the lower float result with the original upper |
414 | * float elements from __A. */ |
415 | return (vec_sel((__v4sf)__A, __c, __mask)); |
416 | } |
417 | |
418 | extern __inline __m128 |
419 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
420 | _mm_max_ss(__m128 __A, __m128 __B) { |
421 | __v4sf __a, __b, __c; |
422 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
423 | /* PowerISA VSX does not allow partial (for just lower float) |
424 | * results. So to ensure we don't generate spurious exceptions |
425 | * (from the upper float values) we splat the lower float |
426 | * before we do the operation. */ |
427 | __a = vec_splat(__A, 0); |
428 | __b = vec_splat(__B, 0); |
429 | __c = vec_max(__a, __b); |
430 | /* Then we merge the lower float result with the original upper |
431 | * float elements from __A. */ |
432 | return (vec_sel((__v4sf)__A, __c, __mask)); |
433 | } |
434 | |
435 | extern __inline __m128 |
436 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
437 | _mm_min_ps(__m128 __A, __m128 __B) { |
438 | __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A); |
439 | return vec_sel(__B, __A, __m); |
440 | } |
441 | |
442 | extern __inline __m128 |
443 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
444 | _mm_max_ps(__m128 __A, __m128 __B) { |
445 | __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B); |
446 | return vec_sel(__B, __A, __m); |
447 | } |
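/* Note on _mm_min_ps/_mm_max_ps above (a sketch of the apparent intent,
   not documented behaviour): the vec_cmpgt + vec_sel form, rather than
   vec_min/vec_max, reproduces the SSE rule that the second operand is
   returned whenever the comparison is false, including when an input is a
   NaN. For example, with lane 0 shown:

     _mm_max_ps({NaN, ...}, {1.0f, ...})    lane 0 becomes 1.0f
     _mm_max_ps({2.0f, ...}, {1.0f, ...})   lane 0 becomes 2.0f            */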
448 | |
449 | /* Perform logical bit-wise operations on 128-bit values. */ |
450 | extern __inline __m128 |
451 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
452 | _mm_and_ps(__m128 __A, __m128 __B) { |
453 | return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B)); |
454 | // return __builtin_ia32_andps (__A, __B); |
455 | } |
456 | |
457 | extern __inline __m128 |
458 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
459 | _mm_andnot_ps(__m128 __A, __m128 __B) { |
460 | return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A)); |
461 | } |
462 | |
463 | extern __inline __m128 |
464 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
465 | _mm_or_ps(__m128 __A, __m128 __B) { |
466 | return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B)); |
467 | } |
468 | |
469 | extern __inline __m128 |
470 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
471 | _mm_xor_ps(__m128 __A, __m128 __B) { |
472 | return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B)); |
473 | } |
474 | |
475 | /* Perform a comparison on the four SPFP values of A and B. For each |
476 | element, if the comparison is true, place a mask of all ones in the |
477 | result, otherwise a mask of zeros. */ |
478 | extern __inline __m128 |
479 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
480 | _mm_cmpeq_ps(__m128 __A, __m128 __B) { |
481 | return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B)); |
482 | } |
483 | |
484 | extern __inline __m128 |
485 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
486 | _mm_cmplt_ps(__m128 __A, __m128 __B) { |
487 | return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); |
488 | } |
489 | |
490 | extern __inline __m128 |
491 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
492 | _mm_cmple_ps(__m128 __A, __m128 __B) { |
493 | return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); |
494 | } |
495 | |
496 | extern __inline __m128 |
497 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
498 | _mm_cmpgt_ps(__m128 __A, __m128 __B) { |
499 | return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); |
500 | } |
501 | |
502 | extern __inline __m128 |
503 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
504 | _mm_cmpge_ps(__m128 __A, __m128 __B) { |
505 | return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); |
506 | } |
507 | |
508 | extern __inline __m128 |
509 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
510 | _mm_cmpneq_ps(__m128 __A, __m128 __B) { |
511 | __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B); |
512 | return ((__m128)vec_nor(__temp, __temp)); |
513 | } |
514 | |
515 | extern __inline __m128 |
516 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
517 | _mm_cmpnlt_ps(__m128 __A, __m128 __B) { |
518 | return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); |
519 | } |
520 | |
521 | extern __inline __m128 |
522 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
523 | _mm_cmpnle_ps(__m128 __A, __m128 __B) { |
524 | return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); |
525 | } |
526 | |
527 | extern __inline __m128 |
528 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
529 | _mm_cmpngt_ps(__m128 __A, __m128 __B) { |
530 | return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); |
531 | } |
532 | |
533 | extern __inline __m128 |
534 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
535 | _mm_cmpnge_ps(__m128 __A, __m128 __B) { |
536 | return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); |
537 | } |
538 | |
539 | extern __inline __m128 |
540 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
541 | _mm_cmpord_ps(__m128 __A, __m128 __B) { |
542 | __vector unsigned int __a, __b; |
543 | __vector unsigned int __c, __d; |
544 | static const __vector unsigned int __float_exp_mask = { |
545 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
546 | |
547 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
548 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
549 | __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); |
550 | __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); |
551 | return ((__m128)vec_and(__c, __d)); |
552 | } |
553 | |
554 | extern __inline __m128 |
555 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
556 | _mm_cmpunord_ps(__m128 __A, __m128 __B) { |
557 | __vector unsigned int __a, __b; |
558 | __vector unsigned int __c, __d; |
559 | static const __vector unsigned int __float_exp_mask = { |
560 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
561 | |
562 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
563 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
564 | __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); |
565 | __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); |
566 | return ((__m128)vec_or(__c, __d)); |
567 | } |
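/* How the ordered/unordered tests above work (illustrative sketch): after
   vec_abs clears the sign bit, an IEEE-754 single is a NaN exactly when its
   bit pattern exceeds 0x7F800000 (all-ones exponent, zero fraction, i.e.
   infinity). For example:

     |1.0f|  -> 0x3F800000
     |inf|   -> 0x7F800000
     |qNaN|  -> 0x7FC00000   (greater than 0x7F800000, hence unordered)

   _mm_cmpunord_ps ORs the per-operand "is NaN" tests; _mm_cmpord_ps instead
   requires both magnitudes to compare below the exponent mask. */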
568 | |
569 | /* Perform a comparison on the lower SPFP values of A and B. If the |
570 | comparison is true, place a mask of all ones in the result, otherwise a |
571 | mask of zeros. The upper three SPFP values are passed through from A. */ |
572 | extern __inline __m128 |
573 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
574 | _mm_cmpeq_ss(__m128 __A, __m128 __B) { |
575 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
576 | __v4sf __a, __b, __c; |
577 | /* PowerISA VMX does not allow partial (for just element 0) |
578 | * results. So to ensure we don't generate spurious exceptions |
579 | * (from the upper elements) we splat the lower float |
580 | * before we do the operation. */ |
581 | __a = vec_splat((__v4sf)__A, 0); |
582 | __b = vec_splat((__v4sf)__B, 0); |
583 | __c = (__v4sf)vec_cmpeq(__a, __b); |
584 | /* Then we merge the lower float result with the original upper |
585 | * float elements from __A. */ |
586 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
587 | } |
588 | |
589 | extern __inline __m128 |
590 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
591 | _mm_cmplt_ss(__m128 __A, __m128 __B) { |
592 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
593 | __v4sf __a, __b, __c; |
594 | /* PowerISA VMX does not allow partial (for just element 0) |
595 | * results. So to ensure we don't generate spurious exceptions |
596 | * (from the upper elements) we splat the lower float |
597 | * before we do the operation. */ |
598 | __a = vec_splat((__v4sf)__A, 0); |
599 | __b = vec_splat((__v4sf)__B, 0); |
600 | __c = (__v4sf)vec_cmplt(__a, __b); |
601 | /* Then we merge the lower float result with the original upper |
602 | * float elements from __A. */ |
603 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
604 | } |
605 | |
606 | extern __inline __m128 |
607 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
608 | _mm_cmple_ss(__m128 __A, __m128 __B) { |
609 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
610 | __v4sf __a, __b, __c; |
611 | /* PowerISA VMX does not allow partial (for just element 0) |
612 | * results. So to ensure we don't generate spurious exceptions |
613 | * (from the upper elements) we splat the lower float |
614 | * before we do the operation. */ |
615 | __a = vec_splat((__v4sf)__A, 0); |
616 | __b = vec_splat((__v4sf)__B, 0); |
617 | __c = (__v4sf)vec_cmple(__a, __b); |
618 | /* Then we merge the lower float result with the original upper |
619 | * float elements from __A. */ |
620 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
621 | } |
622 | |
623 | extern __inline __m128 |
624 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
625 | _mm_cmpgt_ss(__m128 __A, __m128 __B) { |
626 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
627 | __v4sf __a, __b, __c; |
628 | /* PowerISA VMX does not allow partial (for just element 0) |
629 | * results. So to ensure we don't generate spurious exceptions |
630 | * (from the upper elements) we splat the lower float |
631 | * before we do the operation. */ |
632 | __a = vec_splat((__v4sf)__A, 0); |
633 | __b = vec_splat((__v4sf)__B, 0); |
634 | __c = (__v4sf)vec_cmpgt(__a, __b); |
635 | /* Then we merge the lower float result with the original upper |
636 | * float elements from __A. */ |
637 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
638 | } |
639 | |
640 | extern __inline __m128 |
641 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
642 | _mm_cmpge_ss(__m128 __A, __m128 __B) { |
643 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
644 | __v4sf __a, __b, __c; |
645 | /* PowerISA VMX does not allow partial (for just element 0) |
646 | * results. So to ensure we don't generate spurious exceptions |
647 | * (from the upper elements) we splat the lower float |
648 | * before we do the operation. */ |
649 | __a = vec_splat((__v4sf)__A, 0); |
650 | __b = vec_splat((__v4sf)__B, 0); |
651 | __c = (__v4sf)vec_cmpge(__a, __b); |
652 | /* Then we merge the lower float result with the original upper |
653 | * float elements from __A. */ |
654 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
655 | } |
656 | |
657 | extern __inline __m128 |
658 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
659 | _mm_cmpneq_ss(__m128 __A, __m128 __B) { |
660 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
661 | __v4sf __a, __b, __c; |
662 | /* PowerISA VMX does not allow partial (for just element 0) |
663 | * results. So to ensure we don't generate spurious exceptions |
664 | * (from the upper elements) we splat the lower float |
665 | * before we do the operation. */ |
666 | __a = vec_splat((__v4sf)__A, 0); |
667 | __b = vec_splat((__v4sf)__B, 0); |
668 | __c = (__v4sf)vec_cmpeq(__a, __b); |
669 | __c = vec_nor(__c, __c); |
670 | /* Then we merge the lower float result with the original upper |
671 | * float elements from __A. */ |
672 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
673 | } |
674 | |
675 | extern __inline __m128 |
676 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
677 | _mm_cmpnlt_ss(__m128 __A, __m128 __B) { |
678 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
679 | __v4sf __a, __b, __c; |
680 | /* PowerISA VMX does not allow partial (for just element 0) |
681 | * results. So to ensure we don't generate spurious exceptions |
682 | * (from the upper elements) we splat the lower float |
683 | * before we do the operation. */ |
684 | __a = vec_splat((__v4sf)__A, 0); |
685 | __b = vec_splat((__v4sf)__B, 0); |
686 | __c = (__v4sf)vec_cmpge(__a, __b); |
687 | /* Then we merge the lower float result with the original upper |
688 | * float elements from __A. */ |
689 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
690 | } |
691 | |
692 | extern __inline __m128 |
693 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
694 | _mm_cmpnle_ss(__m128 __A, __m128 __B) { |
695 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
696 | __v4sf __a, __b, __c; |
697 | /* PowerISA VMX does not allow partial (for just element 0) |
698 | * results. So to ensure we don't generate spurious exceptions |
699 | * (from the upper elements) we splat the lower float |
700 | * before we do the operation. */ |
701 | __a = vec_splat((__v4sf)__A, 0); |
702 | __b = vec_splat((__v4sf)__B, 0); |
703 | __c = (__v4sf)vec_cmpgt(__a, __b); |
704 | /* Then we merge the lower float result with the original upper |
705 | * float elements from __A. */ |
706 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
707 | } |
708 | |
709 | extern __inline __m128 |
710 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
711 | _mm_cmpngt_ss(__m128 __A, __m128 __B) { |
712 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
713 | __v4sf __a, __b, __c; |
714 | /* PowerISA VMX does not allow partial (for just element 0) |
715 | * results. So to ensure we don't generate spurious exceptions |
716 | * (from the upper elements) we splat the lower float |
717 | * before we do the operation. */ |
718 | __a = vec_splat((__v4sf)__A, 0); |
719 | __b = vec_splat((__v4sf)__B, 0); |
720 | __c = (__v4sf)vec_cmple(__a, __b); |
721 | /* Then we merge the lower float result with the original upper |
722 | * float elements from __A. */ |
723 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
724 | } |
725 | |
726 | extern __inline __m128 |
727 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
728 | _mm_cmpnge_ss(__m128 __A, __m128 __B) { |
729 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
730 | __v4sf __a, __b, __c; |
731 | /* PowerISA VMX does not allow partial (for just element 0) |
732 | * results. So to ensure we don't generate spurious exceptions |
733 | * (from the upper elements) we splat the lower float |
734 | * before we do the operation. */ |
735 | __a = vec_splat((__v4sf)__A, 0); |
736 | __b = vec_splat((__v4sf)__B, 0); |
737 | __c = (__v4sf)vec_cmplt(__a, __b); |
738 | /* Then we merge the lower float result with the original upper |
739 | * float elements from __A. */ |
740 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
741 | } |
742 | |
743 | extern __inline __m128 |
744 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
745 | _mm_cmpord_ss(__m128 __A, __m128 __B) { |
746 | __vector unsigned int __a, __b; |
747 | __vector unsigned int __c, __d; |
748 | static const __vector unsigned int __float_exp_mask = { |
749 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
750 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
751 | |
752 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
753 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
754 | __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); |
755 | __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); |
756 | __c = vec_and(__c, __d); |
757 | /* Then we merge the lower float result with the original upper |
758 | * float elements from __A. */ |
759 | return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); |
760 | } |
761 | |
762 | extern __inline __m128 |
763 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
764 | _mm_cmpunord_ss(__m128 __A, __m128 __B) { |
765 | __vector unsigned int __a, __b; |
766 | __vector unsigned int __c, __d; |
767 | static const __vector unsigned int __float_exp_mask = { |
768 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
769 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
770 | |
771 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
772 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
773 | __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); |
774 | __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); |
775 | __c = vec_or(__c, __d); |
776 | /* Then we merge the lower float result with the original upper |
777 | * float elements from __A. */ |
778 | return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); |
779 | } |
780 | |
781 | /* Compare the lower SPFP values of A and B and return 1 if the |
782 | comparison is true and 0 if it is false. */ |
783 | extern __inline int |
784 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
785 | _mm_comieq_ss(__m128 __A, __m128 __B) { |
786 | return (__A[0] == __B[0]); |
787 | } |
788 | |
789 | extern __inline int |
790 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
791 | _mm_comilt_ss(__m128 __A, __m128 __B) { |
792 | return (__A[0] < __B[0]); |
793 | } |
794 | |
795 | extern __inline int |
796 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
797 | _mm_comile_ss(__m128 __A, __m128 __B) { |
798 | return (__A[0] <= __B[0]); |
799 | } |
800 | |
801 | extern __inline int |
802 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
803 | _mm_comigt_ss(__m128 __A, __m128 __B) { |
804 | return (__A[0] > __B[0]); |
805 | } |
806 | |
807 | extern __inline int |
808 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
809 | _mm_comige_ss(__m128 __A, __m128 __B) { |
810 | return (__A[0] >= __B[0]); |
811 | } |
812 | |
813 | extern __inline int |
814 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
815 | _mm_comineq_ss(__m128 __A, __m128 __B) { |
816 | return (__A[0] != __B[0]); |
817 | } |
818 | |
819 | /* FIXME |
820 | * The _mm_ucomi??_ss implementations below are exactly the same as |
821 | * the _mm_comi??_ss ones because GCC for PowerPC only generates |
822 | * unordered compares (scalar and vector). |
823 | * Technically _mm_comieq_ss et al. should be using the ordered |
824 | * compare and signal on QNaNs. |
825 | * The _mm_ucomieq_ss et al. should be OK as is. |
826 | */ |
827 | extern __inline int |
828 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
829 | _mm_ucomieq_ss(__m128 __A, __m128 __B) { |
830 | return (__A[0] == __B[0]); |
831 | } |
832 | |
833 | extern __inline int |
834 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
835 | _mm_ucomilt_ss(__m128 __A, __m128 __B) { |
836 | return (__A[0] < __B[0]); |
837 | } |
838 | |
839 | extern __inline int |
840 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
841 | _mm_ucomile_ss(__m128 __A, __m128 __B) { |
842 | return (__A[0] <= __B[0]); |
843 | } |
844 | |
845 | extern __inline int |
846 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
847 | _mm_ucomigt_ss(__m128 __A, __m128 __B) { |
848 | return (__A[0] > __B[0]); |
849 | } |
850 | |
851 | extern __inline int |
852 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
853 | _mm_ucomige_ss(__m128 __A, __m128 __B) { |
854 | return (__A[0] >= __B[0]); |
855 | } |
856 | |
857 | extern __inline int |
858 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
859 | _mm_ucomineq_ss(__m128 __A, __m128 __B) { |
860 | return (__A[0] != __B[0]); |
861 | } |
862 | |
863 | extern __inline float |
864 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
865 | _mm_cvtss_f32(__m128 __A) { |
866 | return ((__v4sf)__A)[0]; |
867 | } |
868 | |
869 | /* Convert the lower SPFP value to a 32-bit integer according to the current |
870 | rounding mode. */ |
871 | extern __inline int |
872 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
873 | _mm_cvtss_si32(__m128 __A) { |
874 | int __res; |
875 | #ifdef _ARCH_PWR8 |
876 | double __dtmp; |
877 | __asm__( |
878 | #ifdef __LITTLE_ENDIAN__ |
879 | "xxsldwi %x0,%x0,%x0,3;\n" |
880 | #endif |
881 | "xscvspdp %x2,%x0;\n" |
882 | "fctiw %2,%2;\n" |
883 | "mfvsrd %1,%x2;\n" |
884 | : "+wa"(__A), "=r"(__res), "=f"(__dtmp) |
885 | :); |
886 | #else |
887 | __res = __builtin_rint(__A[0]); |
888 | #endif |
889 | return __res; |
890 | } |
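/* Sketch of the POWER8 inline asm above (descriptive comments only):
     xxsldwi %x0,%x0,%x0,3   rotate the VSR so element 0 sits in the word
                             read by the scalar convert (little-endian only)
     xscvspdp                convert the single-precision value to double
     fctiw                   convert to a 32-bit integer using the current
                             rounding mode (unlike the truncating fctiwz)
     mfvsrd                  move the result from the VSR to a GPR
   On older targets the portable __builtin_rint fallback is used instead. */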
891 | |
892 | extern __inline int |
893 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
894 | _mm_cvt_ss2si(__m128 __A) { |
895 | return _mm_cvtss_si32(__A); |
896 | } |
897 | |
898 | /* Convert the lower SPFP value to a 64-bit integer according to the |
899 | current rounding mode. */ |
900 | |
901 | /* Intel intrinsic. */ |
902 | extern __inline long long |
903 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
904 | _mm_cvtss_si64(__m128 __A) { |
905 | long long __res; |
906 | #if defined(_ARCH_PWR8) && defined(__powerpc64__) |
907 | double __dtmp; |
908 | __asm__( |
909 | #ifdef __LITTLE_ENDIAN__ |
910 | "xxsldwi %x0,%x0,%x0,3;\n" |
911 | #endif |
912 | "xscvspdp %x2,%x0;\n" |
913 | "fctid %2,%2;\n" |
914 | "mfvsrd %1,%x2;\n" |
915 | : "+wa"(__A), "=r"(__res), "=f"(__dtmp) |
916 | :); |
917 | #else |
918 | __res = __builtin_llrint(__A[0]); |
919 | #endif |
920 | return __res; |
921 | } |
922 | |
923 | /* Microsoft intrinsic. */ |
924 | extern __inline long long |
925 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
926 | _mm_cvtss_si64x(__m128 __A) { |
927 | return _mm_cvtss_si64((__v4sf)__A); |
928 | } |
929 | |
930 | /* Constants for use with _mm_prefetch. */ |
931 | enum _mm_hint { |
932 | /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */ |
933 | _MM_HINT_ET0 = 7, |
934 | _MM_HINT_ET1 = 6, |
935 | _MM_HINT_T0 = 3, |
936 | _MM_HINT_T1 = 2, |
937 | _MM_HINT_T2 = 1, |
938 | _MM_HINT_NTA = 0 |
939 | }; |
940 | |
941 | /* Loads one cache line from address P to a location "closer" to the |
942 | processor. The selector I specifies the type of prefetch operation. */ |
943 | extern __inline void |
944 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
945 | _mm_prefetch(const void *__P, enum _mm_hint __I) { |
946 | /* Current PowerPC implementations ignore the hint parameter. */ |
947 | __builtin_prefetch(__P); |
948 | } |
949 | |
950 | /* Convert the two lower SPFP values to 32-bit integers according to the |
951 | current rounding mode. Return the integers in packed form. */ |
952 | extern __inline __m64 |
953 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
954 | _mm_cvtps_pi32(__m128 __A) { |
956 | __v4sf __temp, __rounded; |
957 | __vector unsigned long long __result; |
958 | |
959 | /* Splat two lower SPFP values to both halves. */ |
960 | __temp = (__v4sf)vec_splat((__vector long long)__A, 0); |
961 | __rounded = vec_rint(__temp); |
962 | __result = (__vector unsigned long long)vec_cts(__rounded, 0); |
963 | |
964 | return (__m64)((__vector long long)__result)[0]; |
965 | } |
966 | |
967 | extern __inline __m64 |
968 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
969 | _mm_cvt_ps2pi(__m128 __A) { |
970 | return _mm_cvtps_pi32(__A); |
971 | } |
972 | |
973 | /* Truncate the lower SPFP value to a 32-bit integer. */ |
974 | extern __inline int |
975 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
976 | _mm_cvttss_si32(__m128 __A) { |
977 | /* Extract the lower float element. */ |
978 | float __temp = __A[0]; |
979 | /* Truncate to a 32-bit integer and return. */ |
980 | return __temp; |
981 | } |
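/* The truncating conversions above and below rely on C's float-to-integer
   conversion, which always truncates toward zero, e.g. (int)2.9f == 2 and
   (int)-2.9f == -2. That matches the _mm_cvtt* (truncate) semantics, while
   _mm_cvtss_si32/_mm_cvtss_si64 honour the current rounding mode. */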
982 | |
983 | extern __inline int |
984 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
985 | _mm_cvtt_ss2si(__m128 __A) { |
986 | return _mm_cvttss_si32(__A); |
987 | } |
988 | |
989 | /* Intel intrinsic. */ |
990 | extern __inline long long |
991 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
992 | _mm_cvttss_si64(__m128 __A) { |
993 | /* Extract the lower float element. */ |
994 | float __temp = __A[0]; |
995 | /* Truncate to a 64-bit integer and return. */ |
996 | return __temp; |
997 | } |
998 | |
999 | /* Microsoft intrinsic. */ |
1000 | extern __inline long long |
1001 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1002 | _mm_cvttss_si64x(__m128 __A) { |
1003 | /* Extract the lower float element. */ |
1004 | float __temp = __A[0]; |
1005 | /* Truncate to a 64-bit integer and return. */ |
1006 | return __temp; |
1007 | } |
1008 | |
1009 | /* Truncate the two lower SPFP values to 32-bit integers. Return the |
1010 | integers in packed form. */ |
1011 | extern __inline __m64 |
1012 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1013 | _mm_cvttps_pi32(__m128 __A) { |
1014 | __v4sf __temp; |
1015 | __vector unsigned long long __result; |
1016 | |
1017 | /* Splat two lower SPFP values to both halves. */ |
1018 | __temp = (__v4sf)vec_splat((__vector long long)__A, 0); |
1019 | __result = (__vector unsigned long long)vec_cts(__temp, 0); |
1020 | |
1021 | return (__m64)((__vector long long)__result)[0]; |
1022 | } |
1023 | |
1024 | extern __inline __m64 |
1025 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1026 | _mm_cvtt_ps2pi(__m128 __A) { |
1027 | return _mm_cvttps_pi32(__A); |
1028 | } |
1029 | |
1030 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
1031 | extern __inline __m128 |
1032 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1033 | _mm_cvtsi32_ss(__m128 __A, int __B) { |
1034 | float __temp = __B; |
1035 | __A[0] = __temp; |
1036 | |
1037 | return __A; |
1038 | } |
1039 | |
1040 | extern __inline __m128 |
1041 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1042 | _mm_cvt_si2ss(__m128 __A, int __B) { |
1043 | return _mm_cvtsi32_ss(__A, __B); |
1044 | } |
1045 | |
1046 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
1047 | /* Intel intrinsic. */ |
1048 | extern __inline __m128 |
1049 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1050 | _mm_cvtsi64_ss(__m128 __A, long long __B) { |
1051 | float __temp = __B; |
1052 | __A[0] = __temp; |
1053 | |
1054 | return __A; |
1055 | } |
1056 | |
1057 | /* Microsoft intrinsic. */ |
1058 | extern __inline __m128 |
1059 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1060 | _mm_cvtsi64x_ss(__m128 __A, long long __B) { |
1061 | return _mm_cvtsi64_ss(__A, __B); |
1062 | } |
1063 | |
1064 | /* Convert the two 32-bit values in B to SPFP form and insert them |
1065 | as the two lower elements in A. */ |
1066 | extern __inline __m128 |
1067 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1068 | _mm_cvtpi32_ps(__m128 __A, __m64 __B) { |
1069 | __vector signed int __vm1; |
1070 | __vector float __vf1; |
1071 | |
1072 | __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B}; |
1073 | __vf1 = (__vector float)vec_ctf(__vm1, 0); |
1074 | |
1075 | return ((__m128)(__vector unsigned long long){ |
1076 | ((__vector unsigned long long)__vf1)[0], |
1077 | ((__vector unsigned long long)__A)[1]}); |
1078 | } |
1079 | |
1080 | extern __inline __m128 |
1081 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1082 | _mm_cvt_pi2ps(__m128 __A, __m64 __B) { |
1083 | return _mm_cvtpi32_ps(__A, __B); |
1084 | } |
1085 | |
1086 | /* Convert the four signed 16-bit values in A to SPFP form. */ |
1087 | extern __inline __m128 |
1088 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1089 | _mm_cvtpi16_ps(__m64 __A) { |
1090 | __vector signed short __vs8; |
1091 | __vector signed int __vi4; |
1092 | __vector float __vf1; |
1093 | |
1094 | __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A}; |
1095 | __vi4 = vec_vupklsh(__vs8); |
1096 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1097 | |
1098 | return (__m128)__vf1; |
1099 | } |
1100 | |
1101 | /* Convert the four unsigned 16-bit values in A to SPFP form. */ |
1102 | extern __inline __m128 |
1103 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1104 | _mm_cvtpu16_ps(__m64 __A) { |
1105 | const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0}; |
1106 | __vector unsigned short __vs8; |
1107 | __vector unsigned int __vi4; |
1108 | __vector float __vf1; |
1109 | |
1110 | __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A}; |
1111 | __vi4 = (__vector unsigned int)vec_mergel |
1112 | #ifdef __LITTLE_ENDIAN__ |
1113 | (__vs8, __zero); |
1114 | #else |
1115 | (__zero, __vs8); |
1116 | #endif |
1117 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1118 | |
1119 | return (__m128)__vf1; |
1120 | } |
1121 | |
1122 | /* Convert the low four signed 8-bit values in A to SPFP form. */ |
1123 | extern __inline __m128 |
1124 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1125 | _mm_cvtpi8_ps(__m64 __A) { |
1126 | __vector signed char __vc16; |
1127 | __vector signed short __vs8; |
1128 | __vector signed int __vi4; |
1129 | __vector float __vf1; |
1130 | |
1131 | __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A}; |
1132 | __vs8 = vec_vupkhsb(__vc16); |
1133 | __vi4 = vec_vupkhsh(__vs8); |
1134 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1135 | |
1136 | return (__m128)__vf1; |
1137 | } |
1138 | |
1139 | /* Convert the low four unsigned 8-bit values in A to SPFP form. */ |
1140 | extern __inline __m128 |
1141 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1142 | |
1143 | _mm_cvtpu8_ps(__m64 __A) { |
1144 | const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0}; |
1145 | __vector unsigned char __vc16; |
1146 | __vector unsigned short __vs8; |
1147 | __vector unsigned int __vi4; |
1148 | __vector float __vf1; |
1149 | |
1150 | __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A}; |
1151 | #ifdef __LITTLE_ENDIAN__ |
1152 | __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero); |
1153 | __vi4 = |
1154 | (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero); |
1155 | #else |
1156 | __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16); |
1157 | __vi4 = |
1158 | (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8); |
1159 | #endif |
1160 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
1161 | |
1162 | return (__m128)__vf1; |
1163 | } |
1164 | |
1165 | /* Convert the four signed 32-bit values in A and B to SPFP form. */ |
1166 | extern __inline __m128 |
1167 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1168 | _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { |
1169 | __vector signed int __vi4; |
1170 | __vector float __vf4; |
1171 | |
1172 | __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B}; |
1173 | __vf4 = (__vector float)vec_ctf(__vi4, 0); |
1174 | return (__m128)__vf4; |
1175 | } |
1176 | |
1177 | /* Convert the four SPFP values in A to four signed 16-bit integers. */ |
1178 | extern __inline __m64 |
1179 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1180 | _mm_cvtps_pi16(__m128 __A) { |
1181 | __v4sf __rounded; |
1182 | __vector signed int __temp; |
1183 | __vector unsigned long long __result; |
1184 | |
1185 | __rounded = vec_rint(__A); |
1186 | __temp = vec_cts(__rounded, 0); |
1187 | __result = (__vector unsigned long long)vec_pack(__temp, __temp); |
1188 | |
1189 | return (__m64)((__vector long long)__result)[0]; |
1190 | } |
1191 | |
1192 | /* Convert the four SPFP values in A to four signed 8-bit integers. */ |
1193 | extern __inline __m64 |
1194 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1195 | _mm_cvtps_pi8(__m128 __A) { |
1196 | __v4sf __rounded; |
1197 | __vector signed int __tmp_i; |
1198 | static const __vector signed int __zero = {0, 0, 0, 0}; |
1199 | __vector signed short __tmp_s; |
1200 | __vector signed char __res_v; |
1201 | |
1202 | __rounded = vec_rint(__A); |
1203 | __tmp_i = vec_cts(__rounded, 0); |
1204 | __tmp_s = vec_pack(__tmp_i, __zero); |
1205 | __res_v = vec_pack(__tmp_s, __tmp_s); |
1206 | return (__m64)((__vector long long)__res_v)[0]; |
1207 | } |
1208 | |
1209 | /* Selects four specific SPFP values from A and B based on MASK. */ |
1210 | extern __inline __m128 |
1211 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1212 | |
1213 | _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { |
1214 | unsigned long __element_selector_10 = __mask & 0x03; |
1215 | unsigned long __element_selector_32 = (__mask >> 2) & 0x03; |
1216 | unsigned long __element_selector_54 = (__mask >> 4) & 0x03; |
1217 | unsigned long __element_selector_76 = (__mask >> 6) & 0x03; |
1218 | static const unsigned int __permute_selectors[4] = { |
1219 | #ifdef __LITTLE_ENDIAN__ |
1220 | 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C |
1221 | #else |
1222 | 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F |
1223 | #endif |
1224 | }; |
1225 | __vector unsigned int __t; |
1226 | |
1227 | __t[0] = __permute_selectors[__element_selector_10]; |
1228 | __t[1] = __permute_selectors[__element_selector_32]; |
1229 | __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; |
1230 | __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; |
1231 | return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t); |
1232 | } |
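/* Worked example (illustrative): __mask = _MM_SHUFFLE(1, 0, 3, 2) == 0x4E
   yields the 2-bit selectors 2, 3, 0, 1, so with

     __m128 __a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);    a = {0, 1, 2, 3}
     __m128 __b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);    b = {4, 5, 6, 7}

   _mm_shuffle_ps(__a, __b, 0x4E) produces {2, 3, 4, 5}: the low two results
   come from __a and the high two from __b. The selectors index
   __permute_selectors to build a byte-level vec_perm control; adding
   0x10101010 redirects the upper two selections to the bytes of __B. */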
1233 | |
1234 | /* Selects and interleaves the upper two SPFP values from A and B. */ |
1235 | extern __inline __m128 |
1236 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1237 | _mm_unpackhi_ps(__m128 __A, __m128 __B) { |
1238 | return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B); |
1239 | } |
1240 | |
1241 | /* Selects and interleaves the lower two SPFP values from A and B. */ |
1242 | extern __inline __m128 |
1243 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1244 | _mm_unpacklo_ps(__m128 __A, __m128 __B) { |
1245 | return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B); |
1246 | } |
1247 | |
1248 | /* Sets the upper two SPFP values with 64-bits of data loaded from P; |
1249 | the lower two values are passed through from A. */ |
1250 | extern __inline __m128 |
1251 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1252 | _mm_loadh_pi(__m128 __A, __m64 const *__P) { |
1253 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1254 | __vector unsigned long long __p = vec_splats(*__P); |
1255 | __a[1] = __p[1]; |
1256 | |
1257 | return (__m128)__a; |
1258 | } |
1259 | |
1260 | /* Stores the upper two SPFP values of A into P. */ |
1261 | extern __inline void |
1262 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1263 | _mm_storeh_pi(__m64 *__P, __m128 __A) { |
1264 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1265 | |
1266 | *__P = __a[1]; |
1267 | } |
1268 | |
1269 | /* Moves the upper two values of B into the lower two values of A. */ |
1270 | extern __inline __m128 |
1271 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1272 | _mm_movehl_ps(__m128 __A, __m128 __B) { |
1273 | return (__m128)vec_mergel((__vector unsigned long long)__B, |
1274 | (__vector unsigned long long)__A); |
1275 | } |
1276 | |
1277 | /* Moves the lower two values of B into the upper two values of A. */ |
1278 | extern __inline __m128 |
1279 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1280 | _mm_movelh_ps(__m128 __A, __m128 __B) { |
1281 | return (__m128)vec_mergeh((__vector unsigned long long)__A, |
1282 | (__vector unsigned long long)__B); |
1283 | } |
1284 | |
1285 | /* Sets the lower two SPFP values with 64-bits of data loaded from P; |
1286 | the upper two values are passed through from A. */ |
1287 | extern __inline __m128 |
1288 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1289 | _mm_loadl_pi(__m128 __A, __m64 const *__P) { |
1290 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1291 | __vector unsigned long long __p = vec_splats(*__P); |
1292 | __a[0] = __p[0]; |
1293 | |
1294 | return (__m128)__a; |
1295 | } |
1296 | |
1297 | /* Stores the lower two SPFP values of A into P. */ |
1298 | extern __inline void |
1299 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1300 | _mm_storel_pi(__m64 *__P, __m128 __A) { |
1301 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
1302 | |
1303 | *__P = __a[0]; |
1304 | } |
1305 | |
1306 | #ifdef _ARCH_PWR8 |
1307 | /* Intrinsic functions that require PowerISA 2.07 minimum. */ |
1308 | |
1309 | /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ |
1310 | extern __inline int |
1311 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1312 | _mm_movemask_ps(__m128 __A) { |
1313 | #ifdef _ARCH_PWR10 |
1314 | return vec_extractm((__vector unsigned int)__A); |
1315 | #else |
1316 | __vector unsigned long long __result; |
1317 | static const __vector unsigned int __perm_mask = { |
1318 | #ifdef __LITTLE_ENDIAN__ |
1319 | 0x00204060, 0x80808080, 0x80808080, 0x80808080 |
1320 | #else |
1321 | 0x80808080, 0x80808080, 0x80808080, 0x00204060 |
1322 | #endif |
1323 | }; |
1324 | |
1325 | __result = ((__vector unsigned long long)vec_vbpermq( |
1326 | (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); |
1327 | |
1328 | #ifdef __LITTLE_ENDIAN__ |
1329 | return __result[1]; |
1330 | #else |
1331 | return __result[0]; |
1332 | #endif |
1333 | #endif /* !_ARCH_PWR10 */ |
1334 | } |
1335 | #endif /* _ARCH_PWR8 */ |
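/* Sketch of the vec_vbpermq path above (descriptive, not normative): each
   byte of __perm_mask names a bit position (0..127, numbered from the MSB)
   to gather from __A, and 0x80 selects a constant zero. The four positions
   0x00, 0x20, 0x40 and 0x60 are the sign bits of the four 32-bit lanes, so
   the gathered 4-bit field is exactly the SSE movemask. vbpermq deposits
   the gathered bits in one halfword of the 128-bit result, which is why the
   doubleword extracted differs between little- and big-endian. */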
1336 | |
1337 | /* Create a vector with all four elements equal to *P. */ |
1338 | extern __inline __m128 |
1339 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1340 | _mm_load1_ps(float const *__P) { |
1341 | return _mm_set1_ps(*__P); |
1342 | } |
1343 | |
1344 | extern __inline __m128 |
1345 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1346 | _mm_load_ps1(float const *__P) { |
1347 | return _mm_load1_ps(__P); |
1348 | } |
1349 | |
1350 | /* Extracts one of the four words of A. The selector N must be an immediate. */ |
1351 | extern __inline int |
1352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1353 | _mm_extract_pi16(__m64 const __A, int const __N) { |
1354 | unsigned int __shiftr = __N & 3; |
1355 | #ifdef __BIG_ENDIAN__ |
1356 | __shiftr = 3 - __shiftr; |
1357 | #endif |
1358 | |
1359 | return ((__A >> (__shiftr * 16)) & 0xffff); |
1360 | } |
1361 | |
1362 | extern __inline int |
1363 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1364 | _m_pextrw(__m64 const __A, int const __N) { |
1365 | return _mm_extract_pi16(__A, __N); |
1366 | } |
1367 | |
1368 | /* Inserts word D into one of the four words of A. The selector N must be |
1369 | an immediate. */ |
1370 | extern __inline __m64 |
1371 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1372 | _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { |
1373 | const int __shiftl = (__N & 3) * 16; |
1374 | const __m64 __shiftD = (const __m64)__D << __shiftl; |
1375 | const __m64 __mask = 0xffffUL << __shiftl; |
1376 | __m64 __result = (__A & (~__mask)) | (__shiftD & __mask); |
1377 | |
1378 | return __result; |
1379 | } |
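/* Worked example (illustrative): _mm_insert_pi16(__A, 0x1234, 2) computes
   __shiftl = 32, __mask = 0x0000ffff00000000ULL and __shiftD =
   0x0000123400000000ULL, so only 16-bit word 2 of __A is replaced.
   _mm_extract_pi16 reverses the operation by shifting the selected word
   down and masking with 0xffff (with the index mirrored on big-endian). */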
1380 | |
1381 | extern __inline __m64 |
1382 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1383 | _m_pinsrw(__m64 const __A, int const __D, int const __N) { |
1384 | return _mm_insert_pi16(__A, __D, __N); |
1385 | } |
1386 | |
1387 | /* Compute the element-wise maximum of signed 16-bit values. */ |
1388 | extern __inline __m64 |
1389 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1390 | |
1391 | _mm_max_pi16(__m64 __A, __m64 __B) { |
1392 | #if _ARCH_PWR8 |
1393 | __vector signed short __a, __b, __r; |
1394 | __vector __bool short __c; |
1395 | |
1396 | __a = (__vector signed short)vec_splats(__A); |
1397 | __b = (__vector signed short)vec_splats(__B); |
1398 | __c = (__vector __bool short)vec_cmpgt(__a, __b); |
1399 | __r = vec_sel(__b, __a, __c); |
1400 | return (__m64)((__vector long long)__r)[0]; |
1401 | #else |
1402 | __m64_union __m1, __m2, __res; |
1403 | |
1404 | __m1.as_m64 = __A; |
1405 | __m2.as_m64 = __B; |
1406 | |
1407 | __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] |
1408 | : __m2.as_short[0]; |
1409 | __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] |
1410 | : __m2.as_short[1]; |
1411 | __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] |
1412 | : __m2.as_short[2]; |
1413 | __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] |
1414 | : __m2.as_short[3]; |
1415 | |
1416 | return (__m64)__res.as_m64; |
1417 | #endif |
1418 | } |
1419 | |
1420 | extern __inline __m64 |
1421 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1422 | _m_pmaxsw(__m64 __A, __m64 __B) { |
1423 | return _mm_max_pi16(__A, __B); |
1424 | } |
1425 | |
1426 | /* Compute the element-wise maximum of unsigned 8-bit values. */ |
1427 | extern __inline __m64 |
1428 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1429 | _mm_max_pu8(__m64 __A, __m64 __B) { |
1430 | #if _ARCH_PWR8 |
1431 | __vector unsigned char __a, __b, __r; |
1432 | __vector __bool char __c; |
1433 | |
1434 | __a = (__vector unsigned char)vec_splats(__A); |
1435 | __b = (__vector unsigned char)vec_splats(__B); |
1436 | __c = (__vector __bool char)vec_cmpgt(__a, __b); |
1437 | __r = vec_sel(__b, __a, __c); |
1438 | return (__m64)((__vector long long)__r)[0]; |
1439 | #else |
1440 | __m64_union __m1, __m2, __res; |
1441 | long __i; |
1442 | |
1443 | __m1.as_m64 = __A; |
1444 | __m2.as_m64 = __B; |
1445 | |
1446 | for (__i = 0; __i < 8; __i++) |
1447 | __res.as_char[__i] = |
1448 | ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i]) |
1449 | ? __m1.as_char[__i] |
1450 | : __m2.as_char[__i]; |
1451 | |
1452 | return (__m64)__res.as_m64; |
1453 | #endif |
1454 | } |
1455 | |
1456 | extern __inline __m64 |
1457 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1458 | _m_pmaxub(__m64 __A, __m64 __B) { |
1459 | return _mm_max_pu8(__A, __B); |
1460 | } |
1461 | |
1462 | /* Compute the element-wise minimum of signed 16-bit values. */ |
1463 | extern __inline __m64 |
1464 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1465 | _mm_min_pi16(__m64 __A, __m64 __B) { |
1466 | #if _ARCH_PWR8 |
1467 | __vector signed short __a, __b, __r; |
1468 | __vector __bool short __c; |
1469 | |
1470 | __a = (__vector signed short)vec_splats(__A); |
1471 | __b = (__vector signed short)vec_splats(__B); |
1472 | __c = (__vector __bool short)vec_cmplt(__a, __b); |
1473 | __r = vec_sel(__b, __a, __c); |
1474 | return (__m64)((__vector long long)__r)[0]; |
1475 | #else |
1476 | __m64_union __m1, __m2, __res; |
1477 | |
1478 | __m1.as_m64 = __A; |
1479 | __m2.as_m64 = __B; |
1480 | |
1481 | __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] |
1482 | : __m2.as_short[0]; |
1483 | __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] |
1484 | : __m2.as_short[1]; |
1485 | __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] |
1486 | : __m2.as_short[2]; |
1487 | __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] |
1488 | : __m2.as_short[3]; |
1489 | |
1490 | return (__m64)__res.as_m64; |
1491 | #endif |
1492 | } |
1493 | |
1494 | extern __inline __m64 |
1495 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1496 | _m_pminsw(__m64 __A, __m64 __B) { |
1497 | return _mm_min_pi16(__A, __B); |
1498 | } |
1499 | |
1500 | /* Compute the element-wise minimum of unsigned 8-bit values. */ |
1501 | extern __inline __m64 |
1502 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1503 | _mm_min_pu8(__m64 __A, __m64 __B) { |
1504 | #if _ARCH_PWR8 |
1505 | __vector unsigned char __a, __b, __r; |
1506 | __vector __bool char __c; |
1507 | |
1508 | __a = (__vector unsigned char)vec_splats(__A); |
1509 | __b = (__vector unsigned char)vec_splats(__B); |
1510 | __c = (__vector __bool char)vec_cmplt(__a, __b); |
1511 | __r = vec_sel(__b, __a, __c); |
1512 | return (__m64)((__vector long long)__r)[0]; |
1513 | #else |
1514 | __m64_union __m1, __m2, __res; |
1515 | long __i; |
1516 | |
1517 | __m1.as_m64 = __A; |
1518 | __m2.as_m64 = __B; |
1519 | |
1520 | for (__i = 0; __i < 8; __i++) |
1521 | __res.as_char[__i] = |
1522 | ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i]) |
1523 | ? __m1.as_char[__i] |
1524 | : __m2.as_char[__i]; |
1525 | |
1526 | return (__m64)__res.as_m64; |
1527 | #endif |
1528 | } |
1529 | |
1530 | extern __inline __m64 |
1531 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1532 | _m_pminub(__m64 __A, __m64 __B) { |
1533 | return _mm_min_pu8(__A, __B); |
1534 | } |
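/* Editorial note and usage sketch (not part of the original header): the
   max/min intrinsics above all use the same POWER8+ idiom: vec_splats()
   copies the 64-bit __m64 into both halves of a 128-bit vector,
   vec_cmpgt/vec_cmplt builds a per-lane mask, vec_sel picks the winning
   lanes, and doubleword 0 of the result is moved back to a GPR. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi16(-5, 7, 0, 300);
  __m64 b = _mm_setr_pi16(4, -7, 1, 200);
  __m64 mx = _mm_max_pi16(a, b);                    /* 4, 7, 1, 300 */
  __m64 mn = _mm_min_pi16(a, b);                    /* -5, -7, 0, 200 */
  printf("%d %d\n", _mm_extract_pi16(mx, 0), _mm_extract_pi16(mn, 3));
  /* prints "4 200" */
  return 0;
}
#endif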
1535 | |
1536 | /* Create an 8-bit mask of the signs of 8-bit values. */ |
1537 | extern __inline int |
1538 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1539 | _mm_movemask_pi8(__m64 __A) { |
1540 | #ifdef __powerpc64__ |
1541 | unsigned long long __p = |
1542 | #ifdef __LITTLE_ENDIAN__ |
1543 | 0x0008101820283038UL; // permute control for sign bits |
1544 | #else |
1545 | 0x3830282018100800UL; // permute control for sign bits |
1546 | #endif |
1547 | return __builtin_bpermd(__p, __A); |
1548 | #else |
1549 | #ifdef __LITTLE_ENDIAN__ |
1550 | unsigned int __mask = 0x20283038UL; |
1551 | unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf; |
1552 | unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf; |
1553 | #else |
1554 | unsigned int __mask = 0x38302820UL; |
1555 | unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf; |
1556 | unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf; |
1557 | #endif |
1558 | return (__r2 << 4) | __r1; |
1559 | #endif |
1560 | } |
1561 | |
1562 | extern __inline int |
1563 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1564 | _m_pmovmskb(__m64 __A) { |
1565 | return _mm_movemask_pi8(__A); |
1566 | } |
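/* Editorial usage sketch (not part of the original header): the 8-bit sign
   mask produced by _mm_movemask_pi8.  The bpermd control doubleword above
   simply lists the (MSB-relative) positions of the eight byte sign bits to
   gather. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 v = _mm_setr_pi8(-1, 2, -3, 4, 5, 6, 7, -8);
  /* Sign bits, byte 0 first: 1,0,1,0,0,0,0,1 -> 0b10000101.  */
  printf("0x%x\n", _mm_movemask_pi8(v)); /* expected: 0x85 */
  return 0;
}
#endif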
1567 | |
1568 | /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values |
1569 | in B and produce the high 16 bits of the 32-bit results. */ |
1570 | extern __inline __m64 |
1571 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1572 | _mm_mulhi_pu16(__m64 __A, __m64 __B) { |
1573 | __vector unsigned short __a, __b; |
1574 | __vector unsigned short __c; |
1575 | __vector unsigned int __w0, __w1; |
1576 | __vector unsigned char __xform1 = { |
1577 | #ifdef __LITTLE_ENDIAN__ |
1578 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, |
1579 | 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F |
1580 | #else |
1581 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00, |
1582 | 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 |
1583 | #endif |
1584 | }; |
1585 | |
1586 | __a = (__vector unsigned short)vec_splats(__A); |
1587 | __b = (__vector unsigned short)vec_splats(__B); |
1588 | |
1589 | __w0 = vec_vmuleuh(__a, __b); |
1590 | __w1 = vec_vmulouh(__a, __b); |
1591 | __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1); |
1592 | |
1593 | return (__m64)((__vector long long)__c)[0]; |
1594 | } |
1595 | |
1596 | extern __inline __m64 |
1597 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1598 | _m_pmulhuw(__m64 __A, __m64 __B) { |
1599 | return _mm_mulhi_pu16(__A, __B); |
1600 | } |
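/* Editorial usage sketch (not part of the original header): _mm_mulhi_pu16
   keeps only the upper 16 bits of each unsigned 32-bit product, e.g.
   50000 * 50000 = 2500000000 = 0x9502F900, whose high halfword is 0x9502. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi16((short)50000, 2, 3, 4);
  __m64 b = _mm_setr_pi16((short)50000, 1000, 1000, 1000);
  printf("0x%x\n", _mm_extract_pi16(_mm_mulhi_pu16(a, b), 0));
  /* expected: 0x9502 */
  return 0;
}
#endif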
1601 | |
1602 | /* Return a combination of the four 16-bit values in A. The selector |
1603 | must be an immediate. */ |
1604 | extern __inline __m64 |
1605 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1606 | _mm_shuffle_pi16(__m64 __A, int const __N) { |
1607 | unsigned long __element_selector_10 = __N & 0x03; |
1608 | unsigned long __element_selector_32 = (__N >> 2) & 0x03; |
1609 | unsigned long __element_selector_54 = (__N >> 4) & 0x03; |
1610 | unsigned long __element_selector_76 = (__N >> 6) & 0x03; |
1611 | static const unsigned short __permute_selectors[4] = { |
1612 | #ifdef __LITTLE_ENDIAN__ |
1613 | 0x0908, 0x0B0A, 0x0D0C, 0x0F0E |
1614 | #else |
1615 | 0x0607, 0x0405, 0x0203, 0x0001 |
1616 | #endif |
1617 | }; |
1618 | __m64_union __t; |
1619 | __vector unsigned long long __a, __p, __r; |
1620 | |
1621 | #ifdef __LITTLE_ENDIAN__ |
1622 | __t.as_short[0] = __permute_selectors[__element_selector_10]; |
1623 | __t.as_short[1] = __permute_selectors[__element_selector_32]; |
1624 | __t.as_short[2] = __permute_selectors[__element_selector_54]; |
1625 | __t.as_short[3] = __permute_selectors[__element_selector_76]; |
1626 | #else |
1627 | __t.as_short[3] = __permute_selectors[__element_selector_10]; |
1628 | __t.as_short[2] = __permute_selectors[__element_selector_32]; |
1629 | __t.as_short[1] = __permute_selectors[__element_selector_54]; |
1630 | __t.as_short[0] = __permute_selectors[__element_selector_76]; |
1631 | #endif |
1632 | __p = vec_splats(__t.as_m64); |
1633 | __a = vec_splats(__A); |
1634 | __r = vec_perm(__a, __a, (__vector unsigned char)__p); |
1635 | return (__m64)((__vector long long)__r)[0]; |
1636 | } |
1637 | |
1638 | extern __inline __m64 |
1639 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1640 | _m_pshufw(__m64 __A, int const __N) { |
1641 | return _mm_shuffle_pi16(__A, __N); |
1642 | } |
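/* Editorial usage sketch (not part of the original header): the selector is
   four 2-bit lane indices, exactly as built by _MM_SHUFFLE(w,x,y,z); e.g.
   _MM_SHUFFLE(0,1,2,3) == 0x1B reverses the four halfwords. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 v = _mm_setr_pi16(10, 20, 30, 40);
  __m64 r = _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3)); /* 40, 30, 20, 10 */
  printf("%d %d\n", _mm_extract_pi16(r, 0), _mm_extract_pi16(r, 3));
  /* prints "40 10" */
  return 0;
}
#endif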
1643 | |
1644 | /* Conditionally store byte elements of A into P. The high bit of each |
1645 | byte in the selector N determines whether the corresponding byte from |
1646 | A is stored. */ |
1647 | extern __inline void |
1648 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1649 | _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { |
1650 | __m64 __hibit = 0x8080808080808080UL; |
1651 | __m64 __mask, __tmp; |
1652 | __m64 *__p = (__m64 *)__P; |
1653 | |
1654 | __tmp = *__p; |
1655 | __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit); |
1656 | __tmp = (__tmp & (~__mask)) | (__A & __mask); |
1657 | *__p = __tmp; |
1658 | } |
1659 | |
1660 | extern __inline void |
1661 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1662 | _m_maskmovq(__m64 __A, __m64 __N, char *__P) { |
1663 | _mm_maskmove_si64(__A, __N, __P); |
1664 | } |
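/* Editorial usage sketch (not part of the original header): only bytes whose
   mask byte has its high bit set are written; the remaining destination
   bytes are left untouched. */
#if 0
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

int main(void) {
  char dst[8];
  memset(dst, '.', sizeof(dst));
  __m64 data = _mm_setr_pi8('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H');
  __m64 mask = _mm_setr_pi8(-1, 0, -1, 0, 0, 0, 0, -1); /* write bytes 0, 2, 7 */
  _mm_maskmove_si64(data, mask, dst);
  printf("%.8s\n", dst); /* expected: "A.C....H" */
  return 0;
}
#endif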
1665 | |
1666 | /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ |
1667 | extern __inline __m64 |
1668 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1669 | _mm_avg_pu8(__m64 __A, __m64 __B) { |
1670 | __vector unsigned char __a, __b, __c; |
1671 | |
1672 | __a = (__vector unsigned char)vec_splats(__A); |
1673 | __b = (__vector unsigned char)vec_splats(__B); |
1674 | __c = vec_avg(__a, __b); |
1675 | return (__m64)((__vector long long)__c)[0]; |
1676 | } |
1677 | |
1678 | extern __inline __m64 |
1679 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1680 | _m_pavgb(__m64 __A, __m64 __B) { |
1681 | return _mm_avg_pu8(__A, __B); |
1682 | } |
1683 | |
1684 | /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ |
1685 | extern __inline __m64 |
1686 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1687 | _mm_avg_pu16(__m64 __A, __m64 __B) { |
1688 | __vector unsigned short __a, __b, __c; |
1689 | |
1690 | __a = (__vector unsigned short)vec_splats(__A); |
1691 | __b = (__vector unsigned short)vec_splats(__B); |
1692 | __c = vec_avg(__a, __b); |
1693 | return (__m64)((__vector long long)__c)[0]; |
1694 | } |
1695 | |
1696 | extern __inline __m64 |
1697 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1698 | _m_pavgw(__m64 __A, __m64 __B) { |
1699 | return _mm_avg_pu16(__A, __B); |
1700 | } |
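/* Editorial usage sketch (not part of the original header): the "rounded
   average" is (a + b + 1) >> 1, so averaging 4 and 7 yields 6, not 5. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi8(4, 10, 0, 0, 0, 0, 0, 0);
  __m64 b = _mm_setr_pi8(7, 10, 1, 0, 0, 0, 0, 0);
  __m64 r = _mm_avg_pu8(a, b); /* bytes: 6, 10, 1, 0, ... */
  printf("%d\n", _mm_extract_pi16(r, 0) & 0xff); /* expected: 6 */
  return 0;
}
#endif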
1701 | |
1702 | /* Compute the sum of the absolute differences of the unsigned 8-bit |
1703 | values in A and B. Return the value in the lower 16-bit word; the |
1704 | upper words are cleared. */ |
1705 | extern __inline __m64 |
1706 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1707 | _mm_sad_pu8(__m64 __A, __m64 __B) { |
1708 | __vector unsigned char __a, __b; |
1709 | __vector unsigned char __vmin, __vmax, __vabsdiff; |
1710 | __vector signed int __vsum; |
1711 | const __vector unsigned int __zero = {0, 0, 0, 0}; |
1712 | __m64_union __result = {0}; |
1713 | |
1714 | __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A}; |
1715 | __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B}; |
1716 | __vmin = vec_min(__a, __b); |
1717 | __vmax = vec_max(__a, __b); |
1718 | __vabsdiff = vec_sub(__vmax, __vmin); |
1719 | /* Sum four groups of bytes into integers. */ |
1720 | __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero); |
1721 | /* Sum across four integers with integer result. */ |
1722 | __vsum = vec_sums(__vsum, (__vector signed int)__zero); |
1723 | /* The sum is in the rightmost 32 bits of the vector result. |
1724 | Transfer to a GPR and truncate to 16 bits. */ |
1725 | __result.as_short[0] = __vsum[3]; |
1726 | return __result.as_m64; |
1727 | } |
1728 | |
1729 | extern __inline __m64 |
1730 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1731 | _m_psadbw(__m64 __A, __m64 __B) { |
1732 | return _mm_sad_pu8(__A, __B); |
1733 | } |
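/* Editorial usage sketch (not part of the original header): the sum of
   absolute byte differences, e.g. |1-4| + |2-2| + |7-1| = 9, lands in the
   low 16-bit lane with the remaining lanes cleared. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m64 a = _mm_setr_pi8(1, 2, 7, 0, 0, 0, 0, 0);
  __m64 b = _mm_setr_pi8(4, 2, 1, 0, 0, 0, 0, 0);
  printf("%d\n", _mm_extract_pi16(_mm_sad_pu8(a, b), 0)); /* expected: 9 */
  return 0;
}
#endif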
1734 | |
1735 | /* Stores the data in A to the address P without polluting the caches. */ |
1736 | extern __inline void |
1737 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1738 | _mm_stream_pi(__m64 *__P, __m64 __A) { |
1739 | /* Use the data cache block touch for store transient. */ |
1740 | __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); |
1741 | *__P = __A; |
1742 | } |
1743 | |
1744 | /* Likewise. The address must be 16-byte aligned. */ |
1745 | extern __inline void |
1746 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1747 | _mm_stream_ps(float *__P, __m128 __A) { |
1748 | /* Use the data cache block touch for store transient. */ |
1749 | __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); |
1750 | _mm_store_ps(__P, __A); |
1751 | } |
1752 | |
1753 | /* Guarantees that every preceding store is globally visible before |
1754 | any subsequent store. */ |
1755 | extern __inline void |
1756 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1757 | _mm_sfence(void) { |
1758 | /* Generate a lightweight sync. */ |
1759 | __atomic_thread_fence(__ATOMIC_RELEASE); |
1760 | } |
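/* Editorial usage sketch (not part of the original header): a typical
   non-temporal store pattern -- fill a buffer with _mm_stream_ps, then
   publish it with _mm_sfence before another thread reads it.  The buffer
   and flag names are illustrative only; a real program would use a proper
   atomic for the flag. */
#if 0
#include <xmmintrin.h>

/* 16-byte aligned destination, as _mm_stream_ps requires. */
static float __attribute__((aligned(16))) out_buf[64];

void fill_and_publish(volatile int *ready_flag) {
  __m128 v = _mm_set1_ps(1.0f);
  for (int i = 0; i < 64; i += 4)
    _mm_stream_ps(&out_buf[i], v);
  _mm_sfence();    /* make the stores visible before raising the flag */
  *ready_flag = 1;
}
#endif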
1761 | |
1762 | /* The execution of the next instruction is delayed by an implementation- |
1763 | specific amount of time.  The instruction does not modify the |
1764 | architectural state.  (In the x86 header this intrinsic sits outside the |
1765 | SSE target pragma because it does not require SSE support; the encoding |
1766 | is simply a NOP on processors that do not support it.) */ |
1767 | extern __inline void |
1768 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1769 | _mm_pause(void) { |
1770 | /* There is no exact match with this construct, but the following is |
1771 | close to the desired effect. */ |
1772 | #if _ARCH_PWR8 |
1773 | /* On POWER8 and later processors we can depend on Program Priority |
1774 | (PRI) and its associated "very low" setting.  Since we don't know |
1775 | what PRI this thread is running at, we: 1) save the current PRI |
1776 | from the PPR SPR into a local GPR, 2) set the PRI to "very low" |
1777 | via the special or 31,31,31 encoding, 3) issue an "isync" to |
1778 | ensure the PRI change takes effect before we execute any more |
1779 | instructions. |
1780 | Now we can execute an lwsync (release barrier) while this thread |
1781 | runs at "very low" PRI.  Finally we restore the original PRI and |
1782 | continue execution.  */ |
1783 | unsigned long __PPR; |
1784 | |
1785 | __asm__ volatile(" mfppr %0;" |
1786 | " or 31,31,31;" |
1787 | " isync;" |
1788 | " lwsync;" |
1789 | " isync;" |
1790 | " mtppr %0;" |
1791 | : "=r"(__PPR) |
1792 | : |
1793 | : "memory"); |
1794 | #else |
1795 | /* For older processors, where we may not even have Program Priority |
1796 | controls, we can only depend on a heavyweight sync. */ |
1797 | __atomic_thread_fence(__ATOMIC_SEQ_CST); |
1798 | #endif |
1799 | } |
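/* Editorial usage sketch (not part of the original header): the usual home
   of _mm_pause is a spin-wait loop, where dropping the thread priority (or
   issuing a heavier sync on older cores) makes the busy wait cheaper for
   the rest of the system.  The flag name is illustrative only. */
#if 0
#include <xmmintrin.h>

void spin_until_ready(volatile int *ready_flag) {
  while (!*ready_flag)
    _mm_pause(); /* back off politely while waiting */
}
#endif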
1800 | |
1801 | /* Transpose the 4x4 matrix composed of row[0-3]. */ |
1802 | #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ |
1803 | do { \ |
1804 | __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ |
1805 | __v4sf __t0 = vec_vmrghw(__r0, __r1); \ |
1806 | __v4sf __t1 = vec_vmrghw(__r2, __r3); \ |
1807 | __v4sf __t2 = vec_vmrglw(__r0, __r1); \ |
1808 | __v4sf __t3 = vec_vmrglw(__r2, __r3); \ |
1809 | (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \ |
1810 | (__vector long long)__t1); \ |
1811 | (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \ |
1812 | (__vector long long)__t1); \ |
1813 | (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \ |
1814 | (__vector long long)__t3); \ |
1815 | (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \ |
1816 | (__vector long long)__t3); \ |
1817 | } while (0) |
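/* Editorial usage sketch (not part of the original header): transposing a
   4x4 matrix held in four __m128 rows in place. */
#if 0
#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m128 m0 = _mm_setr_ps(1, 2, 3, 4);
  __m128 m1 = _mm_setr_ps(5, 6, 7, 8);
  __m128 m2 = _mm_setr_ps(9, 10, 11, 12);
  __m128 m3 = _mm_setr_ps(13, 14, 15, 16);
  _MM_TRANSPOSE4_PS(m0, m1, m2, m3);
  float out[4];
  _mm_storeu_ps(out, m0); /* the first row becomes the first column */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  /* prints "1 5 9 13" */
  return 0;
}
#endif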
1818 | |
1819 | /* For backward source compatibility. */ |
1820 | //# include <emmintrin.h> |
1821 | |
1822 | #else |
1823 | #include_next <xmmintrin.h> |
1824 | #endif /* defined(__powerpc64__) && \ |
1825 | * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ |
1826 | |
1827 | #endif /* XMMINTRIN_H_ */ |
1828 |