/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help porting code that uses Intel
   intrinsics explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector double SIMD operations.
   However, scalar double operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level.  There are differences in the data
   format and placement of double scalars in the vector register, which
   require extra steps to match SSE2 scalar double semantics on POWER.

   It should be noted that there are significant differences between the
   X86_64 MXCSR register and the PowerISA FPSCR/VSCR registers.  It is
   recommended to use the portable <fenv.h> interfaces instead of accessing
   the MXCSR directly.

   Most SSE2 scalar double intrinsic operations can be performed more
   efficiently as C language double scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications.
*/
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
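
/* Porting note (an illustrative addition, not part of the upstream header):
   defining NO_WARN_X86_INTRINSICS only silences the #error above; the
   intrinsics themselves are implemented with the VMX/VSX operations defined
   below.  A minimal port of existing SSE2 code might look like:

     // Hypothetical example, assuming a POWER8 Linux target:
     //   clang -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS example.c
     #include <emmintrin.h>

     __m128d add2(__m128d __a, __m128d __b) { return _mm_add_pd(__a, __b); }
*/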

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector float __v4f;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef double __m128d_u
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));

/* Define two value permute mask.  */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
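
/* Illustrative note (not part of the upstream header): the two-bit mask
   built by _MM_SHUFFLE2 is consumed by _mm_shuffle_pd further below, where
   bit 0 selects which element of the first operand goes to result element 0
   and bit 1 selects which element of the second operand goes to result
   element 1.  For example, with a = {1.0, 2.0} and b = {3.0, 4.0}:

     _mm_shuffle_pd(a, b, _MM_SHUFFLE2(0, 1));  // mask 0b01 -> {2.0, 3.0}
     _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0));  // mask 0b10 -> {1.0, 4.0}
*/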

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_sd(double __F) {
  return __extension__(__m128d){__F, 0.0};
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pd(double __F) {
  return __extension__(__m128d){__F, __F};
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd1(double __F) {
  return _mm_set1_pd(__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pd(double __W, double __X) {
  return __extension__(__m128d){__X, __W};
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pd(double __W, double __X) {
  return __extension__(__m128d){__W, __X};
}

/* Create an undefined vector.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_pd(void) {
  __m128d __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_pd(void) {
  return (__m128d)vec_splats(0);
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_sd(__m128d __A, __m128d __B) {
  __v2df __result = (__v2df)__A;
  __result[0] = ((__v2df)__B)[0];
  return (__m128d)__result;
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd(double const *__P) {
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_pd(double const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Create a vector with all two elements equal to *P.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load1_pd(double const *__P) {
  return (vec_splats(*__P));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_sd(double const *__P) {
  return _mm_set_sd(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_pd1(double const *__P) {
  return _mm_load1_pd(__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_pd(double const *__P) {
  __v2df __tmp = _mm_load_pd(__P);
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd(double *__P, __m128d __A) {
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_pd(double *__P, __m128d __A) {
  *(__m128d_u *)__P = __A;
}

/* Stores the lower DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_sd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[0];
}

extern __inline double
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_f64(__m128d __A) {
  return ((__v2df)__A)[0];
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pd(double *__P, __m128d __A) {
  _mm_store_sd(__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pd(double *__P, __m128d __A) {
  *__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_splat(__A, 0));
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_pd1(double *__P, __m128d __A) {
  _mm_store1_pd(__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_pd(double *__P, __m128d __A) {
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64(__m128i __A) {
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si64x(__m128i __A) {
  return ((__v2di)__A)[0];
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A + (__v2df)__B);
}

/* Add the lower double-precision (64-bit) floating-point element in
   a and b, store the result in the lower element of dst, and copy
   the upper element from a to the upper element of dst.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] + __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] - __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] * __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_pd(__m128d __A, __m128d __B) {
  return (__m128d)((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_sd(__m128d __A, __m128d __B) {
  __A[0] = __A[0] / __B[0];
  return (__A);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_pd(__m128d __A) {
  return (vec_sqrt(__A));
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
  __v2df __c;
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pd(__m128d __A, __m128d __B) {
  return (vec_min(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_min(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pd(__m128d __A, __m128d __B) {
  return (vec_max(__A, __B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = vec_max(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  return ((__m128d)vec_nor(__temp, __temp));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq(__A, __A);
  __d = (__v2du)vec_cmpeq(__B, __B);
  /* A != NAN and B != NAN.  */
  return ((__m128d)vec_and(__c, __d));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
#if _ARCH_PWR8
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
  __c = vec_nor(__c, __c);
  return ((__m128d)vec_orc(__c, __d));
#else
  __v2du __c, __d;
  /* Compare against self will return false (0's) if NAN.  */
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks NAN.  */
  __c = vec_nor(__c, __c);
  __d = vec_nor(__d, __d);
  return ((__m128d)vec_or(__c, __d));
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  __c = (__v2df)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than is just greater than or equal.  */
  __c = (__v2df)vec_cmpge(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than is just less than or equal.  */
  __c = (__v2df)vec_cmple(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  __v2df __a, __b, __c;
  __a = vec_splats(__A[0]);
  __b = vec_splats(__B[0]);
  /* Not greater than or equal is just less than.  */
  __c = (__v2df)vec_cmplt(__a, __b);
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  __v2df __r;
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
}

/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  return (__A[0] != __B[0]);
}

/* Create a vector of Qi, where i is the element number.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
                  short __q2, short __q1, short __q0) {
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                        __q4, __q5, __q6, __q7};
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
                 char __q10, char __q09, char __q08, char __q07, char __q06,
                 char __q05, char __q04, char __q03, char __q02, char __q01,
                 char __q00) {
  return __extension__(__m128i)(__v16qi){
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
}

/* Set all of the elements of the vector to A.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64x(long long __A) {
  return _mm_set_epi64x(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi64(__m64 __A) {
  return _mm_set_epi64(__A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi32(int __A) {
  return _mm_set_epi32(__A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi16(short __A) {
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_epi8(char __A) {
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
                      __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
                   short __q5, short __q6, short __q7) {
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
                  char __q05, char __q06, char __q07, char __q08, char __q09,
                  char __q10, char __q11, char __q12, char __q13, char __q14,
                  char __q15) {
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_si128(__m128i const *__P) {
  return *__P;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_si128(__m128i_u const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_epi64(__m128i_u const *__P) {
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_si128(__m128i *__P, __m128i __B) {
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  *__P = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  *(long long *)__P = ((__v2di)__B)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movepi64_pi64(__m128i_u __B) {
  return (__m64)((__v2di)__B)[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movpi64_epi64(__m64 __A) {
  return _mm_set_epi64((__m64)0LL, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_epi64(__m128i __A) {
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
}

/* Create an undefined vector.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_si128(void) {
  __m128i __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_pd(__m128i __A) {
  __v2di __val;
  /* For LE the instruction needed is Vector Unpack Low Signed Word,
     which is what vec_unpackh generates here.  */
  __val = (__v2di)vec_unpackh((__v4si)__A);

  return (__m128d)vec_ctf(__val, 0);
}
#endif

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_ps(__m128i __A) {
  return ((__m128)vec_ctf((__v4si)__A, 0));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_epi32(__m128d __A) {
  __v2df __rounded = vec_rint(__A);
  __v4si __result, __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return (__m128i)__result;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvtpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpd_ps(__m128d __A) {
  __v4sf __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif
  return ((__m128)__result);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_epi32(__m128d __A) {
  __v4si __result;
  __v4si __temp;
  const __v4si __vzero = {0, 0, 0, 0};

  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);

#ifdef _ARCH_PWR8
#ifdef __LITTLE_ENDIAN__
  __temp = vec_mergeo(__temp, __temp);
#else
  __temp = vec_mergee(__temp, __temp);
#endif
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
                                 (__vector long long)__vzero);
#else
  {
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  }
#endif

  return ((__m128i)__result);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttpd_pi32(__m128d __A) {
  __m128i __result = _mm_cvttpd_epi32(__A);

  return (__m64)__result[0];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi128_si32(__m128i __A) {
  return ((__v4si)__A)[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_pd(__m64 __A) {
  __v4si __temp;
  __v2di __tmp2;
  __v4f __result;

  __temp = (__v4si)vec_splats(__A);
  __tmp2 = (__v2di)vec_unpackl(__temp);
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
  return (__m128d)__result;
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_epi32(__m128 __A) {
  __v4sf __rounded;
  __v4si __result;

  __rounded = vec_rint((__v4sf)__A);
  __result = vec_cts(__rounded, 0);
  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_epi32(__m128 __A) {
  __v4si __result;

  __result = vec_cts((__v4sf)__A, 0);
  return (__m128i)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pd(__m128 __A) {
  /* Check if vec_doubleh is defined by <altivec.h>.  If so use that.  */
#ifdef vec_doubleh
  return (__m128d)vec_doubleh((__v4sf)__A);
#else
  /* Otherwise the compiler is not current and so we need to generate the
     equivalent code.  */
  __v4sf __a = (__v4sf)__A;
  __v4sf __temp;
  __v2df __result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a vector
     merge high to get the elements lined up.  */
  __temp = vec_vmrghw(__a, __a);
#endif
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  return (__m128d)__result;
#endif
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si32(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  int __result = ((__v2df)__rounded)[0];

  return __result;
}
/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64(__m128d __A) {
  __v2df __rounded = vec_rint((__v2df)__A);
  long long __result = ((__v2df)__rounded)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_si64x(__m128d __A) {
  return _mm_cvtsd_si64((__v2df)__A);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si32(__m128d __A) {
  int __result = ((__v2df)__A)[0];

  return __result;
}

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64(__m128d __A) {
  long long __result = ((__v2df)__A)[0];

  return __result;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttsd_si64x(__m128d __A) {
  return _mm_cvttsd_si64(__A);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  __v4sf __result = (__v4sf)__A;

#ifdef __LITTLE_ENDIAN__
  __v4sf __temp_s;
  /* Copy double element[0] to element [1] for conversion.  */
  __v2df __temp_b = vec_splat((__v2df)__B, 0);

  /* Pre-rotate __A left 3 (logically right 1) elements.  */
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  /* Convert double to single float scalar in a vector.  */
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  /* Shift the resulting scalar into vector element [0].  */
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
#else
  __result[0] = ((__v2df)__B)[0];
#endif
  return (__m128)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_sd(__m128d __A, int __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Intel intrinsic.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
  __v2df __result = (__v2df)__A;
  double __db = __B;
  __result[0] = __db;
  return (__m128d)__result;
}

/* Microsoft intrinsic.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
  return _mm_cvtsi64_sd(__A, __B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
#ifdef __LITTLE_ENDIAN__
  /* Use splat to move element [0] into position for the convert.  */
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
  __v2df __res;
  /* Convert single float scalar to double in a vector.  */
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  return (__m128d)vec_mergel(__res, (__v2df)__A);
#else
  __v2df __res = (__v2df)__A;
  __res[0] = ((__v4sf)__B)[0];
  return (__m128d)__res;
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
  __vector double __result;
  const int __litmsk = __mask & 0x3;

  if (__litmsk == 0)
    __result = vec_mergeh(__A, __B);
#if __GNUC__ < 6
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__B, __A, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__B, __A, 1);
#else
  else if (__litmsk == 1)
    __result = vec_xxpermdi(__A, __B, 2);
  else if (__litmsk == 2)
    __result = vec_xxpermdi(__A, __B, 1);
#endif
  else
    __result = vec_mergel(__A, __B);

  return __result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[1] = *__B;
  return (__m128d)__result;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pd(__m128d __A, double const *__B) {
  __v2df __result = (__v2df)__A;
  __result[0] = *__B;
  return (__m128d)__result;
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pd(__m128d __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v2du)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
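
/* Illustrative note (not part of the upstream header): _mm_movemask_pd packs
   the sign bits of the two doubles into the low two bits of the result, with
   element 0 in bit 0.  For example, on a POWER8 or later target:

     __m128d v = _mm_set_pd(2.0, -1.0); // element 0 = -1.0, element 1 = 2.0
     int m = _mm_movemask_pd(v);        // m == 0x1: only element 0 is negative
*/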
1212 | |
1213 | extern __inline __m128i |
1214 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1215 | _mm_packs_epi16(__m128i __A, __m128i __B) { |
1216 | return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B); |
1217 | } |
1218 | |
1219 | extern __inline __m128i |
1220 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1221 | _mm_packs_epi32(__m128i __A, __m128i __B) { |
1222 | return (__m128i)vec_packs((__v4si)__A, (__v4si)__B); |
1223 | } |
1224 | |
1225 | extern __inline __m128i |
1226 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1227 | _mm_packus_epi16(__m128i __A, __m128i __B) { |
1228 | return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B); |
1229 | } |
1230 | |
1231 | extern __inline __m128i |
1232 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1233 | _mm_unpackhi_epi8(__m128i __A, __m128i __B) { |
1234 | return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B); |
1235 | } |
1236 | |
1237 | extern __inline __m128i |
1238 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1239 | _mm_unpackhi_epi16(__m128i __A, __m128i __B) { |
1240 | return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B); |
1241 | } |
1242 | |
1243 | extern __inline __m128i |
1244 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1245 | _mm_unpackhi_epi32(__m128i __A, __m128i __B) { |
1246 | return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B); |
1247 | } |
1248 | |
1249 | extern __inline __m128i |
1250 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1251 | _mm_unpackhi_epi64(__m128i __A, __m128i __B) { |
1252 | return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B); |
1253 | } |
1254 | |
1255 | extern __inline __m128i |
1256 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1257 | _mm_unpacklo_epi8(__m128i __A, __m128i __B) { |
1258 | return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B); |
1259 | } |
1260 | |
1261 | extern __inline __m128i |
1262 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1263 | _mm_unpacklo_epi16(__m128i __A, __m128i __B) { |
1264 | return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B); |
1265 | } |
1266 | |
1267 | extern __inline __m128i |
1268 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1269 | _mm_unpacklo_epi32(__m128i __A, __m128i __B) { |
1270 | return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B); |
1271 | } |
1272 | |
1273 | extern __inline __m128i |
1274 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1275 | _mm_unpacklo_epi64(__m128i __A, __m128i __B) { |
1276 | return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B); |
1277 | } |
1278 | |
1279 | extern __inline __m128i |
1280 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1281 | _mm_add_epi8(__m128i __A, __m128i __B) { |
1282 | return (__m128i)((__v16qu)__A + (__v16qu)__B); |
1283 | } |
1284 | |
1285 | extern __inline __m128i |
1286 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1287 | _mm_add_epi16(__m128i __A, __m128i __B) { |
1288 | return (__m128i)((__v8hu)__A + (__v8hu)__B); |
1289 | } |
1290 | |
1291 | extern __inline __m128i |
1292 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1293 | _mm_add_epi32(__m128i __A, __m128i __B) { |
1294 | return (__m128i)((__v4su)__A + (__v4su)__B); |
1295 | } |
1296 | |
1297 | extern __inline __m128i |
1298 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1299 | _mm_add_epi64(__m128i __A, __m128i __B) { |
1300 | return (__m128i)((__v2du)__A + (__v2du)__B); |
1301 | } |
1302 | |
1303 | extern __inline __m128i |
1304 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1305 | _mm_adds_epi8(__m128i __A, __m128i __B) { |
1306 | return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B); |
1307 | } |
1308 | |
1309 | extern __inline __m128i |
1310 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1311 | _mm_adds_epi16(__m128i __A, __m128i __B) { |
1312 | return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B); |
1313 | } |
1314 | |
1315 | extern __inline __m128i |
1316 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1317 | _mm_adds_epu8(__m128i __A, __m128i __B) { |
1318 | return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B); |
1319 | } |
1320 | |
1321 | extern __inline __m128i |
1322 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1323 | _mm_adds_epu16(__m128i __A, __m128i __B) { |
1324 | return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B); |
1325 | } |
1326 | |
1327 | extern __inline __m128i |
1328 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1329 | _mm_sub_epi8(__m128i __A, __m128i __B) { |
1330 | return (__m128i)((__v16qu)__A - (__v16qu)__B); |
1331 | } |
1332 | |
1333 | extern __inline __m128i |
1334 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1335 | _mm_sub_epi16(__m128i __A, __m128i __B) { |
1336 | return (__m128i)((__v8hu)__A - (__v8hu)__B); |
1337 | } |
1338 | |
1339 | extern __inline __m128i |
1340 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1341 | _mm_sub_epi32(__m128i __A, __m128i __B) { |
1342 | return (__m128i)((__v4su)__A - (__v4su)__B); |
1343 | } |
1344 | |
1345 | extern __inline __m128i |
1346 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1347 | _mm_sub_epi64(__m128i __A, __m128i __B) { |
1348 | return (__m128i)((__v2du)__A - (__v2du)__B); |
1349 | } |
1350 | |
1351 | extern __inline __m128i |
1352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1353 | _mm_subs_epi8(__m128i __A, __m128i __B) { |
1354 | return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B); |
1355 | } |
1356 | |
1357 | extern __inline __m128i |
1358 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1359 | _mm_subs_epi16(__m128i __A, __m128i __B) { |
1360 | return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B); |
1361 | } |
1362 | |
1363 | extern __inline __m128i |
1364 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1365 | _mm_subs_epu8(__m128i __A, __m128i __B) { |
1366 | return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B); |
1367 | } |
1368 | |
1369 | extern __inline __m128i |
1370 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1371 | _mm_subs_epu16(__m128i __A, __m128i __B) { |
1372 | return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B); |
1373 | } |
1374 | |
1375 | extern __inline __m128i |
1376 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1377 | _mm_madd_epi16(__m128i __A, __m128i __B) { |
1378 | __vector signed int __zero = {0, 0, 0, 0}; |
1379 | |
1380 | return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero); |
1381 | } |
1382 | |
1383 | extern __inline __m128i |
1384 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1385 | _mm_mulhi_epi16(__m128i __A, __m128i __B) { |
1386 | __vector signed int __w0, __w1; |
1387 | |
1388 | __vector unsigned char __xform1 = { |
1389 | #ifdef __LITTLE_ENDIAN__ |
1390 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, |
1391 | 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F |
1392 | #else |
1393 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, |
1394 | 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D |
1395 | #endif |
1396 | }; |
1397 | |
1398 | __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B); |
1399 | __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B); |
1400 | return (__m128i)vec_perm(__w0, __w1, __xform1); |
1401 | } |
1402 | |
1403 | extern __inline __m128i |
1404 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1405 | _mm_mullo_epi16(__m128i __A, __m128i __B) { |
1406 | return (__m128i)((__v8hi)__A * (__v8hi)__B); |
1407 | } |
1408 | |
1409 | extern __inline __m64 |
1410 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1411 | _mm_mul_su32(__m64 __A, __m64 __B) { |
1412 | unsigned int __a = __A; |
1413 | unsigned int __b = __B; |
1414 | |
1415 | return ((__m64)__a * (__m64)__b); |
1416 | } |
1417 | |
1418 | #ifdef _ARCH_PWR8 |
1419 | extern __inline __m128i |
1420 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1421 | _mm_mul_epu32(__m128i __A, __m128i __B) { |
1422 | #if __GNUC__ < 8 |
1423 | __v2du __result; |
1424 | |
1425 | #ifdef __LITTLE_ENDIAN__ |
1426 | /* VMX Vector Multiply Odd Unsigned Word. */ |
1427 | __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); |
1428 | #else |
1429 | /* VMX Vector Multiply Even Unsigned Word. */ |
1430 | __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :); |
1431 | #endif |
1432 | return (__m128i)__result; |
1433 | #else |
1434 | return (__m128i)vec_mule((__v4su)__A, (__v4su)__B); |
1435 | #endif |
1436 | } |
1437 | #endif |
1438 | |
1439 | extern __inline __m128i |
1440 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1441 | _mm_slli_epi16(__m128i __A, int __B) { |
1442 | __v8hu __lshift; |
1443 | __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; |
1444 | |
1445 | if (__B >= 0 && __B < 16) { |
1446 | if (__builtin_constant_p(__B)) |
1447 | __lshift = (__v8hu)vec_splat_s16(__B); |
1448 | else |
1449 | __lshift = vec_splats((unsigned short)__B); |
1450 | |
1451 | __result = vec_sl((__v8hi)__A, __lshift); |
1452 | } |
1453 | |
1454 | return (__m128i)__result; |
1455 | } |
1456 | |
1457 | extern __inline __m128i |
1458 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1459 | _mm_slli_epi32(__m128i __A, int __B) { |
1460 | __v4su __lshift; |
1461 | __v4si __result = {0, 0, 0, 0}; |
1462 | |
1463 | if (__B >= 0 && __B < 32) { |
1464 | if (__builtin_constant_p(__B) && __B < 16) |
1465 | __lshift = (__v4su)vec_splat_s32(__B); |
1466 | else |
1467 | __lshift = vec_splats((unsigned int)__B); |
1468 | |
1469 | __result = vec_sl((__v4si)__A, __lshift); |
1470 | } |
1471 | |
1472 | return (__m128i)__result; |
1473 | } |
1474 | |
1475 | #ifdef _ARCH_PWR8 |
1476 | extern __inline __m128i |
1477 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1478 | _mm_slli_epi64(__m128i __A, int __B) { |
1479 | __v2du __lshift; |
1480 | __v2di __result = {0, 0}; |
1481 | |
1482 | if (__B >= 0 && __B < 64) { |
1483 | if (__builtin_constant_p(__B) && __B < 16) |
1484 | __lshift = (__v2du)vec_splat_s32(__B); |
1485 | else |
1486 | __lshift = (__v2du)vec_splats((unsigned int)__B); |
1487 | |
1488 | __result = vec_sl((__v2di)__A, __lshift); |
1489 | } |
1490 | |
1491 | return (__m128i)__result; |
1492 | } |
1493 | #endif |
1494 | |
1495 | extern __inline __m128i |
1496 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1497 | _mm_srai_epi16(__m128i __A, int __B) { |
1498 | __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15}; |
1499 | __v8hi __result; |
1500 | |
1501 | if (__B < 16) { |
1502 | if (__builtin_constant_p(__B)) |
1503 | __rshift = (__v8hu)vec_splat_s16(__B); |
1504 | else |
1505 | __rshift = vec_splats((unsigned short)__B); |
1506 | } |
1507 | __result = vec_sra((__v8hi)__A, __rshift); |
1508 | |
1509 | return (__m128i)__result; |
1510 | } |
1511 | |
1512 | extern __inline __m128i |
1513 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1514 | _mm_srai_epi32(__m128i __A, int __B) { |
1515 | __v4su __rshift = {31, 31, 31, 31}; |
1516 | __v4si __result; |
1517 | |
1518 | if (__B < 32) { |
1519 | if (__builtin_constant_p(__B)) { |
1520 | if (__B < 16) |
1521 | __rshift = (__v4su)vec_splat_s32(__B); |
1522 | else |
1523 | __rshift = (__v4su)vec_splats((unsigned int)__B); |
1524 | } else |
1525 | __rshift = vec_splats((unsigned int)__B); |
1526 | } |
1527 | __result = vec_sra((__v4si)__A, __rshift); |
1528 | |
1529 | return (__m128i)__result; |
1530 | } |
1531 | |
1532 | extern __inline __m128i |
1533 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1534 | _mm_bslli_si128(__m128i __A, const int __N) { |
1535 | __v16qu __result; |
1536 | const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
1537 | |
1538 | if (__N < 16) |
1539 | __result = vec_sld((__v16qu)__A, __zeros, __N); |
1540 | else |
1541 | __result = __zeros; |
1542 | |
1543 | return (__m128i)__result; |
1544 | } |
1545 | |
1546 | extern __inline __m128i |
1547 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1548 | _mm_bsrli_si128(__m128i __A, const int __N) { |
1549 | __v16qu __result; |
1550 | const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
1551 | |
1552 | if (__N < 16) |
1553 | #ifdef __LITTLE_ENDIAN__ |
1554 | if (__builtin_constant_p(__N)) |
1555 | /* When __N is a compile-time constant, use Vector Shift Left
1556 | Double by Octet Immediate so the immediate form is emitted,
1557 | avoiding a load of the __N * 8 value into a separate VR. */
1558 | __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N)); |
1559 | else |
1560 | #endif |
1561 | { |
1562 | __v16qu __shift = vec_splats((unsigned char)(__N * 8)); |
1563 | #ifdef __LITTLE_ENDIAN__ |
1564 | __result = vec_sro((__v16qu)__A, __shift); |
1565 | #else |
1566 | __result = vec_slo((__v16qu)__A, __shift); |
1567 | #endif |
1568 | } |
1569 | else |
1570 | __result = __zeros; |
1571 | |
1572 | return (__m128i)__result; |
1573 | } |
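
/* Illustrative usage sketch, not part of the original header: the byte
   shifts above treat __A as one 128-bit value, so shifting right by __N
   bytes moves data toward the low-order end and fills the top with zeros.
   The __example_* helper is hypothetical and uses the _mm_set_epi8
   constructor defined earlier in this header; the block is guarded out so
   the header itself is unchanged.  */
#if 0
static __inline __m128i __example_bsrli_si128(void) {
  /* Bytes 0..15 hold 15 down to 0, i.e. the 128-bit value
     0x000102030405060708090A0B0C0D0E0F.  */
  __m128i __x =
      _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  /* Shifting right by 4 bytes yields 0x00000000000102030405060708090A0B.  */
  return _mm_bsrli_si128(__x, 4);
}
#endif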
1574 | |
1575 | extern __inline __m128i |
1576 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1577 | _mm_srli_si128(__m128i __A, const int __N) { |
1578 | return _mm_bsrli_si128(__A, __N); |
1579 | } |
1580 | |
1581 | extern __inline __m128i |
1582 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1583 | _mm_slli_si128(__m128i __A, const int __N) {
1584 | __v16qu __result;
1585 | const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1586 |
1587 | if (__N < 16)
1588 | #ifdef __LITTLE_ENDIAN__
1589 | __result = vec_sld((__v16qu)__A, __zeros, __N);
1590 | #else
1591 | __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1592 | #endif |
1593 | else |
1594 | __result = __zeros; |
1595 | |
1596 | return (__m128i)__result; |
1597 | } |
1598 | |
1599 | extern __inline __m128i |
1600 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1602 | _mm_srli_epi16(__m128i __A, int __B) { |
1603 | __v8hu __rshift; |
1604 | __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0}; |
1605 | |
1606 | if (__B < 16) { |
1607 | if (__builtin_constant_p(__B)) |
1608 | __rshift = (__v8hu)vec_splat_s16(__B); |
1609 | else |
1610 | __rshift = vec_splats((unsigned short)__B); |
1611 | |
1612 | __result = vec_sr((__v8hi)__A, __rshift); |
1613 | } |
1614 | |
1615 | return (__m128i)__result; |
1616 | } |
1617 | |
1618 | extern __inline __m128i |
1619 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1620 | _mm_srli_epi32(__m128i __A, int __B) { |
1621 | __v4su __rshift; |
1622 | __v4si __result = {0, 0, 0, 0}; |
1623 | |
1624 | if (__B < 32) { |
1625 | if (__builtin_constant_p(__B)) { |
1626 | if (__B < 16) |
1627 | __rshift = (__v4su)vec_splat_s32(__B); |
1628 | else |
1629 | __rshift = (__v4su)vec_splats((unsigned int)__B); |
1630 | } else |
1631 | __rshift = vec_splats((unsigned int)__B); |
1632 | |
1633 | __result = vec_sr((__v4si)__A, __rshift); |
1634 | } |
1635 | |
1636 | return (__m128i)__result; |
1637 | } |
1638 | |
1639 | #ifdef _ARCH_PWR8 |
1640 | extern __inline __m128i |
1641 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1642 | _mm_srli_epi64(__m128i __A, int __B) { |
1643 | __v2du __rshift; |
1644 | __v2di __result = {0, 0}; |
1645 | |
1646 | if (__B < 64) { |
1647 | if (__builtin_constant_p(__B)) { |
1648 | if (__B < 16) |
1649 | __rshift = (__v2du)vec_splat_s32(__B); |
1650 | else |
1651 | __rshift = (__v2du)vec_splats((unsigned long long)__B); |
1652 | } else |
1653 | __rshift = (__v2du)vec_splats((unsigned int)__B); |
1654 | |
1655 | __result = vec_sr((__v2di)__A, __rshift); |
1656 | } |
1657 | |
1658 | return (__m128i)__result; |
1659 | } |
1660 | #endif |
1661 | |
1662 | extern __inline __m128i |
1663 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1664 | _mm_sll_epi16(__m128i __A, __m128i __B) { |
1665 | __v8hu __lshift; |
1666 | __vector __bool short __shmask; |
1667 | const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15}; |
1668 | __v8hu __result; |
1669 | |
1670 | #ifdef __LITTLE_ENDIAN__ |
1671 | __lshift = vec_splat((__v8hu)__B, 0); |
1672 | #else |
1673 | __lshift = vec_splat((__v8hu)__B, 3); |
1674 | #endif |
1675 | __shmask = vec_cmple(__lshift, __shmax); |
1676 | __result = vec_sl((__v8hu)__A, __lshift); |
1677 | __result = vec_sel((__v8hu)__shmask, __result, __shmask); |
1678 | |
1679 | return (__m128i)__result; |
1680 | } |
1681 | |
1682 | extern __inline __m128i |
1683 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1684 | _mm_sll_epi32(__m128i __A, __m128i __B) { |
1685 | __v4su __lshift; |
1686 | __vector __bool int __shmask; |
1687 | const __v4su __shmax = {32, 32, 32, 32}; |
1688 | __v4su __result; |
1689 | #ifdef __LITTLE_ENDIAN__ |
1690 | __lshift = vec_splat((__v4su)__B, 0); |
1691 | #else |
1692 | __lshift = vec_splat((__v4su)__B, 1); |
1693 | #endif |
1694 | __shmask = vec_cmplt(__lshift, __shmax); |
1695 | __result = vec_sl((__v4su)__A, __lshift); |
1696 | __result = vec_sel((__v4su)__shmask, __result, __shmask); |
1697 | |
1698 | return (__m128i)__result; |
1699 | } |
1700 | |
1701 | #ifdef _ARCH_PWR8 |
1702 | extern __inline __m128i |
1703 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1704 | _mm_sll_epi64(__m128i __A, __m128i __B) { |
1705 | __v2du __lshift; |
1706 | __vector __bool long long __shmask; |
1707 | const __v2du __shmax = {64, 64}; |
1708 | __v2du __result; |
1709 | |
1710 | __lshift = vec_splat((__v2du)__B, 0); |
1711 | __shmask = vec_cmplt(__lshift, __shmax); |
1712 | __result = vec_sl((__v2du)__A, __lshift); |
1713 | __result = vec_sel((__v2du)__shmask, __result, __shmask); |
1714 | |
1715 | return (__m128i)__result; |
1716 | } |
1717 | #endif |
1718 | |
1719 | extern __inline __m128i |
1720 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1721 | _mm_sra_epi16(__m128i __A, __m128i __B) { |
1722 | const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15}; |
1723 | __v8hu __rshift; |
1724 | __v8hi __result; |
1725 | |
1726 | #ifdef __LITTLE_ENDIAN__ |
1727 | __rshift = vec_splat((__v8hu)__B, 0); |
1728 | #else |
1729 | __rshift = vec_splat((__v8hu)__B, 3); |
1730 | #endif |
1731 | __rshift = vec_min(__rshift, __rshmax); |
1732 | __result = vec_sra((__v8hi)__A, __rshift); |
1733 | |
1734 | return (__m128i)__result; |
1735 | } |
1736 | |
1737 | extern __inline __m128i |
1738 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1739 | _mm_sra_epi32(__m128i __A, __m128i __B) { |
1740 | const __v4su __rshmax = {31, 31, 31, 31}; |
1741 | __v4su __rshift; |
1742 | __v4si __result; |
1743 | |
1744 | #ifdef __LITTLE_ENDIAN__ |
1745 | __rshift = vec_splat((__v4su)__B, 0); |
1746 | #else |
1747 | __rshift = vec_splat((__v4su)__B, 1); |
1748 | #endif |
1749 | __rshift = vec_min(__rshift, __rshmax); |
1750 | __result = vec_sra((__v4si)__A, __rshift); |
1751 | |
1752 | return (__m128i)__result; |
1753 | } |
1754 | |
1755 | extern __inline __m128i |
1756 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1757 | _mm_srl_epi16(__m128i __A, __m128i __B) { |
1758 | __v8hu __rshift; |
1759 | __vector __bool short __shmask; |
1760 | const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15}; |
1761 | __v8hu __result; |
1762 | |
1763 | #ifdef __LITTLE_ENDIAN__ |
1764 | __rshift = vec_splat((__v8hu)__B, 0); |
1765 | #else |
1766 | __rshift = vec_splat((__v8hu)__B, 3); |
1767 | #endif |
1768 | __shmask = vec_cmple(__rshift, __shmax); |
1769 | __result = vec_sr((__v8hu)__A, __rshift); |
1770 | __result = vec_sel((__v8hu)__shmask, __result, __shmask); |
1771 | |
1772 | return (__m128i)__result; |
1773 | } |
1774 | |
1775 | extern __inline __m128i |
1776 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1777 | _mm_srl_epi32(__m128i __A, __m128i __B) { |
1778 | __v4su __rshift; |
1779 | __vector __bool int __shmask; |
1780 | const __v4su __shmax = {32, 32, 32, 32}; |
1781 | __v4su __result; |
1782 | |
1783 | #ifdef __LITTLE_ENDIAN__ |
1784 | __rshift = vec_splat((__v4su)__B, 0); |
1785 | #else |
1786 | __rshift = vec_splat((__v4su)__B, 1); |
1787 | #endif |
1788 | __shmask = vec_cmplt(__rshift, __shmax); |
1789 | __result = vec_sr((__v4su)__A, __rshift); |
1790 | __result = vec_sel((__v4su)__shmask, __result, __shmask); |
1791 | |
1792 | return (__m128i)__result; |
1793 | } |
1794 | |
1795 | #ifdef _ARCH_PWR8 |
1796 | extern __inline __m128i |
1797 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1798 | _mm_srl_epi64(__m128i __A, __m128i __B) { |
1799 | __v2du __rshift; |
1800 | __vector __bool long long __shmask; |
1801 | const __v2du __shmax = {64, 64}; |
1802 | __v2du __result; |
1803 | |
1804 | __rshift = vec_splat((__v2du)__B, 0); |
1805 | __shmask = vec_cmplt(__rshift, __shmax); |
1806 | __result = vec_sr((__v2du)__A, __rshift); |
1807 | __result = vec_sel((__v2du)__shmask, __result, __shmask); |
1808 | |
1809 | return (__m128i)__result; |
1810 | } |
1811 | #endif |
1812 | |
1813 | extern __inline __m128d |
1814 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1815 | _mm_and_pd(__m128d __A, __m128d __B) { |
1816 | return (vec_and((__v2df)__A, (__v2df)__B)); |
1817 | } |
1818 | |
1819 | extern __inline __m128d |
1820 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1821 | _mm_andnot_pd(__m128d __A, __m128d __B) { |
1822 | return (vec_andc((__v2df)__B, (__v2df)__A)); |
1823 | } |
1824 | |
1825 | extern __inline __m128d |
1826 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1827 | _mm_or_pd(__m128d __A, __m128d __B) { |
1828 | return (vec_or((__v2df)__A, (__v2df)__B)); |
1829 | } |
1830 | |
1831 | extern __inline __m128d |
1832 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1833 | _mm_xor_pd(__m128d __A, __m128d __B) { |
1834 | return (vec_xor((__v2df)__A, (__v2df)__B)); |
1835 | } |
1836 | |
1837 | extern __inline __m128i |
1838 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1839 | _mm_and_si128(__m128i __A, __m128i __B) { |
1840 | return (__m128i)vec_and((__v2di)__A, (__v2di)__B); |
1841 | } |
1842 | |
1843 | extern __inline __m128i |
1844 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1845 | _mm_andnot_si128(__m128i __A, __m128i __B) { |
1846 | return (__m128i)vec_andc((__v2di)__B, (__v2di)__A); |
1847 | } |
1848 | |
1849 | extern __inline __m128i |
1850 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1851 | _mm_or_si128(__m128i __A, __m128i __B) { |
1852 | return (__m128i)vec_or((__v2di)__A, (__v2di)__B); |
1853 | } |
1854 | |
1855 | extern __inline __m128i |
1856 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1857 | _mm_xor_si128(__m128i __A, __m128i __B) { |
1858 | return (__m128i)vec_xor((__v2di)__A, (__v2di)__B); |
1859 | } |
1860 | |
1861 | extern __inline __m128i |
1862 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1863 | _mm_cmpeq_epi8(__m128i __A, __m128i __B) { |
1864 | return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B); |
1865 | } |
1866 | |
1867 | extern __inline __m128i |
1868 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1869 | _mm_cmpeq_epi16(__m128i __A, __m128i __B) { |
1870 | return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B); |
1871 | } |
1872 | |
1873 | extern __inline __m128i |
1874 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1875 | _mm_cmpeq_epi32(__m128i __A, __m128i __B) { |
1876 | return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B); |
1877 | } |
1878 | |
1879 | extern __inline __m128i |
1880 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1881 | _mm_cmplt_epi8(__m128i __A, __m128i __B) { |
1882 | return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B); |
1883 | } |
1884 | |
1885 | extern __inline __m128i |
1886 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1887 | _mm_cmplt_epi16(__m128i __A, __m128i __B) { |
1888 | return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B); |
1889 | } |
1890 | |
1891 | extern __inline __m128i |
1892 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1893 | _mm_cmplt_epi32(__m128i __A, __m128i __B) { |
1894 | return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B); |
1895 | } |
1896 | |
1897 | extern __inline __m128i |
1898 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1899 | _mm_cmpgt_epi8(__m128i __A, __m128i __B) { |
1900 | return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B); |
1901 | } |
1902 | |
1903 | extern __inline __m128i |
1904 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1905 | _mm_cmpgt_epi16(__m128i __A, __m128i __B) { |
1906 | return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B); |
1907 | } |
1908 | |
1909 | extern __inline __m128i |
1910 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1911 | _mm_cmpgt_epi32(__m128i __A, __m128i __B) { |
1912 | return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B); |
1913 | } |
1914 | |
1915 | extern __inline int |
1916 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1917 | _mm_extract_epi16(__m128i const __A, int const __N) { |
1918 | return (unsigned short)((__v8hi)__A)[__N & 7]; |
1919 | } |
1920 | |
1921 | extern __inline __m128i |
1922 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1923 | _mm_insert_epi16(__m128i const __A, int const __D, int const __N) { |
1924 | __v8hi __result = (__v8hi)__A; |
1925 | |
1926 | __result[(__N & 7)] = __D; |
1927 | |
1928 | return (__m128i)__result; |
1929 | } |
1930 | |
1931 | extern __inline __m128i |
1932 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1933 | _mm_max_epi16(__m128i __A, __m128i __B) { |
1934 | return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B); |
1935 | } |
1936 | |
1937 | extern __inline __m128i |
1938 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1939 | _mm_max_epu8(__m128i __A, __m128i __B) { |
1940 | return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B); |
1941 | } |
1942 | |
1943 | extern __inline __m128i |
1944 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1945 | _mm_min_epi16(__m128i __A, __m128i __B) { |
1946 | return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B); |
1947 | } |
1948 | |
1949 | extern __inline __m128i |
1950 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1951 | _mm_min_epu8(__m128i __A, __m128i __B) { |
1952 | return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B); |
1953 | } |
1954 | |
1955 | #ifdef _ARCH_PWR8 |
1956 | /* Intrinsic functions that require PowerISA 2.07 minimum. */ |
1957 | |
1958 | /* Return a mask created from the most significant bit of each 8-bit |
1959 | element in A. */ |
1960 | extern __inline int |
1961 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1962 | _mm_movemask_epi8(__m128i __A) { |
1963 | #ifdef _ARCH_PWR10 |
1964 | return vec_extractm((__v16qu)__A); |
1965 | #else |
1966 | __vector unsigned long long __result; |
1967 | static const __vector unsigned char __perm_mask = { |
1968 | 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, |
1969 | 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00}; |
1970 | |
1971 | __result = ((__vector unsigned long long)vec_vbpermq( |
1972 | (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); |
1973 | |
1974 | #ifdef __LITTLE_ENDIAN__ |
1975 | return __result[1]; |
1976 | #else |
1977 | return __result[0]; |
1978 | #endif |
1979 | #endif /* !_ARCH_PWR10 */ |
1980 | } |
1981 | #endif /* _ARCH_PWR8 */ |
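
/* Illustrative usage sketch, not part of the original header: as the comment
   above says, _mm_movemask_epi8 packs the most significant bit of each of the
   16 byte elements into the low 16 bits of the result (PMOVMSKB semantics).
   The __example_* helper is hypothetical and uses the _mm_set1_epi8
   constructor defined earlier in this header; the block is guarded out so the
   header itself is unchanged.  */
#if 0
static __inline int __example_movemask_epi8(void) {
  __m128i __x = _mm_set1_epi8((char)0x80); /* every byte has its MSB set */
  return _mm_movemask_epi8(__x);           /* == 0xFFFF */
}
#endif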
1982 | |
1983 | extern __inline __m128i |
1984 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
1985 | _mm_mulhi_epu16(__m128i __A, __m128i __B) { |
1986 | __v4su __w0, __w1; |
1987 | __v16qu __xform1 = { |
1988 | #ifdef __LITTLE_ENDIAN__ |
1989 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, |
1990 | 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F |
1991 | #else |
1992 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08, |
1993 | 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D |
1994 | #endif |
1995 | }; |
1996 | |
1997 | __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B); |
1998 | __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B); |
1999 | return (__m128i)vec_perm(__w0, __w1, __xform1); |
2000 | } |
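
/* Illustrative usage sketch, not part of the original header: the even/odd
   halfword multiplies above produce full 32-bit products and the permute
   keeps only the high halfword of each, which is what PMULHUW returns.
   The __example_* helper is hypothetical and uses the _mm_set1_epi16
   constructor defined earlier in this header; the block is guarded out so
   the header itself is unchanged.  */
#if 0
static __inline __m128i __example_mulhi_epu16(void) {
  __m128i __a = _mm_set1_epi16((short)0xFFFF); /* 65535 in every lane */
  __m128i __b = _mm_set1_epi16(2);
  /* 65535 * 2 = 0x0001FFFE, so every result lane holds 0x0001.  */
  return _mm_mulhi_epu16(__a, __b);
}
#endif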
2001 | |
2002 | extern __inline __m128i |
2003 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2004 | _mm_shufflehi_epi16(__m128i __A, const int __mask) { |
2005 | unsigned long __element_selector_98 = __mask & 0x03; |
2006 | unsigned long __element_selector_BA = (__mask >> 2) & 0x03; |
2007 | unsigned long __element_selector_DC = (__mask >> 4) & 0x03; |
2008 | unsigned long __element_selector_FE = (__mask >> 6) & 0x03; |
2009 | static const unsigned short __permute_selectors[4] = { |
2010 | #ifdef __LITTLE_ENDIAN__ |
2011 | 0x0908, 0x0B0A, 0x0D0C, 0x0F0E |
2012 | #else |
2013 | 0x0809, 0x0A0B, 0x0C0D, 0x0E0F |
2014 | #endif |
2015 | }; |
2016 | __v2du __pmask = |
2017 | #ifdef __LITTLE_ENDIAN__ |
2018 | {0x1716151413121110UL, 0UL}; |
2019 | #else |
2020 | {0x1011121314151617UL, 0UL}; |
2021 | #endif |
2022 | __m64_union __t; |
2023 | __v2du __a, __r; |
2024 | |
2025 | __t.as_short[0] = __permute_selectors[__element_selector_98]; |
2026 | __t.as_short[1] = __permute_selectors[__element_selector_BA]; |
2027 | __t.as_short[2] = __permute_selectors[__element_selector_DC]; |
2028 | __t.as_short[3] = __permute_selectors[__element_selector_FE]; |
2029 | __pmask[1] = __t.as_m64; |
2030 | __a = (__v2du)__A; |
2031 | __r = vec_perm(__a, __a, (__vector unsigned char)__pmask); |
2032 | return (__m128i)__r; |
2033 | } |
2034 | |
2035 | extern __inline __m128i |
2036 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2037 | _mm_shufflelo_epi16(__m128i __A, const int __mask) { |
2038 | unsigned long __element_selector_10 = __mask & 0x03; |
2039 | unsigned long __element_selector_32 = (__mask >> 2) & 0x03; |
2040 | unsigned long __element_selector_54 = (__mask >> 4) & 0x03; |
2041 | unsigned long __element_selector_76 = (__mask >> 6) & 0x03; |
2042 | static const unsigned short __permute_selectors[4] = { |
2043 | #ifdef __LITTLE_ENDIAN__ |
2044 | 0x0100, 0x0302, 0x0504, 0x0706 |
2045 | #else |
2046 | 0x0001, 0x0203, 0x0405, 0x0607 |
2047 | #endif |
2048 | }; |
2049 | __v2du __pmask = |
2050 | #ifdef __LITTLE_ENDIAN__ |
2051 | {0UL, 0x1f1e1d1c1b1a1918UL}; |
2052 | #else |
2053 | {0UL, 0x18191a1b1c1d1e1fUL}; |
2054 | #endif |
2055 | __m64_union __t; |
2056 | __v2du __a, __r; |
2057 | __t.as_short[0] = __permute_selectors[__element_selector_10]; |
2058 | __t.as_short[1] = __permute_selectors[__element_selector_32]; |
2059 | __t.as_short[2] = __permute_selectors[__element_selector_54]; |
2060 | __t.as_short[3] = __permute_selectors[__element_selector_76]; |
2061 | __pmask[0] = __t.as_m64; |
2062 | __a = (__v2du)__A; |
2063 | __r = vec_perm(__a, __a, (__vector unsigned char)__pmask); |
2064 | return (__m128i)__r; |
2065 | } |
2066 | |
2067 | extern __inline __m128i |
2068 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2069 | _mm_shuffle_epi32(__m128i __A, const int __mask) { |
2070 | unsigned long __element_selector_10 = __mask & 0x03; |
2071 | unsigned long __element_selector_32 = (__mask >> 2) & 0x03; |
2072 | unsigned long __element_selector_54 = (__mask >> 4) & 0x03; |
2073 | unsigned long __element_selector_76 = (__mask >> 6) & 0x03; |
2074 | static const unsigned int __permute_selectors[4] = { |
2075 | #ifdef __LITTLE_ENDIAN__ |
2076 | 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C |
2077 | #else |
2078 | 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F |
2079 | #endif |
2080 | }; |
2081 | __v4su __t; |
2082 | |
2083 | __t[0] = __permute_selectors[__element_selector_10]; |
2084 | __t[1] = __permute_selectors[__element_selector_32]; |
2085 | __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; |
2086 | __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; |
2087 | return (__m128i)vec_perm((__v4si)__A, (__v4si)__A, |
2088 | (__vector unsigned char)__t); |
2089 | } |
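
/* Illustrative usage sketch, not part of the original header: each 2-bit
   field of __mask selects the source word for the corresponding result word,
   so a mask of 0x1B (binary 00 01 10 11) reverses the four 32-bit elements.
   The __example_* helper is hypothetical and uses the _mm_set_epi32
   constructor defined earlier in this header; the block is guarded out so
   the header itself is unchanged.  */
#if 0
static __inline __m128i __example_shuffle_epi32(void) {
  __m128i __x = _mm_set_epi32(3, 2, 1, 0); /* elements {0, 1, 2, 3} */
  return _mm_shuffle_epi32(__x, 0x1B);     /* elements {3, 2, 1, 0} */
}
#endif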
2090 | |
2091 | extern __inline void |
2092 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2093 | _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) { |
2094 | __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL}; |
2095 | __v16qu __mask, __tmp; |
2096 | __m128i_u *__p = (__m128i_u *)__C; |
2097 | |
2098 | __tmp = (__v16qu)_mm_loadu_si128(__p); |
2099 | __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit); |
2100 | __tmp = vec_sel(__tmp, (__v16qu)__A, __mask); |
2101 | _mm_storeu_si128(__p, (__m128i)__tmp); |
2102 | } |
2103 | |
2104 | extern __inline __m128i |
2105 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2106 | _mm_avg_epu8(__m128i __A, __m128i __B) { |
2107 | return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B); |
2108 | } |
2109 | |
2110 | extern __inline __m128i |
2111 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2112 | _mm_avg_epu16(__m128i __A, __m128i __B) { |
2113 | return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B); |
2114 | } |
2115 | |
2116 | extern __inline __m128i |
2117 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2118 | _mm_sad_epu8(__m128i __A, __m128i __B) { |
2119 | __v16qu __a, __b; |
2120 | __v16qu __vabsdiff; |
2121 | __v4si __vsum; |
2122 | const __v4su __zero = {0, 0, 0, 0}; |
2123 | __v4si __result; |
2124 | |
2125 | __a = (__v16qu)__A; |
2126 | __b = (__v16qu)__B; |
2127 | #ifndef _ARCH_PWR9 |
2128 | __v16qu __vmin = vec_min(__a, __b); |
2129 | __v16qu __vmax = vec_max(__a, __b); |
2130 | __vabsdiff = vec_sub(__vmax, __vmin); |
2131 | #else |
2132 | __vabsdiff = vec_absd(__a, __b); |
2133 | #endif |
2134 | /* Sum four groups of bytes into integers. */ |
2135 | __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero); |
2136 | #ifdef __LITTLE_ENDIAN__ |
2137 | /* Sum across four integers with two integer results. */ |
2138 | __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero)); |
2139 | /* Note: vec_sum2s could be used here, but on little-endian it adds
2140 | vector shifts that are not needed for this use case.  vec_sum2s would
2141 | place the 32-bit results (currently at [0] and [2]) into [1] and [3],
2142 | and they would then have to be shifted back again, since the desired
2143 | results are two 64-bit integers ([1]|[0] and [3]|[2]).  Thus, no
2144 | shift is performed. */
2145 | #else |
2146 | /* Sum across four integers with two integer results. */ |
2147 | __result = vec_sum2s(__vsum, (__vector signed int)__zero); |
2148 | /* Rotate the sums into the correct position. */ |
2149 | __result = vec_sld(__result, __result, 6); |
2150 | #endif |
2151 | return (__m128i)__result; |
2152 | } |
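
/* Illustrative usage sketch, not part of the original header: _mm_sad_epu8
   sums the absolute differences of the 8 byte pairs in each 64-bit half and
   returns the two sums in the low 16 bits of each 64-bit lane.
   The __example_* helper is hypothetical and uses the _mm_set1_epi8
   constructor defined earlier in this header; the block is guarded out so
   the header itself is unchanged.  */
#if 0
static __inline __m128i __example_sad_epu8(void) {
  __m128i __a = _mm_set1_epi8(3);
  __m128i __b = _mm_set1_epi8(1);
  /* |3 - 1| summed over 8 bytes per half gives 16 in each 64-bit lane.  */
  return _mm_sad_epu8(__a, __b);
}
#endif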
2153 | |
2154 | extern __inline void |
2155 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2156 | _mm_stream_si32(int *__A, int __B) { |
2157 | /* Use the data cache block touch for store transient. */ |
2158 | __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory"); |
2159 | *__A = __B; |
2160 | } |
2161 | |
2162 | extern __inline void |
2163 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2164 | _mm_stream_si64(long long int *__A, long long int __B) { |
2165 | /* Use the data cache block touch for store transient. */ |
2166 | __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2167 | *__A = __B; |
2168 | } |
2169 | |
2170 | extern __inline void |
2171 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2172 | _mm_stream_si128(__m128i *__A, __m128i __B) { |
2173 | /* Use the data cache block touch for store transient. */ |
2174 | __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory"); |
2175 | *__A = __B; |
2176 | } |
2177 | |
2178 | extern __inline void |
2179 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2180 | _mm_stream_pd(double *__A, __m128d __B) { |
2181 | /* Use the data cache block touch for store transient. */ |
2182 | __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory"); |
2183 | *(__m128d *)__A = __B; |
2184 | } |
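
/* Illustrative usage sketch, not part of the original header: on POWER the
   stream stores above are ordinary stores preceded by a dcbtstt cache hint,
   so they can be used exactly like the x86 non-temporal forms, e.g. to copy
   a large buffer while limiting cache pollution.  The __example_* helper is
   hypothetical; the block is guarded out so the header itself is unchanged. */
#if 0
static __inline void __example_stream_copy(__m128i *__dst,
                                           const __m128i *__src,
                                           unsigned long __n) {
  unsigned long __i;
  for (__i = 0; __i < __n; ++__i)
    _mm_stream_si128(__dst + __i, __src[__i]);
}
#endif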
2185 | |
2186 | extern __inline void |
2187 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2188 | _mm_clflush(void const *__A) { |
2189 | /* Use the data cache block flush. */ |
2190 | __asm__("dcbf 0,%0" : : "b"(__A) : "memory"); |
2191 | } |
2192 | |
2193 | extern __inline void |
2194 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2195 | _mm_lfence(void) { |
2196 | /* Use light weight sync for load to load ordering. */ |
2197 | __atomic_thread_fence(__ATOMIC_RELEASE); |
2198 | } |
2199 | |
2200 | extern __inline void |
2201 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2202 | _mm_mfence(void) { |
2203 | /* Use heavy weight sync for any to any ordering. */ |
2204 | __atomic_thread_fence(__ATOMIC_SEQ_CST); |
2205 | } |
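
/* Illustrative usage sketch, not part of the original header: because the
   fences above expand to __atomic_thread_fence, they order ordinary loads
   and stores just as the x86 originals do.  A typical pattern is writing a
   payload, fencing, and then setting a flag that another thread polls.
   The __example_* helper is hypothetical; the block is guarded out so the
   header itself is unchanged.  */
#if 0
static __inline void __example_publish(int *__data, volatile int *__flag) {
  *__data = 42; /* write the payload */
  _mm_mfence(); /* make the payload visible before the flag */
  *__flag = 1;  /* publish */
}
#endif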
2206 | |
2207 | extern __inline __m128i |
2208 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2209 | _mm_cvtsi32_si128(int __A) { |
2210 | return _mm_set_epi32(0, 0, 0, __A); |
2211 | } |
2212 | |
2213 | extern __inline __m128i |
2214 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2215 | _mm_cvtsi64_si128(long long __A) { |
2216 | return __extension__(__m128i)(__v2di){__A, 0LL}; |
2217 | } |
2218 | |
2219 | /* Microsoft intrinsic. */ |
2220 | extern __inline __m128i |
2221 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2222 | _mm_cvtsi64x_si128(long long __A) { |
2223 | return __extension__(__m128i)(__v2di){__A, 0LL}; |
2224 | } |
2225 | |
2226 | /* Casts between various SP, DP, INT vector types. Note that these do no |
2227 | conversion of values, they just change the type. */ |
2228 | extern __inline __m128 |
2229 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2230 | _mm_castpd_ps(__m128d __A) { |
2231 | return (__m128)__A; |
2232 | } |
2233 | |
2234 | extern __inline __m128i |
2235 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2236 | _mm_castpd_si128(__m128d __A) { |
2237 | return (__m128i)__A; |
2238 | } |
2239 | |
2240 | extern __inline __m128d |
2241 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2242 | _mm_castps_pd(__m128 __A) { |
2243 | return (__m128d)__A; |
2244 | } |
2245 | |
2246 | extern __inline __m128i |
2247 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2248 | _mm_castps_si128(__m128 __A) { |
2249 | return (__m128i)__A; |
2250 | } |
2251 | |
2252 | extern __inline __m128 |
2253 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2254 | _mm_castsi128_ps(__m128i __A) { |
2255 | return (__m128)__A; |
2256 | } |
2257 | |
2258 | extern __inline __m128d |
2259 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
2260 | _mm_castsi128_pd(__m128i __A) { |
2261 | return (__m128d)__A; |
2262 | } |
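
/* Illustrative usage sketch, not part of the original header: the casts above
   only reinterpret the 128-bit pattern, so they let integer bit operations be
   applied to floating-point data, e.g. clearing the sign bits of two doubles.
   The __example_* helper is hypothetical and uses the _mm_set1_epi64x
   constructor defined earlier in this header; the block is guarded out so the
   header itself is unchanged.  */
#if 0
static __inline __m128d __example_fabs_pd(__m128d __x) {
  /* Mask off bit 63 (the sign bit) of each double.  */
  const __m128i __mask = _mm_set1_epi64x(0x7fffffffffffffffLL);
  return _mm_castsi128_pd(_mm_and_si128(__mask, _mm_castpd_si128(__x)));
}
#endif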
2263 | |
2264 | #else |
2265 | #include_next <emmintrin.h> |
2266 | #endif /* defined(__powerpc64__) && \ |
2267 | * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ |
2268 | |
2269 | #endif /* EMMINTRIN_H_ */ |
2270 |