emmintrin.h source code [clang/lib/Headers/emmintrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9
10	#ifndef __EMMINTRIN_H
11	#define __EMMINTRIN_H
12
13	#if !defined(__i386__) && !defined(__x86_64__)
14	#error "This header is only meant to be used on x86 and x64 architecture"
15	#endif
16
17	#include <xmmintrin.h>
18
/* 128-bit vector of [2 x double]: the 16-byte-aligned form, followed by the
 * form whose alignment is 1. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));

/* 128-bit vector of [2 x long long]: the 16-byte-aligned form, followed by
 * the form whose alignment is 1. */
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));
25
/* Element-typed 128-bit vector types, grouped by lane width. */

/* Two 64-bit lanes: double, signed integer, unsigned integer. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));

/* Eight 16-bit lanes: signed, unsigned. */
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));

/* Sixteen 8-bit lanes: plain char, unsigned char. */
typedef char __v16qi __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* An explicitly signed char variant is needed as well. Note that it shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
41	#ifdef __SSE2__
42	/* Both _Float16 and __bf16 require SSE2 being enabled. */
43	typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44	typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45	typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
47	typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48	typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49	#endif
50
/* Default attribute sets for the functions in this file. Both request
 * always-inline and no-debug; the plain set targets "sse2,no-evex512" with a
 * minimum vector width of 128, while the _MMX set adds "mmx" to the target
 * list and uses a minimum vector width of 64. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
58
59	/// Adds lower double-precision values in both operands and returns the
60	/// sum in the lower 64 bits of the result. The upper 64 bits of the result
61	/// are copied from the upper double-precision value of the first operand.
62	///
63	/// \headerfile <x86intrin.h>
64	///
65	/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66	///
67	/// \param __a
68	/// A 128-bit vector of [2 x double] containing one of the source operands.
69	/// \param __b
70	/// A 128-bit vector of [2 x double] containing one of the source operands.
71	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72	/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73	/// from the upper 64 bits of the first source operand.
74	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75	__m128d __b) {
76	__a[0] += __b[0];
77	return __a;
78	}
79
80	/// Adds two 128-bit vectors of [2 x double].
81	///
82	/// \headerfile <x86intrin.h>
83	///
84	/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85	///
86	/// \param __a
87	/// A 128-bit vector of [2 x double] containing one of the source operands.
88	/// \param __b
89	/// A 128-bit vector of [2 x double] containing one of the source operands.
90	/// \returns A 128-bit vector of [2 x double] containing the sums of both
91	/// operands.
92	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93	__m128d __b) {
94	return (__m128d)((__v2df)__a + (__v2df)__b);
95	}
96
97	/// Subtracts the lower double-precision value of the second operand
98	/// from the lower double-precision value of the first operand and returns
99	/// the difference in the lower 64 bits of the result. The upper 64 bits of
100	/// the result are copied from the upper double-precision value of the first
101	/// operand.
102	///
103	/// \headerfile <x86intrin.h>
104	///
105	/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106	///
107	/// \param __a
108	/// A 128-bit vector of [2 x double] containing the minuend.
109	/// \param __b
110	/// A 128-bit vector of [2 x double] containing the subtrahend.
111	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112	/// difference of the lower 64 bits of both operands. The upper 64 bits are
113	/// copied from the upper 64 bits of the first source operand.
114	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115	__m128d __b) {
116	__a[0] -= __b[0];
117	return __a;
118	}
119
120	/// Subtracts two 128-bit vectors of [2 x double].
121	///
122	/// \headerfile <x86intrin.h>
123	///
124	/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125	///
126	/// \param __a
127	/// A 128-bit vector of [2 x double] containing the minuend.
128	/// \param __b
129	/// A 128-bit vector of [2 x double] containing the subtrahend.
130	/// \returns A 128-bit vector of [2 x double] containing the differences between
131	/// both operands.
132	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133	__m128d __b) {
134	return (__m128d)((__v2df)__a - (__v2df)__b);
135	}
136
137	/// Multiplies lower double-precision values in both operands and returns
138	/// the product in the lower 64 bits of the result. The upper 64 bits of the
139	/// result are copied from the upper double-precision value of the first
140	/// operand.
141	///
142	/// \headerfile <x86intrin.h>
143	///
144	/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145	///
146	/// \param __a
147	/// A 128-bit vector of [2 x double] containing one of the source operands.
148	/// \param __b
149	/// A 128-bit vector of [2 x double] containing one of the source operands.
150	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151	/// product of the lower 64 bits of both operands. The upper 64 bits are
152	/// copied from the upper 64 bits of the first source operand.
153	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154	__m128d __b) {
155	__a[0] *= __b[0];
156	return __a;
157	}
158
159	/// Multiplies two 128-bit vectors of [2 x double].
160	///
161	/// \headerfile <x86intrin.h>
162	///
163	/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164	///
165	/// \param __a
166	/// A 128-bit vector of [2 x double] containing one of the operands.
167	/// \param __b
168	/// A 128-bit vector of [2 x double] containing one of the operands.
169	/// \returns A 128-bit vector of [2 x double] containing the products of both
170	/// operands.
171	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172	__m128d __b) {
173	return (__m128d)((__v2df)__a * (__v2df)__b);
174	}
175
176	/// Divides the lower double-precision value of the first operand by the
177	/// lower double-precision value of the second operand and returns the
178	/// quotient in the lower 64 bits of the result. The upper 64 bits of the
179	/// result are copied from the upper double-precision value of the first
180	/// operand.
181	///
182	/// \headerfile <x86intrin.h>
183	///
184	/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185	///
186	/// \param __a
187	/// A 128-bit vector of [2 x double] containing the dividend.
188	/// \param __b
189	/// A 128-bit vector of [2 x double] containing divisor.
190	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191	/// quotient of the lower 64 bits of both operands. The upper 64 bits are
192	/// copied from the upper 64 bits of the first source operand.
193	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194	__m128d __b) {
195	__a[0] /= __b[0];
196	return __a;
197	}
198
199	/// Performs an element-by-element division of two 128-bit vectors of
200	/// [2 x double].
201	///
202	/// \headerfile <x86intrin.h>
203	///
204	/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205	///
206	/// \param __a
207	/// A 128-bit vector of [2 x double] containing the dividend.
208	/// \param __b
209	/// A 128-bit vector of [2 x double] containing the divisor.
210	/// \returns A 128-bit vector of [2 x double] containing the quotients of both
211	/// operands.
212	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213	__m128d __b) {
214	return (__m128d)((__v2df)__a / (__v2df)__b);
215	}
216
217	/// Calculates the square root of the lower double-precision value of
218	/// the second operand and returns it in the lower 64 bits of the result.
219	/// The upper 64 bits of the result are copied from the upper
220	/// double-precision value of the first operand.
221	///
222	/// \headerfile <x86intrin.h>
223	///
224	/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225	///
226	/// \param __a
227	/// A 128-bit vector of [2 x double] containing one of the operands. The
228	/// upper 64 bits of this operand are copied to the upper 64 bits of the
229	/// result.
230	/// \param __b
231	/// A 128-bit vector of [2 x double] containing one of the operands. The
232	/// square root is calculated using the lower 64 bits of this operand.
233	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234	/// square root of the lower 64 bits of operand \a __b, and whose upper 64
235	/// bits are copied from the upper 64 bits of operand \a __a.
236	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237	__m128d __b) {
238	__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239	return __extension__(__m128d){__c[0], __a[1]};
240	}
241
242	/// Calculates the square root of the each of two values stored in a
243	/// 128-bit vector of [2 x double].
244	///
245	/// \headerfile <x86intrin.h>
246	///
247	/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248	///
249	/// \param __a
250	/// A 128-bit vector of [2 x double].
251	/// \returns A 128-bit vector of [2 x double] containing the square roots of the
252	/// values in the operand.
253	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254	return __builtin_ia32_sqrtpd((__v2df)__a);
255	}
256
257	/// Compares lower 64-bit double-precision values of both operands, and
258	/// returns the lesser of the pair of values in the lower 64-bits of the
259	/// result. The upper 64 bits of the result are copied from the upper
260	/// double-precision value of the first operand.
261	///
262	/// \headerfile <x86intrin.h>
263	///
264	/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265	///
266	/// \param __a
267	/// A 128-bit vector of [2 x double] containing one of the operands. The
268	/// lower 64 bits of this operand are used in the comparison.
269	/// \param __b
270	/// A 128-bit vector of [2 x double] containing one of the operands. The
271	/// lower 64 bits of this operand are used in the comparison.
272	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273	/// minimum value between both operands. The upper 64 bits are copied from
274	/// the upper 64 bits of the first source operand.
275	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
276	__m128d __b) {
277	return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
278	}
279
280	/// Performs element-by-element comparison of the two 128-bit vectors of
281	/// [2 x double] and returns the vector containing the lesser of each pair of
282	/// values.
283	///
284	/// \headerfile <x86intrin.h>
285	///
286	/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287	///
288	/// \param __a
289	/// A 128-bit vector of [2 x double] containing one of the operands.
290	/// \param __b
291	/// A 128-bit vector of [2 x double] containing one of the operands.
292	/// \returns A 128-bit vector of [2 x double] containing the minimum values
293	/// between both operands.
294	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
295	__m128d __b) {
296	return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
297	}
298
299	/// Compares lower 64-bit double-precision values of both operands, and
300	/// returns the greater of the pair of values in the lower 64-bits of the
301	/// result. The upper 64 bits of the result are copied from the upper
302	/// double-precision value of the first operand.
303	///
304	/// \headerfile <x86intrin.h>
305	///
306	/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307	///
308	/// \param __a
309	/// A 128-bit vector of [2 x double] containing one of the operands. The
310	/// lower 64 bits of this operand are used in the comparison.
311	/// \param __b
312	/// A 128-bit vector of [2 x double] containing one of the operands. The
313	/// lower 64 bits of this operand are used in the comparison.
314	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315	/// maximum value between both operands. The upper 64 bits are copied from
316	/// the upper 64 bits of the first source operand.
317	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
318	__m128d __b) {
319	return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
320	}
321
322	/// Performs element-by-element comparison of the two 128-bit vectors of
323	/// [2 x double] and returns the vector containing the greater of each pair
324	/// of values.
325	///
326	/// \headerfile <x86intrin.h>
327	///
328	/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329	///
330	/// \param __a
331	/// A 128-bit vector of [2 x double] containing one of the operands.
332	/// \param __b
333	/// A 128-bit vector of [2 x double] containing one of the operands.
334	/// \returns A 128-bit vector of [2 x double] containing the maximum values
335	/// between both operands.
336	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
337	__m128d __b) {
338	return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339	}
340
341	/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342	///
343	/// \headerfile <x86intrin.h>
344	///
345	/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346	///
347	/// \param __a
348	/// A 128-bit vector of [2 x double] containing one of the source operands.
349	/// \param __b
350	/// A 128-bit vector of [2 x double] containing one of the source operands.
351	/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352	/// values between both operands.
353	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
354	__m128d __b) {
355	return (__m128d)((__v2du)__a & (__v2du)__b);
356	}
357
358	/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359	/// the one's complement of the values contained in the first source operand.
360	///
361	/// \headerfile <x86intrin.h>
362	///
363	/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364	///
365	/// \param __a
366	/// A 128-bit vector of [2 x double] containing the left source operand. The
367	/// one's complement of this value is used in the bitwise AND.
368	/// \param __b
369	/// A 128-bit vector of [2 x double] containing the right source operand.
370	/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371	/// values in the second operand and the one's complement of the first
372	/// operand.
373	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
374	__m128d __b) {
375	return (__m128d)(~(__v2du)__a & (__v2du)__b);
376	}
377
378	/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379	///
380	/// \headerfile <x86intrin.h>
381	///
382	/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383	///
384	/// \param __a
385	/// A 128-bit vector of [2 x double] containing one of the source operands.
386	/// \param __b
387	/// A 128-bit vector of [2 x double] containing one of the source operands.
388	/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389	/// values between both operands.
390	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
391	__m128d __b) {
392	return (__m128d)((__v2du)__a \| (__v2du)__b);
393	}
394
395	/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396	///
397	/// \headerfile <x86intrin.h>
398	///
399	/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400	///
401	/// \param __a
402	/// A 128-bit vector of [2 x double] containing one of the source operands.
403	/// \param __b
404	/// A 128-bit vector of [2 x double] containing one of the source operands.
405	/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406	/// values between both operands.
407	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
408	__m128d __b) {
409	return (__m128d)((__v2du)__a ^ (__v2du)__b);
410	}
411
412	/// Compares each of the corresponding double-precision values of the
413	/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414	/// for false, 0xFFFFFFFFFFFFFFFF for true.
415	///
416	/// \headerfile <x86intrin.h>
417	///
418	/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
419	///
420	/// \param __a
421	/// A 128-bit vector of [2 x double].
422	/// \param __b
423	/// A 128-bit vector of [2 x double].
424	/// \returns A 128-bit vector containing the comparison results.
425	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
426	__m128d __b) {
427	return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
428	}
429
430	/// Compares each of the corresponding double-precision values of the
431	/// 128-bit vectors of [2 x double] to determine if the values in the first
432	/// operand are less than those in the second operand. Each comparison
433	/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
434	///
435	/// \headerfile <x86intrin.h>
436	///
437	/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
438	///
439	/// \param __a
440	/// A 128-bit vector of [2 x double].
441	/// \param __b
442	/// A 128-bit vector of [2 x double].
443	/// \returns A 128-bit vector containing the comparison results.
444	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
445	__m128d __b) {
446	return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
447	}
448
449	/// Compares each of the corresponding double-precision values of the
450	/// 128-bit vectors of [2 x double] to determine if the values in the first
451	/// operand are less than or equal to those in the second operand.
452	///
453	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454	///
455	/// \headerfile <x86intrin.h>
456	///
457	/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
458	///
459	/// \param __a
460	/// A 128-bit vector of [2 x double].
461	/// \param __b
462	/// A 128-bit vector of [2 x double].
463	/// \returns A 128-bit vector containing the comparison results.
464	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
465	__m128d __b) {
466	return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
467	}
468
469	/// Compares each of the corresponding double-precision values of the
470	/// 128-bit vectors of [2 x double] to determine if the values in the first
471	/// operand are greater than those in the second operand.
472	///
473	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
474	///
475	/// \headerfile <x86intrin.h>
476	///
477	/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
478	///
479	/// \param __a
480	/// A 128-bit vector of [2 x double].
481	/// \param __b
482	/// A 128-bit vector of [2 x double].
483	/// \returns A 128-bit vector containing the comparison results.
484	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
485	__m128d __b) {
486	return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
487	}
488
489	/// Compares each of the corresponding double-precision values of the
490	/// 128-bit vectors of [2 x double] to determine if the values in the first
491	/// operand are greater than or equal to those in the second operand.
492	///
493	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
494	///
495	/// \headerfile <x86intrin.h>
496	///
497	/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
498	///
499	/// \param __a
500	/// A 128-bit vector of [2 x double].
501	/// \param __b
502	/// A 128-bit vector of [2 x double].
503	/// \returns A 128-bit vector containing the comparison results.
504	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
505	__m128d __b) {
506	return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
507	}
508
509	/// Compares each of the corresponding double-precision values of the
510	/// 128-bit vectors of [2 x double] to determine if the values in the first
511	/// operand are ordered with respect to those in the second operand.
512	///
513	/// A pair of double-precision values are "ordered" with respect to each
514	/// other if neither value is a NaN. Each comparison yields 0x0 for false,
515	/// 0xFFFFFFFFFFFFFFFF for true.
516	///
517	/// \headerfile <x86intrin.h>
518	///
519	/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
520	///
521	/// \param __a
522	/// A 128-bit vector of [2 x double].
523	/// \param __b
524	/// A 128-bit vector of [2 x double].
525	/// \returns A 128-bit vector containing the comparison results.
526	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
527	__m128d __b) {
528	return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
529	}
530
531	/// Compares each of the corresponding double-precision values of the
532	/// 128-bit vectors of [2 x double] to determine if the values in the first
533	/// operand are unordered with respect to those in the second operand.
534	///
535	/// A pair of double-precision values are "unordered" with respect to each
536	/// other if one or both values are NaN. Each comparison yields 0x0 for
537	/// false, 0xFFFFFFFFFFFFFFFF for true.
538	///
539	/// \headerfile <x86intrin.h>
540	///
541	/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
542	/// instruction.
543	///
544	/// \param __a
545	/// A 128-bit vector of [2 x double].
546	/// \param __b
547	/// A 128-bit vector of [2 x double].
548	/// \returns A 128-bit vector containing the comparison results.
549	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
550	__m128d __b) {
551	return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
552	}
553
554	/// Compares each of the corresponding double-precision values of the
555	/// 128-bit vectors of [2 x double] to determine if the values in the first
556	/// operand are unequal to those in the second operand.
557	///
558	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
559	///
560	/// \headerfile <x86intrin.h>
561	///
562	/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
563	///
564	/// \param __a
565	/// A 128-bit vector of [2 x double].
566	/// \param __b
567	/// A 128-bit vector of [2 x double].
568	/// \returns A 128-bit vector containing the comparison results.
569	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
570	__m128d __b) {
571	return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
572	}
573
574	/// Compares each of the corresponding double-precision values of the
575	/// 128-bit vectors of [2 x double] to determine if the values in the first
576	/// operand are not less than those in the second operand.
577	///
578	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
579	///
580	/// \headerfile <x86intrin.h>
581	///
582	/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
583	///
584	/// \param __a
585	/// A 128-bit vector of [2 x double].
586	/// \param __b
587	/// A 128-bit vector of [2 x double].
588	/// \returns A 128-bit vector containing the comparison results.
589	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
590	__m128d __b) {
591	return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
592	}
593
594	/// Compares each of the corresponding double-precision values of the
595	/// 128-bit vectors of [2 x double] to determine if the values in the first
596	/// operand are not less than or equal to those in the second operand.
597	///
598	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
599	///
600	/// \headerfile <x86intrin.h>
601	///
602	/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
603	///
604	/// \param __a
605	/// A 128-bit vector of [2 x double].
606	/// \param __b
607	/// A 128-bit vector of [2 x double].
608	/// \returns A 128-bit vector containing the comparison results.
609	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
610	__m128d __b) {
611	return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
612	}
613
614	/// Compares each of the corresponding double-precision values of the
615	/// 128-bit vectors of [2 x double] to determine if the values in the first
616	/// operand are not greater than those in the second operand.
617	///
618	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619	///
620	/// \headerfile <x86intrin.h>
621	///
622	/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
623	///
624	/// \param __a
625	/// A 128-bit vector of [2 x double].
626	/// \param __b
627	/// A 128-bit vector of [2 x double].
628	/// \returns A 128-bit vector containing the comparison results.
629	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
630	__m128d __b) {
631	return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
632	}
633
634	/// Compares each of the corresponding double-precision values of the
635	/// 128-bit vectors of [2 x double] to determine if the values in the first
636	/// operand are not greater than or equal to those in the second operand.
637	///
638	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
639	///
640	/// \headerfile <x86intrin.h>
641	///
642	/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
643	///
644	/// \param __a
645	/// A 128-bit vector of [2 x double].
646	/// \param __b
647	/// A 128-bit vector of [2 x double].
648	/// \returns A 128-bit vector containing the comparison results.
649	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
650	__m128d __b) {
651	return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
652	}
653
654	/// Compares the lower double-precision floating-point values in each of
655	/// the two 128-bit floating-point vectors of [2 x double] for equality.
656	///
657	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658	///
659	/// \headerfile <x86intrin.h>
660	///
661	/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
662	///
663	/// \param __a
664	/// A 128-bit vector of [2 x double]. The lower double-precision value is
665	/// compared to the lower double-precision value of \a __b.
666	/// \param __b
667	/// A 128-bit vector of [2 x double]. The lower double-precision value is
668	/// compared to the lower double-precision value of \a __a.
669	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
670	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
672	__m128d __b) {
673	return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
674	}
675
676	/// Compares the lower double-precision floating-point values in each of
677	/// the two 128-bit floating-point vectors of [2 x double] to determine if
678	/// the value in the first parameter is less than the corresponding value in
679	/// the second parameter.
680	///
681	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
682	///
683	/// \headerfile <x86intrin.h>
684	///
685	/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
686	///
687	/// \param __a
688	/// A 128-bit vector of [2 x double]. The lower double-precision value is
689	/// compared to the lower double-precision value of \a __b.
690	/// \param __b
691	/// A 128-bit vector of [2 x double]. The lower double-precision value is
692	/// compared to the lower double-precision value of \a __a.
693	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
694	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
696	__m128d __b) {
697	return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
698	}
699
700	/// Compares the lower double-precision floating-point values in each of
701	/// the two 128-bit floating-point vectors of [2 x double] to determine if
702	/// the value in the first parameter is less than or equal to the
703	/// corresponding value in the second parameter.
704	///
705	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706	///
707	/// \headerfile <x86intrin.h>
708	///
709	/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
710	///
711	/// \param __a
712	/// A 128-bit vector of [2 x double]. The lower double-precision value is
713	/// compared to the lower double-precision value of \a __b.
714	/// \param __b
715	/// A 128-bit vector of [2 x double]. The lower double-precision value is
716	/// compared to the lower double-precision value of \a __a.
717	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
718	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
720	__m128d __b) {
721	return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
722	}
723
724	/// Compares the lower double-precision floating-point values in each of
725	/// the two 128-bit floating-point vectors of [2 x double] to determine if
726	/// the value in the first parameter is greater than the corresponding value
727	/// in the second parameter.
728	///
729	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
730	///
731	/// \headerfile <x86intrin.h>
732	///
733	/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
734	///
735	/// \param __a
736	/// A 128-bit vector of [2 x double]. The lower double-precision value is
737	/// compared to the lower double-precision value of \a __b.
738	/// \param __b
739	/// A 128-bit vector of [2 x double]. The lower double-precision value is
740	/// compared to the lower double-precision value of \a __a.
741	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
742	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
744	__m128d __b) {
745	__m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
746	return __extension__(__m128d){__c[0], __a[1]};
747	}
748
749	/// Compares the lower double-precision floating-point values in each of
750	/// the two 128-bit floating-point vectors of [2 x double] to determine if
751	/// the value in the first parameter is greater than or equal to the
752	/// corresponding value in the second parameter.
753	///
754	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
755	///
756	/// \headerfile <x86intrin.h>
757	///
758	/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
759	///
760	/// \param __a
761	/// A 128-bit vector of [2 x double]. The lower double-precision value is
762	/// compared to the lower double-precision value of \a __b.
763	/// \param __b
764	/// A 128-bit vector of [2 x double]. The lower double-precision value is
765	/// compared to the lower double-precision value of \a __a.
766	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
767	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
769	__m128d __b) {
770	__m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
771	return __extension__(__m128d){__c[0], __a[1]};
772	}
773
774	/// Compares the lower double-precision floating-point values in each of
775	/// the two 128-bit floating-point vectors of [2 x double] to determine if
776	/// the value in the first parameter is "ordered" with respect to the
777	/// corresponding value in the second parameter.
778	///
779	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780	/// of double-precision values are "ordered" with respect to each other if
781	/// neither value is a NaN.
782	///
783	/// \headerfile <x86intrin.h>
784	///
785	/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
786	///
787	/// \param __a
788	/// A 128-bit vector of [2 x double]. The lower double-precision value is
789	/// compared to the lower double-precision value of \a __b.
790	/// \param __b
791	/// A 128-bit vector of [2 x double]. The lower double-precision value is
792	/// compared to the lower double-precision value of \a __a.
793	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
794	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
796	__m128d __b) {
797	return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
798	}
799
800	/// Compares the lower double-precision floating-point values in each of
801	/// the two 128-bit floating-point vectors of [2 x double] to determine if
802	/// the value in the first parameter is "unordered" with respect to the
803	/// corresponding value in the second parameter.
804	///
805	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806	/// of double-precision values are "unordered" with respect to each other if
807	/// one or both values are NaN.
808	///
809	/// \headerfile <x86intrin.h>
810	///
811	/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
812	/// instruction.
813	///
814	/// \param __a
815	/// A 128-bit vector of [2 x double]. The lower double-precision value is
816	/// compared to the lower double-precision value of \a __b.
817	/// \param __b
818	/// A 128-bit vector of [2 x double]. The lower double-precision value is
819	/// compared to the lower double-precision value of \a __a.
820	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
821	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
823	__m128d __b) {
824	return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
825	}
826
827	/// Compares the lower double-precision floating-point values in each of
828	/// the two 128-bit floating-point vectors of [2 x double] to determine if
829	/// the value in the first parameter is unequal to the corresponding value in
830	/// the second parameter.
831	///
832	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
833	///
834	/// \headerfile <x86intrin.h>
835	///
836	/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
837	///
838	/// \param __a
839	/// A 128-bit vector of [2 x double]. The lower double-precision value is
840	/// compared to the lower double-precision value of \a __b.
841	/// \param __b
842	/// A 128-bit vector of [2 x double]. The lower double-precision value is
843	/// compared to the lower double-precision value of \a __a.
844	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
845	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
847	__m128d __b) {
848	return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
849	}
850
851	/// Compares the lower double-precision floating-point values in each of
852	/// the two 128-bit floating-point vectors of [2 x double] to determine if
853	/// the value in the first parameter is not less than the corresponding
854	/// value in the second parameter.
855	///
856	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
857	///
858	/// \headerfile <x86intrin.h>
859	///
860	/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
861	///
862	/// \param __a
863	/// A 128-bit vector of [2 x double]. The lower double-precision value is
864	/// compared to the lower double-precision value of \a __b.
865	/// \param __b
866	/// A 128-bit vector of [2 x double]. The lower double-precision value is
867	/// compared to the lower double-precision value of \a __a.
868	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
869	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
871	__m128d __b) {
872	return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
873	}
874
875	/// Compares the lower double-precision floating-point values in each of
876	/// the two 128-bit floating-point vectors of [2 x double] to determine if
877	/// the value in the first parameter is not less than or equal to the
878	/// corresponding value in the second parameter.
879	///
880	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881	///
882	/// \headerfile <x86intrin.h>
883	///
884	/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
885	///
886	/// \param __a
887	/// A 128-bit vector of [2 x double]. The lower double-precision value is
888	/// compared to the lower double-precision value of \a __b.
889	/// \param __b
890	/// A 128-bit vector of [2 x double]. The lower double-precision value is
891	/// compared to the lower double-precision value of \a __a.
892	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
893	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
895	__m128d __b) {
896	return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
897	}
898
899	/// Compares the lower double-precision floating-point values in each of
900	/// the two 128-bit floating-point vectors of [2 x double] to determine if
901	/// the value in the first parameter is not greater than the corresponding
902	/// value in the second parameter.
903	///
904	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
905	///
906	/// \headerfile <x86intrin.h>
907	///
908	/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
909	///
910	/// \param __a
911	/// A 128-bit vector of [2 x double]. The lower double-precision value is
912	/// compared to the lower double-precision value of \a __b.
913	/// \param __b
914	/// A 128-bit vector of [2 x double]. The lower double-precision value is
915	/// compared to the lower double-precision value of \a __a.
916	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
917	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
919	__m128d __b) {
920	__m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
921	return __extension__(__m128d){__c[0], __a[1]};
922	}
923
924	/// Compares the lower double-precision floating-point values in each of
925	/// the two 128-bit floating-point vectors of [2 x double] to determine if
926	/// the value in the first parameter is not greater than or equal to the
927	/// corresponding value in the second parameter.
928	///
929	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
930	///
931	/// \headerfile <x86intrin.h>
932	///
933	/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
934	///
935	/// \param __a
936	/// A 128-bit vector of [2 x double]. The lower double-precision value is
937	/// compared to the lower double-precision value of \a __b.
938	/// \param __b
939	/// A 128-bit vector of [2 x double]. The lower double-precision value is
940	/// compared to the lower double-precision value of \a __a.
941	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
942	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
944	__m128d __b) {
945	__m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
946	return __extension__(__m128d){__c[0], __a[1]};
947	}
948
949	/// Compares the lower double-precision floating-point values in each of
950	/// the two 128-bit floating-point vectors of [2 x double] for equality.
951	///
952	/// The comparison yields 0 for false, 1 for true. If either of the two
953	/// lower double-precision values is NaN, 0 is returned.
954	///
955	/// \headerfile <x86intrin.h>
956	///
957	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
958	///
959	/// \param __a
960	/// A 128-bit vector of [2 x double]. The lower double-precision value is
961	/// compared to the lower double-precision value of \a __b.
962	/// \param __b
963	/// A 128-bit vector of [2 x double]. The lower double-precision value is
964	/// compared to the lower double-precision value of \a __a.
965	/// \returns An integer containing the comparison results. If either of the two
966	/// lower double-precision values is NaN, 0 is returned.
967	static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
968	__m128d __b) {
969	return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
970	}
971
972	/// Compares the lower double-precision floating-point values in each of
973	/// the two 128-bit floating-point vectors of [2 x double] to determine if
974	/// the value in the first parameter is less than the corresponding value in
975	/// the second parameter.
976	///
977	/// The comparison yields 0 for false, 1 for true. If either of the two
978	/// lower double-precision values is NaN, 0 is returned.
979	///
980	/// \headerfile <x86intrin.h>
981	///
982	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
983	///
984	/// \param __a
985	/// A 128-bit vector of [2 x double]. The lower double-precision value is
986	/// compared to the lower double-precision value of \a __b.
987	/// \param __b
988	/// A 128-bit vector of [2 x double]. The lower double-precision value is
989	/// compared to the lower double-precision value of \a __a.
990	/// \returns An integer containing the comparison results. If either of the two
991	/// lower double-precision values is NaN, 0 is returned.
992	static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993	__m128d __b) {
994	return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
995	}
996
997	/// Compares the lower double-precision floating-point values in each of
998	/// the two 128-bit floating-point vectors of [2 x double] to determine if
999	/// the value in the first parameter is less than or equal to the
1000	/// corresponding value in the second parameter.
1001	///
1002	/// The comparison yields 0 for false, 1 for true. If either of the two
1003	/// lower double-precision values is NaN, 0 is returned.
1004	///
1005	/// \headerfile <x86intrin.h>
1006	///
1007	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008	///
1009	/// \param __a
1010	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1011	/// compared to the lower double-precision value of \a __b.
1012	/// \param __b
1013	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1014	/// compared to the lower double-precision value of \a __a.
1015	/// \returns An integer containing the comparison results. If either of the two
1016	/// lower double-precision values is NaN, 0 is returned.
1017	static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1018	__m128d __b) {
1019	return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1020	}
1021
1022	/// Compares the lower double-precision floating-point values in each of
1023	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1024	/// the value in the first parameter is greater than the corresponding value
1025	/// in the second parameter.
1026	///
1027	/// The comparison yields 0 for false, 1 for true. If either of the two
1028	/// lower double-precision values is NaN, 0 is returned.
1029	///
1030	/// \headerfile <x86intrin.h>
1031	///
1032	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1033	///
1034	/// \param __a
1035	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1036	/// compared to the lower double-precision value of \a __b.
1037	/// \param __b
1038	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1039	/// compared to the lower double-precision value of \a __a.
1040	/// \returns An integer containing the comparison results. If either of the two
1041	/// lower double-precision values is NaN, 0 is returned.
1042	static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1043	__m128d __b) {
1044	return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1045	}
1046
1047	/// Compares the lower double-precision floating-point values in each of
1048	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1049	/// the value in the first parameter is greater than or equal to the
1050	/// corresponding value in the second parameter.
1051	///
1052	/// The comparison yields 0 for false, 1 for true. If either of the two
1053	/// lower double-precision values is NaN, 0 is returned.
1054	///
1055	/// \headerfile <x86intrin.h>
1056	///
1057	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1058	///
1059	/// \param __a
1060	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1061	/// compared to the lower double-precision value of \a __b.
1062	/// \param __b
1063	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1064	/// compared to the lower double-precision value of \a __a.
1065	/// \returns An integer containing the comparison results. If either of the two
1066	/// lower double-precision values is NaN, 0 is returned.
1067	static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1068	__m128d __b) {
1069	return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1070	}
1071
1072	/// Compares the lower double-precision floating-point values in each of
1073	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1074	/// the value in the first parameter is unequal to the corresponding value in
1075	/// the second parameter.
1076	///
1077	/// The comparison yields 0 for false, 1 for true. If either of the two
1078	/// lower double-precision values is NaN, 1 is returned.
1079	///
1080	/// \headerfile <x86intrin.h>
1081	///
1082	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1083	///
1084	/// \param __a
1085	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1086	/// compared to the lower double-precision value of \a __b.
1087	/// \param __b
1088	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1089	/// compared to the lower double-precision value of \a __a.
1090	/// \returns An integer containing the comparison results. If either of the two
1091	/// lower double-precision values is NaN, 1 is returned.
1092	static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1093	__m128d __b) {
1094	return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1095	}
1096
1097	/// Compares the lower double-precision floating-point values in each of
1098	/// the two 128-bit floating-point vectors of [2 x double] for equality. The
1099	/// comparison yields 0 for false, 1 for true.
1100	///
1101	/// If either of the two lower double-precision values is NaN, 0 is returned.
1102	///
1103	/// \headerfile <x86intrin.h>
1104	///
1105	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1106	///
1107	/// \param __a
1108	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1109	/// compared to the lower double-precision value of \a __b.
1110	/// \param __b
1111	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1112	/// compared to the lower double-precision value of \a __a.
1113	/// \returns An integer containing the comparison results. If either of the two
1114	/// lower double-precision values is NaN, 0 is returned.
1115	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1116	__m128d __b) {
1117	return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1118	}
1119
1120	/// Compares the lower double-precision floating-point values in each of
1121	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1122	/// the value in the first parameter is less than the corresponding value in
1123	/// the second parameter.
1124	///
1125	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1126	/// double-precision values is NaN, 0 is returned.
1127	///
1128	/// \headerfile <x86intrin.h>
1129	///
1130	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1131	///
1132	/// \param __a
1133	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1134	/// compared to the lower double-precision value of \a __b.
1135	/// \param __b
1136	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1137	/// compared to the lower double-precision value of \a __a.
1138	/// \returns An integer containing the comparison results. If either of the two
1139	/// lower double-precision values is NaN, 0 is returned.
1140	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1141	__m128d __b) {
1142	return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1143	}
1144
1145	/// Compares the lower double-precision floating-point values in each of
1146	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1147	/// the value in the first parameter is less than or equal to the
1148	/// corresponding value in the second parameter.
1149	///
1150	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1151	/// double-precision values is NaN, 0 is returned.
1152	///
1153	/// \headerfile <x86intrin.h>
1154	///
1155	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1156	///
1157	/// \param __a
1158	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1159	/// compared to the lower double-precision value of \a __b.
1160	/// \param __b
1161	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1162	/// compared to the lower double-precision value of \a __a.
1163	/// \returns An integer containing the comparison results. If either of the two
1164	/// lower double-precision values is NaN, 0 is returned.
1165	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1166	__m128d __b) {
1167	return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1168	}
1169
1170	/// Compares the lower double-precision floating-point values in each of
1171	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1172	/// the value in the first parameter is greater than the corresponding value
1173	/// in the second parameter.
1174	///
1175	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1176	/// double-precision values is NaN, 0 is returned.
1177	///
1178	/// \headerfile <x86intrin.h>
1179	///
1180	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1181	///
1182	/// \param __a
1183	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1184	/// compared to the lower double-precision value of \a __b.
1185	/// \param __b
1186	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1187	/// compared to the lower double-precision value of \a __a.
1188	/// \returns An integer containing the comparison results. If either of the two
1189	/// lower double-precision values is NaN, 0 is returned.
1190	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1191	__m128d __b) {
1192	return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1193	}
1194
1195	/// Compares the lower double-precision floating-point values in each of
1196	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1197	/// the value in the first parameter is greater than or equal to the
1198	/// corresponding value in the second parameter.
1199	///
1200	/// The comparison yields 0 for false, 1 for true. If either of the two
1201	/// lower double-precision values is NaN, 0 is returned.
1202	///
1203	/// \headerfile <x86intrin.h>
1204	///
1205	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1206	///
1207	/// \param __a
1208	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1209	/// compared to the lower double-precision value of \a __b.
1210	/// \param __b
1211	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1212	/// compared to the lower double-precision value of \a __a.
1213	/// \returns An integer containing the comparison results. If either of the two
1214	/// lower double-precision values is NaN, 0 is returned.
1215	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1216	__m128d __b) {
1217	return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1218	}
1219
1220	/// Compares the lower double-precision floating-point values in each of
1221	/// the two 128-bit floating-point vectors of [2 x double] to determine if
1222	/// the value in the first parameter is unequal to the corresponding value in
1223	/// the second parameter.
1224	///
1225	/// The comparison yields 0 for false, 1 for true. If either of the two lower
1226	/// double-precision values is NaN, 1 is returned.
1227	///
1228	/// \headerfile <x86intrin.h>
1229	///
1230	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1231	///
1232	/// \param __a
1233	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1234	/// compared to the lower double-precision value of \a __b.
1235	/// \param __b
1236	/// A 128-bit vector of [2 x double]. The lower double-precision value is
1237	/// compared to the lower double-precision value of \a __a.
1238	/// \returns An integer containing the comparison result. If either of the two
1239	/// lower double-precision values is NaN, 1 is returned.
1240	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1241	__m128d __b) {
1242	return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1243	}
1244
1245	/// Converts the two double-precision floating-point elements of a
1246	/// 128-bit vector of [2 x double] into two single-precision floating-point
1247	/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248	/// The upper 64 bits of the result vector are set to zero.
1249	///
1250	/// \headerfile <x86intrin.h>
1251	///
1252	/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1253	///
1254	/// \param __a
1255	/// A 128-bit vector of [2 x double].
1256	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257	/// converted values. The upper 64 bits are set to zero.
1258	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1259	return __builtin_ia32_cvtpd2ps((__v2df)__a);
1260	}
1261
1262	/// Converts the lower two single-precision floating-point elements of a
1263	/// 128-bit vector of [4 x float] into two double-precision floating-point
1264	/// values, returned in a 128-bit vector of [2 x double]. The upper two
1265	/// elements of the input vector are unused.
1266	///
1267	/// \headerfile <x86intrin.h>
1268	///
1269	/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1270	///
1271	/// \param __a
1272	/// A 128-bit vector of [4 x float]. The lower two single-precision
1273	/// floating-point elements are converted to double-precision values. The
1274	/// upper two elements are unused.
1275	/// \returns A 128-bit vector of [2 x double] containing the converted values.
1276	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1277	return (__m128d) __builtin_convertvector(
1278	__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1279	}
1280
1281	/// Converts the lower two integer elements of a 128-bit vector of
1282	/// [4 x i32] into two double-precision floating-point values, returned in a
1283	/// 128-bit vector of [2 x double].
1284	///
1285	/// The upper two elements of the input vector are unused.
1286	///
1287	/// \headerfile <x86intrin.h>
1288	///
1289	/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1290	///
1291	/// \param __a
1292	/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293	/// converted to double-precision values.
1294	///
1295	/// The upper two elements are unused.
1296	/// \returns A 128-bit vector of [2 x double] containing the converted values.
1297	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1298	return (__m128d) __builtin_convertvector(
1299	__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1300	}
1301
1302	/// Converts the two double-precision floating-point elements of a
1303	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1304	/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305	/// 64 bits of the result vector are set to zero.
1306	///
1307	/// \headerfile <x86intrin.h>
1308	///
1309	/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1310	///
1311	/// \param __a
1312	/// A 128-bit vector of [2 x double].
1313	/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314	/// converted values. The upper 64 bits are set to zero.
1315	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1316	return __builtin_ia32_cvtpd2dq((__v2df)__a);
1317	}
1318
1319	/// Converts the low-order element of a 128-bit vector of [2 x double]
1320	/// into a 32-bit signed integer value.
1321	///
1322	/// \headerfile <x86intrin.h>
1323	///
1324	/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1325	///
1326	/// \param __a
1327	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1328	/// conversion.
1329	/// \returns A 32-bit signed integer containing the converted value.
1330	static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1331	return __builtin_ia32_cvtsd2si((__v2df)__a);
1332	}
1333
1334	/// Converts the lower double-precision floating-point element of a
1335	/// 128-bit vector of [2 x double], in the second parameter, into a
1336	/// single-precision floating-point value, returned in the lower 32 bits of a
1337	/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338	/// copied from the upper 96 bits of the first parameter.
1339	///
1340	/// \headerfile <x86intrin.h>
1341	///
1342	/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1343	///
1344	/// \param __a
1345	/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346	/// copied to the upper 96 bits of the result.
1347	/// \param __b
1348	/// A 128-bit vector of [2 x double]. The lower double-precision
1349	/// floating-point element is used in the conversion.
1350	/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351	/// converted value from the second parameter. The upper 96 bits are copied
1352	/// from the upper 96 bits of the first parameter.
1353	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1354	__m128d __b) {
1355	return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1356	}
1357
1358	/// Converts a 32-bit signed integer value, in the second parameter, into
1359	/// a double-precision floating-point value, returned in the lower 64 bits of
1360	/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361	/// are copied from the upper 64 bits of the first parameter.
1362	///
1363	/// \headerfile <x86intrin.h>
1364	///
1365	/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1366	///
1367	/// \param __a
1368	/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369	/// copied to the upper 64 bits of the result.
1370	/// \param __b
1371	/// A 32-bit signed integer containing the value to be converted.
1372	/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373	/// converted value from the second parameter. The upper 64 bits are copied
1374	/// from the upper 64 bits of the first parameter.
1375	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1376	int __b) {
1377	__a[0] = __b;
1378	return __a;
1379	}
1380
1381	/// Converts the lower single-precision floating-point element of a
1382	/// 128-bit vector of [4 x float], in the second parameter, into a
1383	/// double-precision floating-point value, returned in the lower 64 bits of
1384	/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385	/// are copied from the upper 64 bits of the first parameter.
1386	///
1387	/// \headerfile <x86intrin.h>
1388	///
1389	/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1390	///
1391	/// \param __a
1392	/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393	/// copied to the upper 64 bits of the result.
1394	/// \param __b
1395	/// A 128-bit vector of [4 x float]. The lower single-precision
1396	/// floating-point element is used in the conversion.
1397	/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398	/// converted value from the second parameter. The upper 64 bits are copied
1399	/// from the upper 64 bits of the first parameter.
1400	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1401	__m128 __b) {
1402	__a[0] = __b[0];
1403	return __a;
1404	}
1405
1406	/// Converts the two double-precision floating-point elements of a
1407	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1408	/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1409	///
1410	/// If the result of either conversion is inexact, the result is truncated
1411	/// (rounded towards zero) regardless of the current MXCSR setting. The upper
1412	/// 64 bits of the result vector are set to zero.
1413	///
1414	/// \headerfile <x86intrin.h>
1415	///
1416	/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1417	/// instruction.
1418	///
1419	/// \param __a
1420	/// A 128-bit vector of [2 x double].
1421	/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422	/// converted values. The upper 64 bits are set to zero.
1423	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1424	return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1425	}
1426
1427	/// Converts the low-order element of a [2 x double] vector into a 32-bit
1428	/// signed integer value, truncating the result when it is inexact.
1429	///
1430	/// \headerfile <x86intrin.h>
1431	///
1432	/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1433	/// instruction.
1434	///
1435	/// \param __a
1436	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1437	/// conversion.
1438	/// \returns A 32-bit signed integer containing the converted value.
1439	static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1440	return __builtin_ia32_cvttsd2si((__v2df)__a);
1441	}
1442
1443	/// Converts the two double-precision floating-point elements of a
1444	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1445	/// returned in a 64-bit vector of [2 x i32].
1446	///
1447	/// \headerfile <x86intrin.h>
1448	///
1449	/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1450	///
1451	/// \param __a
1452	/// A 128-bit vector of [2 x double].
1453	/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1455	return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1456	}
1457
1458	/// Converts the two double-precision floating-point elements of a
1459	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1460	/// returned in a 64-bit vector of [2 x i32].
1461	///
1462	/// If the result of either conversion is inexact, the result is truncated
1463	/// (rounded towards zero) regardless of the current MXCSR setting.
1464	///
1465	/// \headerfile <x86intrin.h>
1466	///
1467	/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1468	///
1469	/// \param __a
1470	/// A 128-bit vector of [2 x double].
1471	/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1473	return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1474	}
1475
1476	/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477	/// [2 x i32] into two double-precision floating-point values, returned in a
1478	/// 128-bit vector of [2 x double].
1479	///
1480	/// \headerfile <x86intrin.h>
1481	///
1482	/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1483	///
1484	/// \param __a
1485	/// A 64-bit vector of [2 x i32].
1486	/// \returns A 128-bit vector of [2 x double] containing the converted values.
1487	static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1488	return __builtin_ia32_cvtpi2pd((__v2si)__a);
1489	}
1490
1491	/// Returns the low-order element of a 128-bit vector of [2 x double] as
1492	/// a double-precision floating-point value.
1493	///
1494	/// \headerfile <x86intrin.h>
1495	///
1496	/// This intrinsic has no corresponding instruction.
1497	///
1498	/// \param __a
1499	/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500	/// \returns A double-precision floating-point value copied from the lower 64
1501	/// bits of \a __a.
1502	static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1503	return __a[0];
1504	}
1505
1506	/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507	/// memory location.
1508	///
1509	/// \headerfile <x86intrin.h>
1510	///
1511	/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1512	///
1513	/// \param __dp
1514	/// A pointer to a 128-bit memory location. The address of the memory
1515	/// location has to be 16-byte aligned.
1516	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1518	return (const __m128d )__dp;
1519	}
1520
1521	/// Loads a double-precision floating-point value from a specified memory
1522	/// location and duplicates it to both vector elements of a 128-bit vector of
1523	/// [2 x double].
1524	///
1525	/// \headerfile <x86intrin.h>
1526	///
1527	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1528	///
1529	/// \param __dp
1530	/// A pointer to a memory location containing a double-precision value.
1531	/// \returns A 128-bit vector of [2 x double] containing the loaded and
1532	/// duplicated values.
1533	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1534	struct __mm_load1_pd_struct {
1535	double __u;
1536	} __attribute__((__packed__, __may_alias__));
1537	double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1538	return __extension__(__m128d){__u, __u};
1539	}
1540
/* Alternate spelling of _mm_load1_pd: expands to a call to it with the same
   argument. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1542
1543	/// Loads two double-precision values, in reverse order, from an aligned
1544	/// memory location into a 128-bit vector of [2 x double].
1545	///
1546	/// \headerfile <x86intrin.h>
1547	///
1548	/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549	/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550	/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1551	///
1552	/// \param __dp
1553	/// A 16-byte aligned pointer to an array of double-precision values to be
1554	/// loaded in reverse order.
1555	/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1556	/// values.
1557	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1558	__m128d __u = (const __m128d )__dp;
1559	return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1560	}
1561
1562	/// Loads a 128-bit floating-point vector of [2 x double] from an
1563	/// unaligned memory location.
1564	///
1565	/// \headerfile <x86intrin.h>
1566	///
1567	/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1568	///
1569	/// \param __dp
1570	/// A pointer to a 128-bit memory location. The address of the memory
1571	/// location does not have to be aligned.
1572	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1574	struct __loadu_pd {
1575	__m128d_u __v;
1576	} __attribute__((__packed__, __may_alias__));
1577	return ((const struct __loadu_pd *)__dp)->__v;
1578	}
1579
1580	/// Loads a 64-bit integer value to the low element of a 128-bit integer
1581	/// vector and clears the upper element.
1582	///
1583	/// \headerfile <x86intrin.h>
1584	///
1585	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1586	///
1587	/// \param __a
1588	/// A pointer to a 64-bit memory location. The address of the memory
1589	/// location does not have to be aligned.
1590	/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1592	struct __loadu_si64 {
1593	long long __v;
1594	} __attribute__((__packed__, __may_alias__));
1595	long long __u = ((const struct __loadu_si64 *)__a)->__v;
1596	return __extension__(__m128i)(__v2di){__u, 0LL};
1597	}
1598
1599	/// Loads a 32-bit integer value to the low element of a 128-bit integer
1600	/// vector and clears the upper element.
1601	///
1602	/// \headerfile <x86intrin.h>
1603	///
1604	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1605	///
1606	/// \param __a
1607	/// A pointer to a 32-bit memory location. The address of the memory
1608	/// location does not have to be aligned.
1609	/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1611	struct __loadu_si32 {
1612	int __v;
1613	} __attribute__((__packed__, __may_alias__));
1614	int __u = ((const struct __loadu_si32 *)__a)->__v;
1615	return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1616	}
1617
1618	/// Loads a 16-bit integer value to the low element of a 128-bit integer
1619	/// vector and clears the upper element.
1620	///
1621	/// \headerfile <x86intrin.h>
1622	///
1623	/// This intrinsic does not correspond to a specific instruction.
1624	///
1625	/// \param __a
1626	/// A pointer to a 16-bit memory location. The address of the memory
1627	/// location does not have to be aligned.
1628	/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1630	struct __loadu_si16 {
1631	short __v;
1632	} __attribute__((__packed__, __may_alias__));
1633	short __u = ((const struct __loadu_si16 *)__a)->__v;
1634	return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1635	}
1636
1637	/// Loads a 64-bit double-precision value to the low element of a
1638	/// 128-bit integer vector and clears the upper element.
1639	///
1640	/// \headerfile <x86intrin.h>
1641	///
1642	/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1643	///
1644	/// \param __dp
1645	/// A pointer to a memory location containing a double-precision value.
1646	/// The address of the memory location does not have to be aligned.
1647	/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1649	struct __mm_load_sd_struct {
1650	double __u;
1651	} __attribute__((__packed__, __may_alias__));
1652	double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1653	return __extension__(__m128d){__u, 0};
1654	}
1655
1656	/// Loads a double-precision value into the high-order bits of a 128-bit
1657	/// vector of [2 x double]. The low-order bits are copied from the low-order
1658	/// bits of the first operand.
1659	///
1660	/// \headerfile <x86intrin.h>
1661	///
1662	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1663	///
1664	/// \param __a
1665	/// A 128-bit vector of [2 x double]. \n
1666	/// Bits [63:0] are written to bits [63:0] of the result.
1667	/// \param __dp
1668	/// A pointer to a 64-bit memory location containing a double-precision
1669	/// floating-point value that is loaded. The loaded value is written to bits
1670	/// [127:64] of the result. The address of the memory location does not have
1671	/// to be aligned.
1672	/// \returns A 128-bit vector of [2 x double] containing the moved values.
1673	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1674	double const *__dp) {
1675	struct __mm_loadh_pd_struct {
1676	double __u;
1677	} __attribute__((__packed__, __may_alias__));
1678	double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1679	return __extension__(__m128d){__a[0], __u};
1680	}
1681
1682	/// Loads a double-precision value into the low-order bits of a 128-bit
1683	/// vector of [2 x double]. The high-order bits are copied from the
1684	/// high-order bits of the first operand.
1685	///
1686	/// \headerfile <x86intrin.h>
1687	///
1688	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1689	///
1690	/// \param __a
1691	/// A 128-bit vector of [2 x double]. \n
1692	/// Bits [127:64] are written to bits [127:64] of the result.
1693	/// \param __dp
1694	/// A pointer to a 64-bit memory location containing a double-precision
1695	/// floating-point value that is loaded. The loaded value is written to bits
1696	/// [63:0] of the result. The address of the memory location does not have to
1697	/// be aligned.
1698	/// \returns A 128-bit vector of [2 x double] containing the moved values.
1699	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1700	double const *__dp) {
1701	struct __mm_loadl_pd_struct {
1702	double __u;
1703	} __attribute__((__packed__, __may_alias__));
1704	double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1705	return __extension__(__m128d){__u, __a[1]};
1706	}
1707
1708	/// Constructs a 128-bit floating-point vector of [2 x double] with
1709	/// unspecified content. This could be used as an argument to another
1710	/// intrinsic function where the argument is required but the value is not
1711	/// actually used.
1712	///
1713	/// \headerfile <x86intrin.h>
1714	///
1715	/// This intrinsic has no corresponding instruction.
1716	///
1717	/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1718	/// content.
1719	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1720	return (__m128d)__builtin_ia32_undef128();
1721	}
1722
1723	/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724	/// 64 bits of the vector are initialized with the specified double-precision
1725	/// floating-point value. The upper 64 bits are set to zero.
1726	///
1727	/// \headerfile <x86intrin.h>
1728	///
1729	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1730	///
1731	/// \param __w
1732	/// A double-precision floating-point value used to initialize the lower 64
1733	/// bits of the result.
1734	/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735	/// lower 64 bits contain the value of the parameter. The upper 64 bits are
1736	/// set to zero.
1737	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1738	return __extension__(__m128d){__w, 0};
1739	}
1740
1741	/// Constructs a 128-bit floating-point vector of [2 x double], with each
1742	/// of the two double-precision floating-point vector elements set to the
1743	/// specified double-precision floating-point value.
1744	///
1745	/// \headerfile <x86intrin.h>
1746	///
1747	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1748	///
1749	/// \param __w
1750	/// A double-precision floating-point value used to initialize each vector
1751	/// element of the result.
1752	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1753	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1754	return __extension__(__m128d){__w, __w};
1755	}
1756
1757	/// Constructs a 128-bit floating-point vector of [2 x double], with each
1758	/// of the two double-precision floating-point vector elements set to the
1759	/// specified double-precision floating-point value.
1760	///
1761	/// \headerfile <x86intrin.h>
1762	///
1763	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1764	///
1765	/// \param __w
1766	/// A double-precision floating-point value used to initialize each vector
1767	/// element of the result.
1768	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1769	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1770	return _mm_set1_pd(__w);
1771	}
1772
1773	/// Constructs a 128-bit floating-point vector of [2 x double]
1774	/// initialized with the specified double-precision floating-point values.
1775	///
1776	/// \headerfile <x86intrin.h>
1777	///
1778	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1779	///
1780	/// \param __w
1781	/// A double-precision floating-point value used to initialize the upper 64
1782	/// bits of the result.
1783	/// \param __x
1784	/// A double-precision floating-point value used to initialize the lower 64
1785	/// bits of the result.
1786	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1787	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1788	double __x) {
1789	return __extension__(__m128d){__x, __w};
1790	}
1791
1792	/// Constructs a 128-bit floating-point vector of [2 x double],
1793	/// initialized in reverse order with the specified double-precision
1794	/// floating-point values.
1795	///
1796	/// \headerfile <x86intrin.h>
1797	///
1798	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1799	///
1800	/// \param __w
1801	/// A double-precision floating-point value used to initialize the lower 64
1802	/// bits of the result.
1803	/// \param __x
1804	/// A double-precision floating-point value used to initialize the upper 64
1805	/// bits of the result.
1806	/// \returns An initialized 128-bit floating-point vector of [2 x double].
1807	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1808	double __x) {
1809	return __extension__(__m128d){__w, __x};
1810	}
1811
1812	/// Constructs a 128-bit floating-point vector of [2 x double]
1813	/// initialized to zero.
1814	///
1815	/// \headerfile <x86intrin.h>
1816	///
1817	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1818	///
1819	/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820	/// all elements set to zero.
1821	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1822	return __extension__(__m128d){0.0, 0.0};
1823	}
1824
1825	/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826	/// 64 bits are set to the lower 64 bits of the second parameter. The upper
1827	/// 64 bits are set to the upper 64 bits of the first parameter.
1828	///
1829	/// \headerfile <x86intrin.h>
1830	///
1831	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1832	///
1833	/// \param __a
1834	/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835	/// upper 64 bits of the result.
1836	/// \param __b
1837	/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838	/// lower 64 bits of the result.
1839	/// \returns A 128-bit vector of [2 x double] containing the moved values.
1840	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1841	__m128d __b) {
1842	__a[0] = __b[0];
1843	return __a;
1844	}
1845
1846	/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847	/// memory location.
1848	///
1849	/// \headerfile <x86intrin.h>
1850	///
1851	/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1852	///
1853	/// \param __dp
1854	/// A pointer to a 64-bit memory location.
1855	/// \param __a
1856	/// A 128-bit vector of [2 x double] containing the value to be stored.
1857	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1858	__m128d __a) {
1859	struct __mm_store_sd_struct {
1860	double __u;
1861	} __attribute__((__packed__, __may_alias__));
1862	((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1863	}
1864
1865	/// Moves packed double-precision values from a 128-bit vector of
1866	/// [2 x double] to a memory location.
1867	///
1868	/// \headerfile <x86intrin.h>
1869	///
1870	/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1871	///
1872	/// \param __dp
1873	/// A pointer to an aligned memory location that can store two
1874	/// double-precision values.
1875	/// \param __a
1876	/// A packed 128-bit vector of [2 x double] containing the values to be
1877	/// moved.
1878	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1879	__m128d __a) {
1880	(__m128d )__dp = __a;
1881	}
1882
1883	/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884	/// the upper and lower 64 bits of a memory location.
1885	///
1886	/// \headerfile <x86intrin.h>
1887	///
1888	/// This intrinsic corresponds to the
1889	/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1890	///
1891	/// \param __dp
1892	/// A pointer to a memory location that can store two double-precision
1893	/// values.
1894	/// \param __a
1895	/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896	/// of the values in \a __dp.
1897	static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1898	__m128d __a) {
1899	__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1900	_mm_store_pd(__dp, __a);
1901	}
1902
1903	/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904	/// the upper and lower 64 bits of a memory location.
1905	///
1906	/// \headerfile <x86intrin.h>
1907	///
1908	/// This intrinsic corresponds to the
1909	/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1910	///
1911	/// \param __dp
1912	/// A pointer to a memory location that can store two double-precision
1913	/// values.
1914	/// \param __a
1915	/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916	/// of the values in \a __dp.
1917	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1918	__m128d __a) {
1919	_mm_store1_pd(__dp, __a);
1920	}
1921
1922	/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1923	/// location.
1924	///
1925	/// \headerfile <x86intrin.h>
1926	///
1927	/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1928	///
1929	/// \param __dp
1930	/// A pointer to a 128-bit memory location. The address of the memory
1931	/// location does not have to be aligned.
1932	/// \param __a
1933	/// A 128-bit vector of [2 x double] containing the values to be stored.
1934	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1935	__m128d __a) {
1936	struct __storeu_pd {
1937	__m128d_u __v;
1938	} __attribute__((__packed__, __may_alias__));
1939	((struct __storeu_pd *)__dp)->__v = __a;
1940	}
1941
1942	/// Stores two double-precision values, in reverse order, from a 128-bit
1943	/// vector of [2 x double] to a 16-byte aligned memory location.
1944	///
1945	/// \headerfile <x86intrin.h>
1946	///
1947	/// This intrinsic corresponds to a shuffling instruction followed by a
1948	/// <c> VMOVAPD / MOVAPD </c> instruction.
1949	///
1950	/// \param __dp
1951	/// A pointer to a 16-byte aligned memory location that can store two
1952	/// double-precision values.
1953	/// \param __a
1954	/// A 128-bit vector of [2 x double] containing the values to be reversed and
1955	/// stored.
1956	static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1957	__m128d __a) {
1958	__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1959	(__m128d )__dp = __a;
1960	}
1961
1962	/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963	/// memory location.
1964	///
1965	/// \headerfile <x86intrin.h>
1966	///
1967	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1968	///
1969	/// \param __dp
1970	/// A pointer to a 64-bit memory location.
1971	/// \param __a
1972	/// A 128-bit vector of [2 x double] containing the value to be stored.
1973	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1974	__m128d __a) {
1975	struct __mm_storeh_pd_struct {
1976	double __u;
1977	} __attribute__((__packed__, __may_alias__));
1978	((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1979	}
1980
1981	/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982	/// memory location.
1983	///
1984	/// \headerfile <x86intrin.h>
1985	///
1986	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1987	///
1988	/// \param __dp
1989	/// A pointer to a 64-bit memory location.
1990	/// \param __a
1991	/// A 128-bit vector of [2 x double] containing the value to be stored.
1992	static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1993	__m128d __a) {
1994	struct __mm_storeh_pd_struct {
1995	double __u;
1996	} __attribute__((__packed__, __may_alias__));
1997	((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1998	}
1999
2000	/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001	/// saving the lower 8 bits of each sum in the corresponding element of a
2002	/// 128-bit result vector of [16 x i8].
2003	///
2004	/// The integer elements of both parameters can be either signed or unsigned.
2005	///
2006	/// \headerfile <x86intrin.h>
2007	///
2008	/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2009	///
2010	/// \param __a
2011	/// A 128-bit vector of [16 x i8].
2012	/// \param __b
2013	/// A 128-bit vector of [16 x i8].
2014	/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2015	/// parameters.
2016	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2017	__m128i __b) {
2018	return (__m128i)((__v16qu)__a + (__v16qu)__b);
2019	}
2020
2021	/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022	/// saving the lower 16 bits of each sum in the corresponding element of a
2023	/// 128-bit result vector of [8 x i16].
2024	///
2025	/// The integer elements of both parameters can be either signed or unsigned.
2026	///
2027	/// \headerfile <x86intrin.h>
2028	///
2029	/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2030	///
2031	/// \param __a
2032	/// A 128-bit vector of [8 x i16].
2033	/// \param __b
2034	/// A 128-bit vector of [8 x i16].
2035	/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2036	/// parameters.
2037	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2038	__m128i __b) {
2039	return (__m128i)((__v8hu)__a + (__v8hu)__b);
2040	}
2041
2042	/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043	/// saving the lower 32 bits of each sum in the corresponding element of a
2044	/// 128-bit result vector of [4 x i32].
2045	///
2046	/// The integer elements of both parameters can be either signed or unsigned.
2047	///
2048	/// \headerfile <x86intrin.h>
2049	///
2050	/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2051	///
2052	/// \param __a
2053	/// A 128-bit vector of [4 x i32].
2054	/// \param __b
2055	/// A 128-bit vector of [4 x i32].
2056	/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2057	/// parameters.
2058	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2059	__m128i __b) {
2060	return (__m128i)((__v4su)__a + (__v4su)__b);
2061	}
2062
2063	/// Adds two signed or unsigned 64-bit integer values, returning the
2064	/// lower 64 bits of the sum.
2065	///
2066	/// \headerfile <x86intrin.h>
2067	///
2068	/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2069	///
2070	/// \param __a
2071	/// A 64-bit integer.
2072	/// \param __b
2073	/// A 64-bit integer.
2074	/// \returns A 64-bit integer containing the sum of both parameters.
2075	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2076	__m64 __b) {
2077	return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2078	}
2079
2080	/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081	/// saving the lower 64 bits of each sum in the corresponding element of a
2082	/// 128-bit result vector of [2 x i64].
2083	///
2084	/// The integer elements of both parameters can be either signed or unsigned.
2085	///
2086	/// \headerfile <x86intrin.h>
2087	///
2088	/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2089	///
2090	/// \param __a
2091	/// A 128-bit vector of [2 x i64].
2092	/// \param __b
2093	/// A 128-bit vector of [2 x i64].
2094	/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2095	/// parameters.
2096	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2097	__m128i __b) {
2098	return (__m128i)((__v2du)__a + (__v2du)__b);
2099	}
2100
2101	/// Adds, with saturation, the corresponding elements of two 128-bit
2102	/// signed [16 x i8] vectors, saving each sum in the corresponding element of
2103	/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2104	/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2105	///
2106	/// \headerfile <x86intrin.h>
2107	///
2108	/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2109	///
2110	/// \param __a
2111	/// A 128-bit signed [16 x i8] vector.
2112	/// \param __b
2113	/// A 128-bit signed [16 x i8] vector.
2114	/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2115	/// both parameters.
2116	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2117	__m128i __b) {
2118	return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2119	}
2120
2121	/// Adds, with saturation, the corresponding elements of two 128-bit
2122	/// signed [8 x i16] vectors, saving each sum in the corresponding element of
2123	/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2124	/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2125	/// 0x8000.
2126	///
2127	/// \headerfile <x86intrin.h>
2128	///
2129	/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2130	///
2131	/// \param __a
2132	/// A 128-bit signed [8 x i16] vector.
2133	/// \param __b
2134	/// A 128-bit signed [8 x i16] vector.
2135	/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2136	/// both parameters.
2137	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2138	__m128i __b) {
2139	return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2140	}
2141
2142	/// Adds, with saturation, the corresponding elements of two 128-bit
2143	/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2144	/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2145	/// are saturated to 0xFF. Negative sums are saturated to 0x00.
2146	///
2147	/// \headerfile <x86intrin.h>
2148	///
2149	/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2150	///
2151	/// \param __a
2152	/// A 128-bit unsigned [16 x i8] vector.
2153	/// \param __b
2154	/// A 128-bit unsigned [16 x i8] vector.
2155	/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2156	/// of both parameters.
2157	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2158	__m128i __b) {
2159	return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2160	}
2161
2162	/// Adds, with saturation, the corresponding elements of two 128-bit
2163	/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2164	/// of a 128-bit result vector of [8 x i16]. Positive sums greater than
2165	/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2166	///
2167	/// \headerfile <x86intrin.h>
2168	///
2169	/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2170	///
2171	/// \param __a
2172	/// A 128-bit unsigned [8 x i16] vector.
2173	/// \param __b
2174	/// A 128-bit unsigned [8 x i16] vector.
2175	/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2176	/// of both parameters.
2177	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2178	__m128i __b) {
2179	return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2180	}
2181
2182	/// Computes the rounded averages of corresponding elements of two
2183	/// 128-bit unsigned [16 x i8] vectors, saving each result in the
2184	/// corresponding element of a 128-bit result vector of [16 x i8].
2185	///
2186	/// \headerfile <x86intrin.h>
2187	///
2188	/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2189	///
2190	/// \param __a
2191	/// A 128-bit unsigned [16 x i8] vector.
2192	/// \param __b
2193	/// A 128-bit unsigned [16 x i8] vector.
2194	/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2195	/// averages of both parameters.
2196	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2197	__m128i __b) {
2198	return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2199	}
2200
2201	/// Computes the rounded averages of corresponding elements of two
2202	/// 128-bit unsigned [8 x i16] vectors, saving each result in the
2203	/// corresponding element of a 128-bit result vector of [8 x i16].
2204	///
2205	/// \headerfile <x86intrin.h>
2206	///
2207	/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2208	///
2209	/// \param __a
2210	/// A 128-bit unsigned [8 x i16] vector.
2211	/// \param __b
2212	/// A 128-bit unsigned [8 x i16] vector.
2213	/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2214	/// averages of the corresponding elements of \a __a and \a __b.
2215	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2216	__m128i __b) {
2217	return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2218	}
2219
2220	/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2221	/// vectors, producing eight intermediate 32-bit signed integer products, and
2222	/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2223	/// [4 x i32] vector.
2224	///
2225	/// For example, bits [15:0] of both parameters are multiplied producing a
2226	/// 32-bit product, bits [31:16] of both parameters are multiplied producing
2227	/// a 32-bit product, and the sum of those two products becomes bits [31:0]
2228	/// of the result.
2229	///
2230	/// \headerfile <x86intrin.h>
2231	///
2232	/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2233	///
2234	/// \param __a
2235	/// A 128-bit signed [8 x i16] vector.
2236	/// \param __b
2237	/// A 128-bit signed [8 x i16] vector.
2238	/// \returns A 128-bit signed [4 x i32] vector in which each element is the sum
2239	/// of one consecutive pair of the 32-bit products.
2240	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2241	__m128i __b) {
2242	return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2243	}
2244
2245	/// Compares corresponding elements of two 128-bit signed [8 x i16]
2246	/// vectors, saving the greater value from each comparison in the
2247	/// corresponding element of a 128-bit result vector of [8 x i16].
2248	///
2249	/// \headerfile <x86intrin.h>
2250	///
2251	/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2252	///
2253	/// \param __a
2254	/// A 128-bit signed [8 x i16] vector.
2255	/// \param __b
2256	/// A 128-bit signed [8 x i16] vector.
2257	/// \returns A 128-bit signed [8 x i16] vector in which each element is the
2258	/// greater of the two corresponding elements of \a __a and \a __b.
2259	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2260	__m128i __b) {
2261	return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2262	}
2263
2264	/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2265	/// vectors, saving the greater value from each comparison in the
2266	/// corresponding element of a 128-bit result vector of [16 x i8].
2267	///
2268	/// \headerfile <x86intrin.h>
2269	///
2270	/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2271	///
2272	/// \param __a
2273	/// A 128-bit unsigned [16 x i8] vector.
2274	/// \param __b
2275	/// A 128-bit unsigned [16 x i8] vector.
2276	/// \returns A 128-bit unsigned [16 x i8] vector in which each element is the
2277	/// greater of the two corresponding elements of \a __a and \a __b.
2278	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2279	__m128i __b) {
2280	return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2281	}
2282
2283	/// Compares corresponding elements of two 128-bit signed [8 x i16]
2284	/// vectors, saving the smaller value from each comparison in the
2285	/// corresponding element of a 128-bit result vector of [8 x i16].
2286	///
2287	/// \headerfile <x86intrin.h>
2288	///
2289	/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2290	///
2291	/// \param __a
2292	/// A 128-bit signed [8 x i16] vector.
2293	/// \param __b
2294	/// A 128-bit signed [8 x i16] vector.
2295	/// \returns A 128-bit signed [8 x i16] vector in which each element is the
2296	/// smaller of the two corresponding elements of \a __a and \a __b.
2297	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2298	__m128i __b) {
2299	return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2300	}
2301
2302	/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2303	/// vectors, saving the smaller value from each comparison in the
2304	/// corresponding element of a 128-bit result vector of [16 x i8].
2305	///
2306	/// \headerfile <x86intrin.h>
2307	///
2308	/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2309	///
2310	/// \param __a
2311	/// A 128-bit unsigned [16 x i8] vector.
2312	/// \param __b
2313	/// A 128-bit unsigned [16 x i8] vector.
2314	/// \returns A 128-bit unsigned [16 x i8] vector in which each element is the
2315	/// smaller of the two corresponding elements of \a __a and \a __b.
2316	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2317	__m128i __b) {
2318	return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2319	}
2320
2321	/// Multiplies the corresponding elements of two signed [8 x i16]
2322	/// vectors, saving the upper 16 bits of each 32-bit product in the
2323	/// corresponding element of a 128-bit signed [8 x i16] result vector.
2324	///
2325	/// \headerfile <x86intrin.h>
2326	///
2327	/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2328	///
2329	/// \param __a
2330	/// A 128-bit signed [8 x i16] vector containing the multiplicands.
2331	/// \param __b
2332	/// A 128-bit signed [8 x i16] vector containing the multipliers.
2333	/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2334	/// each of the eight 32-bit products.
2335	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2336	__m128i __b) {
2337	return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2338	}
2339
2340	/// Multiplies the corresponding elements of two unsigned [8 x i16]
2341	/// vectors, saving the upper 16 bits of each 32-bit product in the
2342	/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2343	///
2344	/// \headerfile <x86intrin.h>
2345	///
2346	/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2347	///
2348	/// \param __a
2349	/// A 128-bit unsigned [8 x i16] vector containing the multiplicands.
2350	/// \param __b
2351	/// A 128-bit unsigned [8 x i16] vector containing the multipliers.
2352	/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2353	/// of each of the eight 32-bit products.
2354	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2355	__m128i __b) {
2356	return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2357	}
2358
2359	/// Multiplies the corresponding elements of two signed [8 x i16]
2360	/// vectors, saving the lower 16 bits of each 32-bit product in the
2361	/// corresponding element of a 128-bit signed [8 x i16] result vector.
2362	///
2363	/// \headerfile <x86intrin.h>
2364	///
2365	/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2366	///
2367	/// \param __a
2368	/// A 128-bit signed [8 x i16] vector containing the multiplicands.
2369	/// \param __b
2370	/// A 128-bit signed [8 x i16] vector containing the multipliers.
2371	/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2372	/// each of the eight 32-bit products.
2373	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2374	__m128i __b) {
2375	return (__m128i)((__v8hu)__a * (__v8hu)__b); // unsigned lanes: overflow wraps; low 16 bits equal the signed product's
2376	}
2377
2378	/// Multiplies 32-bit unsigned integer values contained in the lower 32 bits
2379	/// of the two 64-bit integer vectors and returns the 64-bit unsigned
2380	/// product.
2381	///
2382	/// \headerfile <x86intrin.h>
2383	///
2384	/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2385	///
2386	/// \param __a
2387	/// A 64-bit integer vector containing one of the source operands.
2388	/// \param __b
2389	/// A 64-bit integer vector containing one of the source operands.
2390	/// \returns A 64-bit integer vector containing the product of both operands.
2391	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2392	__m64 __b) {
2393	return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2394	}
2395
2396	/// Multiplies 32-bit unsigned integer values contained in the lower 32
2397	/// bits of the corresponding elements of two [2 x i64] vectors, and returns
2398	/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2399	///
2400	/// \headerfile <x86intrin.h>
2401	///
2402	/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2403	///
2404	/// \param __a
2405	/// A [2 x i64] vector containing one of the source operands.
2406	/// \param __b
2407	/// A [2 x i64] vector containing one of the source operands.
2408	/// \returns A [2 x i64] vector containing the two 64-bit products.
2409	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2410	__m128i __b) {
2411	return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2412	}
2413
2414	/// Computes the absolute differences of corresponding unsigned 8-bit integer
2415	/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2416	/// separately sums the second 8 absolute differences. Packs these two
2417	/// unsigned 16-bit integer sums into the lower and upper elements,
2418	/// respectively, of a [2 x i64] vector.
2419	///
2420	/// \headerfile <x86intrin.h>
2421	///
2422	/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2423	///
2424	/// \param __a
2425	/// A 128-bit integer vector containing one of the source operands.
2426	/// \param __b
2427	/// A 128-bit integer vector containing one of the source operands.
2428	/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2429	/// differences between both operands.
2430	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2431	__m128i __b) {
2432	return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2433	}
2434
2435	/// Subtracts the corresponding 8-bit integer values in the operands.
2436	///
2437	/// \headerfile <x86intrin.h>
2438	///
2439	/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2440	///
2441	/// \param __a
2442	/// A 128-bit integer vector containing the minuends.
2443	/// \param __b
2444	/// A 128-bit integer vector containing the subtrahends.
2445	/// \returns A 128-bit integer vector containing the differences of the values
2446	/// in the operands.
2447	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2448	__m128i __b) {
2449	return (__m128i)((__v16qu)__a - (__v16qu)__b); // unsigned lanes: overflow wraps instead of saturating
2450	}
2451
2452	/// Subtracts the corresponding 16-bit integer values in the operands.
2453	///
2454	/// \headerfile <x86intrin.h>
2455	///
2456	/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2457	///
2458	/// \param __a
2459	/// A 128-bit integer vector containing the minuends.
2460	/// \param __b
2461	/// A 128-bit integer vector containing the subtrahends.
2462	/// \returns A 128-bit integer vector containing the differences of the values
2463	/// in the operands.
2464	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2465	__m128i __b) {
2466	return (__m128i)((__v8hu)__a - (__v8hu)__b); // unsigned lanes: overflow wraps instead of saturating
2467	}
2468
2469	/// Subtracts the corresponding 32-bit integer values in the operands.
2470	///
2471	/// \headerfile <x86intrin.h>
2472	///
2473	/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2474	///
2475	/// \param __a
2476	/// A 128-bit integer vector containing the minuends.
2477	/// \param __b
2478	/// A 128-bit integer vector containing the subtrahends.
2479	/// \returns A 128-bit integer vector containing the differences of the values
2480	/// in the operands.
2481	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2482	__m128i __b) {
2483	return (__m128i)((__v4su)__a - (__v4su)__b); // __v4su is declared outside this header; presumably unsigned lanes, so overflow wraps
2484	}
2485
2486	/// Subtracts signed or unsigned 64-bit integer values and writes the
2487	/// difference to the corresponding bits in the destination.
2488	///
2489	/// \headerfile <x86intrin.h>
2490	///
2491	/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2492	///
2493	/// \param __a
2494	/// A 64-bit integer vector containing the minuend.
2495	/// \param __b
2496	/// A 64-bit integer vector containing the subtrahend.
2497	/// \returns A 64-bit integer vector containing the difference of the minuend
2498	/// \a __a and the subtrahend \a __b.
2499	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2500	__m64 __b) {
2501	return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2502	}
2503
2504	/// Subtracts the corresponding elements of two [2 x i64] vectors.
2505	///
2506	/// \headerfile <x86intrin.h>
2507	///
2508	/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2509	///
2510	/// \param __a
2511	/// A 128-bit integer vector containing the minuends.
2512	/// \param __b
2513	/// A 128-bit integer vector containing the subtrahends.
2514	/// \returns A 128-bit integer vector containing the differences of the values
2515	/// in the operands.
2516	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2517	__m128i __b) {
2518	return (__m128i)((__v2du)__a - (__v2du)__b); // unsigned lanes: overflow wraps instead of saturating
2519	}
2520
2521	/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2522	/// the input and returns the differences in the corresponding bytes in the
2523	/// destination. Differences greater than 0x7F (127) are saturated to 0x7F,
2524	/// and differences less than 0x80 (-128) are saturated to 0x80.
2525	///
2526	/// \headerfile <x86intrin.h>
2527	///
2528	/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2529	///
2530	/// \param __a
2531	/// A 128-bit integer vector containing the minuends.
2532	/// \param __b
2533	/// A 128-bit integer vector containing the subtrahends.
2534	/// \returns A 128-bit integer vector containing the saturated differences of
2535	/// the values in the operands.
2536	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2537	__m128i __b) {
2538	return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2539	}
2540
2541	/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2542	/// the input and returns the differences in the corresponding 16-bit elements
2543	/// of the destination. Differences greater than 0x7FFF (32767) are saturated
2544	/// to 0x7FFF, and differences less than 0x8000 (-32768) are saturated to 0x8000.
2545	///
2546	/// \headerfile <x86intrin.h>
2547	///
2548	/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2549	///
2550	/// \param __a
2551	/// A 128-bit integer vector containing the minuends.
2552	/// \param __b
2553	/// A 128-bit integer vector containing the subtrahends.
2554	/// \returns A 128-bit integer vector containing the saturated differences of
2555	/// the values in the operands.
2556	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2557	__m128i __b) {
2558	return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2559	}
2560
2561	/// Subtracts, with saturation, corresponding 8-bit unsigned integer values
2562	/// in the input and returns the differences in the corresponding bytes in
2563	/// the destination. Differences less than 0x00 are saturated to 0x00.
2564	///
2565	/// \headerfile <x86intrin.h>
2566	///
2567	/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2568	///
2569	/// \param __a
2570	/// A 128-bit integer vector containing the minuends.
2571	/// \param __b
2572	/// A 128-bit integer vector containing the subtrahends.
2573	/// \returns A 128-bit integer vector containing the unsigned integer
2574	/// differences of the values in the operands.
2575	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2576	__m128i __b) {
2577	return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2578	}
2579
2580	/// Subtracts, with saturation, corresponding 16-bit unsigned integer values
2581	/// in the input and returns the differences in the corresponding 16-bit
2582	/// elements of the destination. Differences less than 0x0000 are saturated to 0x0000.
2583	///
2584	/// \headerfile <x86intrin.h>
2585	///
2586	/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2587	///
2588	/// \param __a
2589	/// A 128-bit integer vector containing the minuends.
2590	/// \param __b
2591	/// A 128-bit integer vector containing the subtrahends.
2592	/// \returns A 128-bit integer vector containing the unsigned integer
2593	/// differences of the values in the operands.
2594	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2595	__m128i __b) {
2596	return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2597	}
2598
2599	/// Performs a bitwise AND of two 128-bit integer vectors.
2600	///
2601	/// \headerfile <x86intrin.h>
2602	///
2603	/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2604	///
2605	/// \param __a
2606	/// A 128-bit integer vector containing the first source operand.
2607	/// \param __b
2608	/// A 128-bit integer vector containing the second source operand.
2609	/// \returns A 128-bit integer vector containing the bitwise AND of the values
2610	/// in both operands.
2611	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2612	__m128i __b) {
2613	return (__m128i)((__v2du)__a & (__v2du)__b);
2614	}
2615
2616	/// Performs a bitwise AND of two 128-bit integer vectors, using the
2617	/// one's complement of the values contained in the first source operand.
2618	///
2619	/// \headerfile <x86intrin.h>
2620	///
2621	/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2622	///
2623	/// \param __a
2624	/// A 128-bit integer vector containing the left source operand. The one's
2625	/// complement of this value is used in the bitwise AND.
2626	/// \param __b
2627	/// A 128-bit integer vector containing the right source operand.
2628	/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2629	/// complement of the first operand and the values in the second operand.
2630	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2631	__m128i __b) {
2632	return (__m128i)(~(__v2du)__a & (__v2du)__b);
2633	}
2634	/// Performs a bitwise OR of two 128-bit integer vectors.
2635	///
2636	/// \headerfile <x86intrin.h>
2637	///
2638	/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2639	///
2640	/// \param __a
2641	/// A 128-bit integer vector containing the first source operand.
2642	/// \param __b
2643	/// A 128-bit integer vector containing the second source operand.
2644	/// \returns A 128-bit integer vector containing the bitwise OR of the values
2645	/// in both operands.
2646	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2647	__m128i __b) {
2648	return (__m128i)((__v2du)__a | (__v2du)__b);
2649	}
2650
2651	/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2652	///
2653	/// \headerfile <x86intrin.h>
2654	///
2655	/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2656	///
2657	/// \param __a
2658	/// A 128-bit integer vector containing the first source operand.
2659	/// \param __b
2660	/// A 128-bit integer vector containing the second source operand.
2661	/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2662	/// values in both operands.
2663	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2664	__m128i __b) {
2665	return (__m128i)((__v2du)__a ^ (__v2du)__b);
2666	}
2667
2668	/// Left-shifts the 128-bit integer vector operand by the specified
2669	/// number of bytes. The vacated low-order bytes are cleared.
2670	///
2671	/// \headerfile <x86intrin.h>
2672	///
2673	/// \code
2674	/// __m128i _mm_slli_si128(__m128i a, const int imm);
2675	/// \endcode
2676	///
2677	/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2678	///
2679	/// \param a
2680	/// A 128-bit integer vector containing the source operand.
2681	/// \param imm
2682	/// An immediate value specifying the number of bytes to left-shift operand
2683	/// \a a.
2684	/// \returns A 128-bit integer vector containing the left-shifted value.
2685	#define _mm_slli_si128(a, imm) \
2686	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2687	(int)(imm)))
2688
/// Alias of _mm_slli_si128: expands to the same
/// __builtin_ia32_pslldqi128_byteshift call on \a a and \a imm.
2689	#define _mm_bslli_si128(a, imm) \
2690	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2691	(int)(imm)))
2692
2693	/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2694	/// by the number of bits given in the integer \a __count. Low-order bits are cleared.
2695	///
2696	/// \headerfile <x86intrin.h>
2697	///
2698	/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2699	///
2700	/// \param __a
2701	/// A 128-bit integer vector containing the source operand.
2702	/// \param __count
2703	/// An integer value specifying the number of bits to left-shift each value
2704	/// in operand \a __a.
2705	/// \returns A 128-bit integer vector containing the left-shifted values.
2706	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2707	int __count) {
2708	return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2709	}
2710
2711	/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2712	/// by the number of bits given in the vector \a __count. Low-order bits are cleared.
2713	///
2714	/// \headerfile <x86intrin.h>
2715	///
2716	/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2717	///
2718	/// \param __a
2719	/// A 128-bit integer vector containing the source operand.
2720	/// \param __count
2721	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2722	/// to left-shift each value in operand \a __a.
2723	/// \returns A 128-bit integer vector containing the left-shifted values.
2724	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2725	__m128i __count) {
2726	return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2727	}
2728
2729	/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2730	/// by the number of bits given in the integer \a __count. Low-order bits are cleared.
2731	///
2732	/// \headerfile <x86intrin.h>
2733	///
2734	/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2735	///
2736	/// \param __a
2737	/// A 128-bit integer vector containing the source operand.
2738	/// \param __count
2739	/// An integer value specifying the number of bits to left-shift each value
2740	/// in operand \a __a.
2741	/// \returns A 128-bit integer vector containing the left-shifted values.
2742	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2743	int __count) {
2744	return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2745	}
2746
2747	/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2748	/// by the number of bits given in the vector \a __count. Low-order bits are cleared.
2749	///
2750	/// \headerfile <x86intrin.h>
2751	///
2752	/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2753	///
2754	/// \param __a
2755	/// A 128-bit integer vector containing the source operand.
2756	/// \param __count
2757	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2758	/// to left-shift each value in operand \a __a.
2759	/// \returns A 128-bit integer vector containing the left-shifted values.
2760	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2761	__m128i __count) {
2762	return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2763	}
2764
2765	/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2766	/// by the number of bits given in the integer \a __count. Low-order bits are cleared.
2767	///
2768	/// \headerfile <x86intrin.h>
2769	///
2770	/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2771	///
2772	/// \param __a
2773	/// A 128-bit integer vector containing the source operand.
2774	/// \param __count
2775	/// An integer value specifying the number of bits to left-shift each value
2776	/// in operand \a __a.
2777	/// \returns A 128-bit integer vector containing the left-shifted values.
2778	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2779	int __count) {
2780	return __builtin_ia32_psllqi128((__v2di)__a, __count);
2781	}
2782
2783	/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2784	/// by the number of bits given in the vector \a __count. Low-order bits are cleared.
2785	///
2786	/// \headerfile <x86intrin.h>
2787	///
2788	/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2789	///
2790	/// \param __a
2791	/// A 128-bit integer vector containing the source operand.
2792	/// \param __count
2793	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2794	/// to left-shift each value in operand \a __a.
2795	/// \returns A 128-bit integer vector containing the left-shifted values.
2796	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2797	__m128i __count) {
2798	return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2799	}
2800
2801	/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2802	/// by the number of bits given in the integer \a __count. High-order bits
2803	/// are filled with the sign bit of the initial value.
2804	///
2805	/// \headerfile <x86intrin.h>
2806	///
2807	/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2808	///
2809	/// \param __a
2810	/// A 128-bit integer vector containing the source operand.
2811	/// \param __count
2812	/// An integer value specifying the number of bits to right-shift each value
2813	/// in operand \a __a.
2814	/// \returns A 128-bit integer vector containing the right-shifted values.
2815	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2816	int __count) {
2817	return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2818	}
2819
2820	/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2821	/// by the number of bits given in the vector \a __count. High-order bits
2822	/// are filled with the sign bit of the initial value.
2823	///
2824	/// \headerfile <x86intrin.h>
2825	///
2826	/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2827	///
2828	/// \param __a
2829	/// A 128-bit integer vector containing the source operand.
2830	/// \param __count
2831	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2832	/// to right-shift each value in operand \a __a.
2833	/// \returns A 128-bit integer vector containing the right-shifted values.
2834	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2835	__m128i __count) {
2836	return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2837	}
2838
2839	/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2840	/// by the number of bits given in the integer \a __count. High-order bits
2841	/// are filled with the sign bit of the initial value.
2842	///
2843	/// \headerfile <x86intrin.h>
2844	///
2845	/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2846	///
2847	/// \param __a
2848	/// A 128-bit integer vector containing the source operand.
2849	/// \param __count
2850	/// An integer value specifying the number of bits to right-shift each value
2851	/// in operand \a __a.
2852	/// \returns A 128-bit integer vector containing the right-shifted values.
2853	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2854	int __count) {
2855	return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2856	}
2857
2858	/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2859	/// by the number of bits given in the vector \a __count. High-order bits
2860	/// are filled with the sign bit of the initial value.
2861	///
2862	/// \headerfile <x86intrin.h>
2863	///
2864	/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2865	///
2866	/// \param __a
2867	/// A 128-bit integer vector containing the source operand.
2868	/// \param __count
2869	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2870	/// to right-shift each value in operand \a __a.
2871	/// \returns A 128-bit integer vector containing the right-shifted values.
2872	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2873	__m128i __count) {
2874	return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2875	}
2876
2877	/// Right-shifts the 128-bit integer vector operand by the specified
2878	/// number of bytes. The vacated high-order bytes are cleared.
2879	///
2880	/// \headerfile <x86intrin.h>
2881	///
2882	/// \code
2883	/// __m128i _mm_srli_si128(__m128i a, const int imm);
2884	/// \endcode
2885	///
2886	/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2887	///
2888	/// \param a
2889	/// A 128-bit integer vector containing the source operand.
2890	/// \param imm
2891	/// An immediate value specifying the number of bytes to right-shift operand
2892	/// \a a.
2893	/// \returns A 128-bit integer vector containing the right-shifted value.
2894	#define _mm_srli_si128(a, imm) \
2895	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2896	(int)(imm)))
2897
/// Alias of _mm_srli_si128: expands to the same
/// __builtin_ia32_psrldqi128_byteshift call on \a a and \a imm.
2898	#define _mm_bsrli_si128(a, imm) \
2899	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2900	(int)(imm)))
2901
2902	/// Right-shifts each of the 16-bit values in the 128-bit integer vector
2903	/// operand by the number of bits given in the integer \a __count. High-order bits are cleared.
2904	///
2905	/// \headerfile <x86intrin.h>
2906	///
2907	/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2908	///
2909	/// \param __a
2910	/// A 128-bit integer vector containing the source operand.
2911	/// \param __count
2912	/// An integer value specifying the number of bits to right-shift each value
2913	/// in operand \a __a.
2914	/// \returns A 128-bit integer vector containing the right-shifted values.
2915	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2916	int __count) {
2917	return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2918	}
2919
2920	/// Right-shifts each of the 16-bit values in the 128-bit integer vector
2921	/// operand by the number of bits given in the vector \a __count. High-order bits are cleared.
2922	///
2923	/// \headerfile <x86intrin.h>
2924	///
2925	/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2926	///
2927	/// \param __a
2928	/// A 128-bit integer vector containing the source operand.
2929	/// \param __count
2930	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2931	/// to right-shift each value in operand \a __a.
2932	/// \returns A 128-bit integer vector containing the right-shifted values.
2933	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2934	__m128i __count) {
2935	return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2936	}
2937
2938	/// Right-shifts each of the 32-bit values in the 128-bit integer vector
2939	/// operand by the number of bits given in the integer \a __count. High-order bits are cleared.
2940	///
2941	/// \headerfile <x86intrin.h>
2942	///
2943	/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2944	///
2945	/// \param __a
2946	/// A 128-bit integer vector containing the source operand.
2947	/// \param __count
2948	/// An integer value specifying the number of bits to right-shift each value
2949	/// in operand \a __a.
2950	/// \returns A 128-bit integer vector containing the right-shifted values.
2951	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2952	int __count) {
2953	return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2954	}
2955
2956	/// Right-shifts each of the 32-bit values in the 128-bit integer vector
2957	/// operand by the number of bits given in the vector \a __count. High-order bits are cleared.
2958	///
2959	/// \headerfile <x86intrin.h>
2960	///
2961	/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2962	///
2963	/// \param __a
2964	/// A 128-bit integer vector containing the source operand.
2965	/// \param __count
2966	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
2967	/// to right-shift each value in operand \a __a.
2968	/// \returns A 128-bit integer vector containing the right-shifted values.
2969	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2970	__m128i __count) {
2971	return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2972	}
2973
2974	/// Right-shifts each of 64-bit values in the 128-bit integer vector
2975	/// operand by the specified number of bits. High-order bits are cleared.
2976	///
2977	/// \headerfile <x86intrin.h>
2978	///
2979	/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2980	///
2981	/// \param __a
2982	/// A 128-bit integer vector containing the source operand.
2983	/// \param __count
2984	/// An integer value specifying the number of bits to right-shift each value
2985	/// in operand \a __a.
2986	/// \returns A 128-bit integer vector containing the right-shifted values.
2987	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2988	int __count) {
2989	return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2990	}
2991
2992	/// Right-shifts each of 64-bit values in the 128-bit integer vector
2993	/// operand by the specified number of bits. High-order bits are cleared.
2994	///
2995	/// \headerfile <x86intrin.h>
2996	///
2997	/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2998	///
2999	/// \param __a
3000	/// A 128-bit integer vector containing the source operand.
3001	/// \param __count
3002	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
3003	/// to right-shift each value in operand \a __a.
3004	/// \returns A 128-bit integer vector containing the right-shifted values.
3005	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3006	__m128i __count) {
3007	return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3008	}
3009
3010	/// Compares each of the corresponding 8-bit values of the 128-bit
3011	/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3012	/// for true.
3013	///
3014	/// \headerfile <x86intrin.h>
3015	///
3016	/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3017	///
3018	/// \param __a
3019	/// A 128-bit integer vector.
3020	/// \param __b
3021	/// A 128-bit integer vector.
3022	/// \returns A 128-bit integer vector containing the comparison results.
3023	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3024	__m128i __b) {
3025	return (__m128i)((__v16qi)__a == (__v16qi)__b);
3026	}
3027
3028	/// Compares each of the corresponding 16-bit values of the 128-bit
3029	/// integer vectors for equality. Each comparison yields 0x0 for false,
3030	/// 0xFFFF for true.
3031	///
3032	/// \headerfile <x86intrin.h>
3033	///
3034	/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3035	///
3036	/// \param __a
3037	/// A 128-bit integer vector.
3038	/// \param __b
3039	/// A 128-bit integer vector.
3040	/// \returns A 128-bit integer vector containing the comparison results.
3041	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3042	__m128i __b) {
3043	return (__m128i)((__v8hi)__a == (__v8hi)__b);
3044	}
3045
3046	/// Compares each of the corresponding 32-bit values of the 128-bit
3047	/// integer vectors for equality. Each comparison yields 0x0 for false,
3048	/// 0xFFFFFFFF for true.
3049	///
3050	/// \headerfile <x86intrin.h>
3051	///
3052	/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3053	///
3054	/// \param __a
3055	/// A 128-bit integer vector.
3056	/// \param __b
3057	/// A 128-bit integer vector.
3058	/// \returns A 128-bit integer vector containing the comparison results.
3059	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3060	__m128i __b) {
3061	return (__m128i)((__v4si)__a == (__v4si)__b);
3062	}
3063
3064	/// Compares each of the corresponding signed 8-bit values of the 128-bit
3065	/// integer vectors to determine if the values in the first operand are
3066	/// greater than those in the second operand. Each comparison yields 0x0 for
3067	/// false, 0xFF for true.
3068	///
3069	/// \headerfile <x86intrin.h>
3070	///
3071	/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3072	///
3073	/// \param __a
3074	/// A 128-bit integer vector.
3075	/// \param __b
3076	/// A 128-bit integer vector.
3077	/// \returns A 128-bit integer vector containing the comparison results.
3078	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3079	__m128i __b) {
3080	/* This function always performs a signed comparison, but __v16qi is a char
3081	which may be signed or unsigned, so use __v16qs. */
3082	return (__m128i)((__v16qs)__a > (__v16qs)__b);
3083	}
3084
3085	/// Compares each of the corresponding signed 16-bit values of the
3086	/// 128-bit integer vectors to determine if the values in the first operand
3087	/// are greater than those in the second operand.
3088	///
3089	/// Each comparison yields 0x0 for false, 0xFFFF for true.
3090	///
3091	/// \headerfile <x86intrin.h>
3092	///
3093	/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3094	///
3095	/// \param __a
3096	/// A 128-bit integer vector.
3097	/// \param __b
3098	/// A 128-bit integer vector.
3099	/// \returns A 128-bit integer vector containing the comparison results.
3100	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3101	__m128i __b) {
3102	return (__m128i)((__v8hi)__a > (__v8hi)__b);
3103	}
3104
3105	/// Compares each of the corresponding signed 32-bit values of the
3106	/// 128-bit integer vectors to determine if the values in the first operand
3107	/// are greater than those in the second operand.
3108	///
3109	/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3110	///
3111	/// \headerfile <x86intrin.h>
3112	///
3113	/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3114	///
3115	/// \param __a
3116	/// A 128-bit integer vector.
3117	/// \param __b
3118	/// A 128-bit integer vector.
3119	/// \returns A 128-bit integer vector containing the comparison results.
3120	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3121	__m128i __b) {
3122	return (__m128i)((__v4si)__a > (__v4si)__b);
3123	}
3124
3125	/// Compares each of the corresponding signed 8-bit values of the 128-bit
3126	/// integer vectors to determine if the values in the first operand are less
3127	/// than those in the second operand.
3128	///
3129	/// Each comparison yields 0x0 for false, 0xFF for true.
3130	///
3131	/// \headerfile <x86intrin.h>
3132	///
3133	/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3134	///
3135	/// \param __a
3136	/// A 128-bit integer vector.
3137	/// \param __b
3138	/// A 128-bit integer vector.
3139	/// \returns A 128-bit integer vector containing the comparison results.
3140	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3141	__m128i __b) {
3142	return _mm_cmpgt_epi8(__b, __a);
3143	}
3144
3145	/// Compares each of the corresponding signed 16-bit values of the
3146	/// 128-bit integer vectors to determine if the values in the first operand
3147	/// are less than those in the second operand.
3148	///
3149	/// Each comparison yields 0x0 for false, 0xFFFF for true.
3150	///
3151	/// \headerfile <x86intrin.h>
3152	///
3153	/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3154	///
3155	/// \param __a
3156	/// A 128-bit integer vector.
3157	/// \param __b
3158	/// A 128-bit integer vector.
3159	/// \returns A 128-bit integer vector containing the comparison results.
3160	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3161	__m128i __b) {
3162	return _mm_cmpgt_epi16(__b, __a);
3163	}
3164
3165	/// Compares each of the corresponding signed 32-bit values of the
3166	/// 128-bit integer vectors to determine if the values in the first operand
3167	/// are less than those in the second operand.
3168	///
3169	/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3170	///
3171	/// \headerfile <x86intrin.h>
3172	///
3173	/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3174	///
3175	/// \param __a
3176	/// A 128-bit integer vector.
3177	/// \param __b
3178	/// A 128-bit integer vector.
3179	/// \returns A 128-bit integer vector containing the comparison results.
3180	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3181	__m128i __b) {
3182	return _mm_cmpgt_epi32(__b, __a);
3183	}
3184
3185	#ifdef __x86_64__
3186	/// Converts a 64-bit signed integer value from the second operand into a
3187	/// double-precision value and returns it in the lower element of a [2 x
3188	/// double] vector; the upper element of the returned vector is copied from
3189	/// the upper element of the first operand.
3190	///
3191	/// \headerfile <x86intrin.h>
3192	///
3193	/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3194	///
3195	/// \param __a
3196	/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3197	/// copied to the upper 64 bits of the destination.
3198	/// \param __b
3199	/// A 64-bit signed integer operand containing the value to be converted.
3200	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3201	/// converted value of the second operand. The upper 64 bits are copied from
3202	/// the upper 64 bits of the first operand.
3203	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3204	long long __b) {
3205	__a[0] = __b;
3206	return __a;
3207	}
3208
3209	/// Converts the first (lower) element of a vector of [2 x double] into a
3210	/// 64-bit signed integer value, according to the current rounding mode.
3211	///
3212	/// \headerfile <x86intrin.h>
3213	///
3214	/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3215	///
3216	/// \param __a
3217	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3218	/// conversion.
3219	/// \returns A 64-bit signed integer containing the converted value.
3220	static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3221	return __builtin_ia32_cvtsd2si64((__v2df)__a);
3222	}
3223
3224	/// Converts the first (lower) element of a vector of [2 x double] into a
3225	/// 64-bit signed integer value, truncating the result when it is inexact.
3226	///
3227	/// \headerfile <x86intrin.h>
3228	///
3229	/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3230	/// instruction.
3231	///
3232	/// \param __a
3233	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3234	/// conversion.
3235	/// \returns A 64-bit signed integer containing the converted value.
3236	static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3237	return __builtin_ia32_cvttsd2si64((__v2df)__a);
3238	}
3239	#endif
3240
3241	/// Converts a vector of [4 x i32] into a vector of [4 x float].
3242	///
3243	/// \headerfile <x86intrin.h>
3244	///
3245	/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3246	///
3247	/// \param __a
3248	/// A 128-bit integer vector.
3249	/// \returns A 128-bit vector of [4 x float] containing the converted values.
3250	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3251	return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3252	}
3253
3254	/// Converts a vector of [4 x float] into a vector of [4 x i32].
3255	///
3256	/// \headerfile <x86intrin.h>
3257	///
3258	/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3259	///
3260	/// \param __a
3261	/// A 128-bit vector of [4 x float].
3262	/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3263	/// values.
3264	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3265	return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3266	}
3267
3268	/// Converts a vector of [4 x float] into a vector of [4 x i32],
3269	/// truncating the result when it is inexact.
3270	///
3271	/// \headerfile <x86intrin.h>
3272	///
3273	/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3274	/// instruction.
3275	///
3276	/// \param __a
3277	/// A 128-bit vector of [4 x float].
3278	/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3279	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3280	return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3281	}
3282
3283	/// Returns a vector of [4 x i32] where the lowest element is the input
3284	/// operand and the remaining elements are zero.
3285	///
3286	/// \headerfile <x86intrin.h>
3287	///
3288	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3289	///
3290	/// \param __a
3291	/// A 32-bit signed integer operand.
3292	/// \returns A 128-bit vector of [4 x i32].
3293	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3294	return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3295	}
3296
3297	/// Returns a vector of [2 x i64] where the lower element is the input
3298	/// operand and the upper element is zero.
3299	///
3300	/// \headerfile <x86intrin.h>
3301	///
3302	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3303	/// in 64-bit mode.
3304	///
3305	/// \param __a
3306	/// A 64-bit signed integer operand containing the value to be converted.
3307	/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3308	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3309	return __extension__(__m128i)(__v2di){__a, 0};
3310	}
3311
3312	/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3313	/// 32-bit signed integer value.
3314	///
3315	/// \headerfile <x86intrin.h>
3316	///
3317	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3318	///
3319	/// \param __a
3320	/// A vector of [4 x i32]. The least significant 32 bits are moved to the
3321	/// destination.
3322	/// \returns A 32-bit signed integer containing the moved value.
3323	static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3324	__v4si __b = (__v4si)__a;
3325	return __b[0];
3326	}
3327
3328	/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3329	/// 64-bit signed integer value.
3330	///
3331	/// \headerfile <x86intrin.h>
3332	///
3333	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3334	///
3335	/// \param __a
3336	/// A vector of [2 x i64]. The least significant 64 bits are moved to the
3337	/// destination.
3338	/// \returns A 64-bit signed integer containing the moved value.
3339	static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3340	return __a[0];
3341	}
3342
3343	/// Moves packed integer values from an aligned 128-bit memory location
3344	/// to elements in a 128-bit integer vector.
3345	///
3346	/// \headerfile <x86intrin.h>
3347	///
3348	/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3349	///
3350	/// \param __p
3351	/// An aligned pointer to a memory location containing integer values.
3352	/// \returns A 128-bit integer vector containing the moved values.
3353	static __inline__ __m128i __DEFAULT_FN_ATTRS
3354	_mm_load_si128(__m128i const *__p) {
3355	return *__p;
3356	}
3357
3358	/// Moves packed integer values from an unaligned 128-bit memory location
3359	/// to elements in a 128-bit integer vector.
3360	///
3361	/// \headerfile <x86intrin.h>
3362	///
3363	/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3364	///
3365	/// \param __p
3366	/// A pointer to a memory location containing integer values.
3367	/// \returns A 128-bit integer vector containing the moved values.
3368	static __inline__ __m128i __DEFAULT_FN_ATTRS
3369	_mm_loadu_si128(__m128i_u const *__p) {
3370	struct __loadu_si128 {
3371	__m128i_u __v;
3372	} __attribute__((__packed__, __may_alias__));
3373	return ((const struct __loadu_si128 *)__p)->__v;
3374	}
3375
3376	/// Returns a vector of [2 x i64] where the lower element is taken from
3377	/// the lower element of the operand, and the upper element is zero.
3378	///
3379	/// \headerfile <x86intrin.h>
3380	///
3381	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3382	///
3383	/// \param __p
3384	/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3385	/// the destination.
3386	/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3387	/// moved value. The higher order bits are cleared.
3388	static __inline__ __m128i __DEFAULT_FN_ATTRS
3389	_mm_loadl_epi64(__m128i_u const *__p) {
3390	struct __mm_loadl_epi64_struct {
3391	long long __u;
3392	} __attribute__((__packed__, __may_alias__));
3393	return __extension__(__m128i){
3394	((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3395	}
3396
3397	/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3398	/// This could be used as an argument to another intrinsic function where the
3399	/// argument is required but the value is not actually used.
3400	///
3401	/// \headerfile <x86intrin.h>
3402	///
3403	/// This intrinsic has no corresponding instruction.
3404	///
3405	/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3406	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3407	return (__m128i)__builtin_ia32_undef128();
3408	}
3409
3410	/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3411	/// the specified 64-bit integer values.
3412	///
3413	/// \headerfile <x86intrin.h>
3414	///
3415	/// This intrinsic is a utility function and does not correspond to a specific
3416	/// instruction.
3417	///
3418	/// \param __q1
3419	/// A 64-bit integer value used to initialize the upper 64 bits of the
3420	/// destination vector of [2 x i64].
3421	/// \param __q0
3422	/// A 64-bit integer value used to initialize the lower 64 bits of the
3423	/// destination vector of [2 x i64].
3424	/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3425	/// provided in the operands.
3426	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3427	long long __q0) {
3428	return __extension__(__m128i)(__v2di){__q0, __q1};
3429	}
3430
3431	/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3432	/// the specified 64-bit integer values.
3433	///
3434	/// \headerfile <x86intrin.h>
3435	///
3436	/// This intrinsic is a utility function and does not correspond to a specific
3437	/// instruction.
3438	///
3439	/// \param __q1
3440	/// A 64-bit integer value used to initialize the upper 64 bits of the
3441	/// destination vector of [2 x i64].
3442	/// \param __q0
3443	/// A 64-bit integer value used to initialize the lower 64 bits of the
3444	/// destination vector of [2 x i64].
3445	/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3446	/// provided in the operands.
3447	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3448	__m64 __q0) {
3449	return _mm_set_epi64x((long long)__q1, (long long)__q0);
3450	}
3451
3452	/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3453	/// the specified 32-bit integer values.
3454	///
3455	/// \headerfile <x86intrin.h>
3456	///
3457	/// This intrinsic is a utility function and does not correspond to a specific
3458	/// instruction.
3459	///
3460	/// \param __i3
3461	/// A 32-bit integer value used to initialize bits [127:96] of the
3462	/// destination vector.
3463	/// \param __i2
3464	/// A 32-bit integer value used to initialize bits [95:64] of the destination
3465	/// vector.
3466	/// \param __i1
3467	/// A 32-bit integer value used to initialize bits [63:32] of the destination
3468	/// vector.
3469	/// \param __i0
3470	/// A 32-bit integer value used to initialize bits [31:0] of the destination
3471	/// vector.
3472	/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3473	/// provided in the operands.
3474	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3475	int __i1, int __i0) {
3476	return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3477	}
3478
3479	/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3480	/// the specified 16-bit integer values.
3481	///
3482	/// \headerfile <x86intrin.h>
3483	///
3484	/// This intrinsic is a utility function and does not correspond to a specific
3485	/// instruction.
3486	///
3487	/// \param __w7
3488	/// A 16-bit integer value used to initialize bits [127:112] of the
3489	/// destination vector.
3490	/// \param __w6
3491	/// A 16-bit integer value used to initialize bits [111:96] of the
3492	/// destination vector.
3493	/// \param __w5
3494	/// A 16-bit integer value used to initialize bits [95:80] of the destination
3495	/// vector.
3496	/// \param __w4
3497	/// A 16-bit integer value used to initialize bits [79:64] of the destination
3498	/// vector.
3499	/// \param __w3
3500	/// A 16-bit integer value used to initialize bits [63:48] of the destination
3501	/// vector.
3502	/// \param __w2
3503	/// A 16-bit integer value used to initialize bits [47:32] of the destination
3504	/// vector.
3505	/// \param __w1
3506	/// A 16-bit integer value used to initialize bits [31:16] of the destination
3507	/// vector.
3508	/// \param __w0
3509	/// A 16-bit integer value used to initialize bits [15:0] of the destination
3510	/// vector.
3511	/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3512	/// provided in the operands.
3513	static __inline__ __m128i __DEFAULT_FN_ATTRS
3514	_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3515	short __w2, short __w1, short __w0) {
3516	return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3517	__w4, __w5, __w6, __w7};
3518	}
3519
3520	/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3521	/// the specified 8-bit integer values.
3522	///
3523	/// \headerfile <x86intrin.h>
3524	///
3525	/// This intrinsic is a utility function and does not correspond to a specific
3526	/// instruction.
3527	///
3528	/// \param __b15
3529	/// Initializes bits [127:120] of the destination vector.
3530	/// \param __b14
3531	/// Initializes bits [119:112] of the destination vector.
3532	/// \param __b13
3533	/// Initializes bits [111:104] of the destination vector.
3534	/// \param __b12
3535	/// Initializes bits [103:96] of the destination vector.
3536	/// \param __b11
3537	/// Initializes bits [95:88] of the destination vector.
3538	/// \param __b10
3539	/// Initializes bits [87:80] of the destination vector.
3540	/// \param __b9
3541	/// Initializes bits [79:72] of the destination vector.
3542	/// \param __b8
3543	/// Initializes bits [71:64] of the destination vector.
3544	/// \param __b7
3545	/// Initializes bits [63:56] of the destination vector.
3546	/// \param __b6
3547	/// Initializes bits [55:48] of the destination vector.
3548	/// \param __b5
3549	/// Initializes bits [47:40] of the destination vector.
3550	/// \param __b4
3551	/// Initializes bits [39:32] of the destination vector.
3552	/// \param __b3
3553	/// Initializes bits [31:24] of the destination vector.
3554	/// \param __b2
3555	/// Initializes bits [23:16] of the destination vector.
3556	/// \param __b1
3557	/// Initializes bits [15:8] of the destination vector.
3558	/// \param __b0
3559	/// Initializes bits [7:0] of the destination vector.
3560	/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3561	/// provided in the operands.
3562	static __inline__ __m128i __DEFAULT_FN_ATTRS
3563	_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3564	char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3565	char __b4, char __b3, char __b2, char __b1, char __b0) {
3566	return __extension__(__m128i)(__v16qi){
3567	__b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3568	__b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3569	}
3570
3571	/// Initializes both values in a 128-bit integer vector with the
3572	/// specified 64-bit integer value.
3573	///
3574	/// \headerfile <x86intrin.h>
3575	///
3576	/// This intrinsic is a utility function and does not correspond to a specific
3577	/// instruction.
3578	///
3579	/// \param __q
3580	/// Integer value used to initialize the elements of the destination integer
3581	/// vector.
3582	/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3583	/// elements containing the value provided in the operand.
3584	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3585	return _mm_set_epi64x(__q, __q);
3586	}
3587
3588	/// Initializes both values in a 128-bit vector of [2 x i64] with the
3589	/// specified 64-bit value.
3590	///
3591	/// \headerfile <x86intrin.h>
3592	///
3593	/// This intrinsic is a utility function and does not correspond to a specific
3594	/// instruction.
3595	///
3596	/// \param __q
3597	/// A 64-bit value used to initialize the elements of the destination integer
3598	/// vector.
3599	/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3600	/// containing the value provided in the operand.
3601	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3602	return _mm_set_epi64(__q, __q);
3603	}
3604
3605	/// Initializes all values in a 128-bit vector of [4 x i32] with the
3606	/// specified 32-bit value.
3607	///
3608	/// \headerfile <x86intrin.h>
3609	///
3610	/// This intrinsic is a utility function and does not correspond to a specific
3611	/// instruction.
3612	///
3613	/// \param __i
3614	/// A 32-bit value used to initialize the elements of the destination integer
3615	/// vector.
3616	/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3617	/// containing the value provided in the operand.
3618	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3619	return _mm_set_epi32(__i, __i, __i, __i);
3620	}
3621
3622	/// Initializes all values in a 128-bit vector of [8 x i16] with the
3623	/// specified 16-bit value.
3624	///
3625	/// \headerfile <x86intrin.h>
3626	///
3627	/// This intrinsic is a utility function and does not correspond to a specific
3628	/// instruction.
3629	///
3630	/// \param __w
3631	/// A 16-bit value used to initialize the elements of the destination integer
3632	/// vector.
3633	/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3634	/// containing the value provided in the operand.
3635	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3636	return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3637	}
3638
3639	/// Initializes all values in a 128-bit vector of [16 x i8] with the
3640	/// specified 8-bit value.
3641	///
3642	/// \headerfile <x86intrin.h>
3643	///
3644	/// This intrinsic is a utility function and does not correspond to a specific
3645	/// instruction.
3646	///
3647	/// \param __b
3648	/// An 8-bit value used to initialize the elements of the destination integer
3649	/// vector.
3650	/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3651	/// containing the value provided in the operand.
3652	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3653	return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3654	__b, __b, __b, __b, __b);
3655	}
3656
3657	/// Constructs a 128-bit integer vector, initialized in reverse order
3658	/// with the specified 64-bit integral values.
3659	///
3660	/// \headerfile <x86intrin.h>
3661	///
3662	/// This intrinsic does not correspond to a specific instruction.
3663	///
3664	/// \param __q0
3665	/// A 64-bit integral value used to initialize the lower 64 bits of the
3666	/// result.
3667	/// \param __q1
3668	/// A 64-bit integral value used to initialize the upper 64 bits of the
3669	/// result.
3670	/// \returns An initialized 128-bit integer vector.
3671	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3672	__m64 __q1) {
3673	return _mm_set_epi64(__q1, __q0);
3674	}
3675
3676	/// Constructs a 128-bit integer vector, initialized in reverse order
3677	/// with the specified 32-bit integral values.
3678	///
3679	/// \headerfile <x86intrin.h>
3680	///
3681	/// This intrinsic is a utility function and does not correspond to a specific
3682	/// instruction.
3683	///
3684	/// \param __i0
3685	/// A 32-bit integral value used to initialize bits [31:0] of the result.
3686	/// \param __i1
3687	/// A 32-bit integral value used to initialize bits [63:32] of the result.
3688	/// \param __i2
3689	/// A 32-bit integral value used to initialize bits [95:64] of the result.
3690	/// \param __i3
3691	/// A 32-bit integral value used to initialize bits [127:96] of the result.
3692	/// \returns An initialized 128-bit integer vector.
3693	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3694	int __i2,
3695	int __i3) {
3696	return _mm_set_epi32(__i3, __i2, __i1, __i0);
3697	}
3698
3699	/// Constructs a 128-bit integer vector, initialized in reverse order
3700	/// with the specified 16-bit integral values.
3701	///
3702	/// \headerfile <x86intrin.h>
3703	///
3704	/// This intrinsic is a utility function and does not correspond to a specific
3705	/// instruction.
3706	///
3707	/// \param __w0
3708	/// A 16-bit integral value used to initialize bits [15:0] of the result.
3709	/// \param __w1
3710	/// A 16-bit integral value used to initialize bits [31:16] of the result.
3711	/// \param __w2
3712	/// A 16-bit integral value used to initialize bits [47:32] of the result.
3713	/// \param __w3
3714	/// A 16-bit integral value used to initialize bits [63:48] of the result.
3715	/// \param __w4
3716	/// A 16-bit integral value used to initialize bits [79:64] of the result.
3717	/// \param __w5
3718	/// A 16-bit integral value used to initialize bits [95:80] of the result.
3719	/// \param __w6
3720	/// A 16-bit integral value used to initialize bits [111:96] of the result.
3721	/// \param __w7
3722	/// A 16-bit integral value used to initialize bits [127:112] of the result.
3723	/// \returns An initialized 128-bit integer vector.
3724	static __inline__ __m128i __DEFAULT_FN_ATTRS
3725	_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3726	short __w5, short __w6, short __w7) {
3727	return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3728	}
3729
3730	/// Constructs a 128-bit integer vector, initialized in reverse order
3731	/// with the specified 8-bit integral values.
3732	///
3733	/// \headerfile <x86intrin.h>
3734	///
3735	/// This intrinsic is a utility function and does not correspond to a specific
3736	/// instruction.
3737	///
3738	/// \param __b0
3739	/// An 8-bit integral value used to initialize bits [7:0] of the result.
3740	/// \param __b1
3741	/// An 8-bit integral value used to initialize bits [15:8] of the result.
3742	/// \param __b2
3743	/// An 8-bit integral value used to initialize bits [23:16] of the result.
3744	/// \param __b3
3745	/// An 8-bit integral value used to initialize bits [31:24] of the result.
3746	/// \param __b4
3747	/// An 8-bit integral value used to initialize bits [39:32] of the result.
3748	/// \param __b5
3749	/// An 8-bit integral value used to initialize bits [47:40] of the result.
3750	/// \param __b6
3751	/// An 8-bit integral value used to initialize bits [55:48] of the result.
3752	/// \param __b7
3753	/// An 8-bit integral value used to initialize bits [63:56] of the result.
3754	/// \param __b8
3755	/// An 8-bit integral value used to initialize bits [71:64] of the result.
3756	/// \param __b9
3757	/// An 8-bit integral value used to initialize bits [79:72] of the result.
3758	/// \param __b10
3759	/// An 8-bit integral value used to initialize bits [87:80] of the result.
3760	/// \param __b11
3761	/// An 8-bit integral value used to initialize bits [95:88] of the result.
3762	/// \param __b12
3763	/// An 8-bit integral value used to initialize bits [103:96] of the result.
3764	/// \param __b13
3765	/// An 8-bit integral value used to initialize bits [111:104] of the result.
3766	/// \param __b14
3767	/// An 8-bit integral value used to initialize bits [119:112] of the result.
3768	/// \param __b15
3769	/// An 8-bit integral value used to initialize bits [127:120] of the result.
3770	/// \returns An initialized 128-bit integer vector.
3771	static __inline__ __m128i __DEFAULT_FN_ATTRS
3772	_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3773	char __b6, char __b7, char __b8, char __b9, char __b10,
3774	char __b11, char __b12, char __b13, char __b14, char __b15) {
3775	return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3776	__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3777	}
3778
3779	/// Creates a 128-bit integer vector initialized to zero.
3780	///
3781	/// \headerfile <x86intrin.h>
3782	///
3783	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3784	///
3785	/// \returns An initialized 128-bit integer vector with all elements set to
3786	/// zero.
3787	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3788	return __extension__(__m128i)(__v2di){0LL, 0LL};
3789	}
3790
3791	/// Stores a 128-bit integer vector to a memory location aligned on a
3792	/// 128-bit boundary.
3793	///
3794	/// \headerfile <x86intrin.h>
3795	///
3796	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3797	///
3798	/// \param __p
3799	/// A pointer to an aligned memory location that will receive the integer
3800	/// values.
3801	/// \param __b
3802	/// A 128-bit integer vector containing the values to be moved.
3803	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3804	__m128i __b) {
3805	*__p = __b;
3806	}
3807
3808	/// Stores a 128-bit integer vector to an unaligned memory location.
3809	///
3810	/// \headerfile <x86intrin.h>
3811	///
3812	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3813	///
3814	/// \param __p
3815	/// A pointer to a memory location that will receive the integer values.
3816	/// \param __b
3817	/// A 128-bit integer vector containing the values to be moved.
3818	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3819	__m128i __b) {
3820	struct __storeu_si128 {
3821	__m128i_u __v;
3822	} __attribute__((__packed__, __may_alias__));
3823	((struct __storeu_si128 *)__p)->__v = __b;
3824	}
3825
3826	/// Stores a 64-bit integer value from the low element of a 128-bit integer
3827	/// vector.
3828	///
3829	/// \headerfile <x86intrin.h>
3830	///
3831	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3832	///
3833	/// \param __p
3834	/// A pointer to a 64-bit memory location. The address of the memory
3835	/// location does not have to be aligned.
3836	/// \param __b
3837	/// A 128-bit integer vector containing the value to be stored.
3838	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3839	__m128i __b) {
3840	struct __storeu_si64 {
3841	long long __v;
3842	} __attribute__((__packed__, __may_alias__));
3843	((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3844	}
3845
3846	/// Stores a 32-bit integer value from the low element of a 128-bit integer
3847	/// vector.
3848	///
3849	/// \headerfile <x86intrin.h>
3850	///
3851	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3852	///
3853	/// \param __p
3854	/// A pointer to a 32-bit memory location. The address of the memory
3855	/// location does not have to be aligned.
3856	/// \param __b
3857	/// A 128-bit integer vector containing the value to be stored.
3858	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3859	__m128i __b) {
3860	struct __storeu_si32 {
3861	int __v;
3862	} __attribute__((__packed__, __may_alias__));
3863	((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3864	}
3865
3866	/// Stores a 16-bit integer value from the low element of a 128-bit integer
3867	/// vector.
3868	///
3869	/// \headerfile <x86intrin.h>
3870	///
3871	/// This intrinsic does not correspond to a specific instruction.
3872	///
3873	/// \param __p
3874	/// A pointer to a 16-bit memory location. The address of the memory
3875	/// location does not have to be aligned.
3876	/// \param __b
3877	/// A 128-bit integer vector containing the value to be stored.
3878	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3879	__m128i __b) {
3880	struct __storeu_si16 {
3881	short __v;
3882	} __attribute__((__packed__, __may_alias__));
3883	((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3884	}
3885
3886	/// Moves bytes selected by the mask from the first operand to the
3887	/// specified unaligned memory location. When a mask bit is 1, the
3888	/// corresponding byte is written, otherwise it is not written.
3889	///
3890	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3891	/// used again soon). Exception and trap behavior for elements not selected
3892	/// for storage to memory are implementation dependent.
3893	///
3894	/// \headerfile <x86intrin.h>
3895	///
3896	/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3897	/// instruction.
3898	///
3899	/// \param __d
3900	/// A 128-bit integer vector containing the values to be moved.
3901	/// \param __n
3902	/// A 128-bit integer vector containing the mask. The most significant bit of
3903	/// each byte represents the mask bits.
3904	/// \param __p
3905	/// A pointer to an unaligned 128-bit memory location where the specified
3906	/// values are moved.
3907	static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3908	__m128i __n,
3909	char *__p) {
3910	__builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3911	}
3912
3913	/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3914	/// a memory location.
3915	///
3916	/// \headerfile <x86intrin.h>
3917	///
3918	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3919	///
3920	/// \param __p
3921	/// A pointer to a 64-bit memory location that will receive the lower 64 bits
3922	/// of the integer vector parameter.
3923	/// \param __a
3924	/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3925	/// value to be stored.
3926	static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3927	__m128i __a) {
3928	struct __mm_storel_epi64_struct {
3929	long long __u;
3930	} __attribute__((__packed__, __may_alias__));
3931	((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3932	}
3933
3934	/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935	/// aligned memory location.
3936	///
3937	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3938	/// used again soon).
3939	///
3940	/// \headerfile <x86intrin.h>
3941	///
3942	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3943	///
3944	/// \param __p
3945	/// A pointer to the 128-bit aligned memory location used to store the value.
3946	/// \param __a
3947	/// A vector of [2 x double] containing the 64-bit values to be stored.
3948	static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
3949	__m128d __a) {
3950	__builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3951	}
3952
3953	/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3954	///
3955	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3956	/// used again soon).
3957	///
3958	/// \headerfile <x86intrin.h>
3959	///
3960	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3961	///
3962	/// \param __p
3963	/// A pointer to the 128-bit aligned memory location used to store the value.
3964	/// \param __a
3965	/// A 128-bit integer vector containing the values to be stored.
3966	static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
3967	__m128i __a) {
3968	__builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3969	}
3970
3971	/// Stores a 32-bit integer value in the specified memory location.
3972	///
3973	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3974	/// used again soon).
3975	///
3976	/// \headerfile <x86intrin.h>
3977	///
3978	/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3979	///
3980	/// \param __p
3981	/// A pointer to the 32-bit memory location used to store the value.
3982	/// \param __a
3983	/// A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  /* The builtin expects an int pointer as the destination. */
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
3989
3990	#ifdef __x86_64__
3991	/// Stores a 64-bit integer value in the specified memory location.
3992	///
3993	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
3994	/// used again soon).
3995	///
3996	/// \headerfile <x86intrin.h>
3997	///
3998	/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
3999	///
4000	/// \param __p
4001	/// A pointer to the 64-bit memory location used to store the value.
4002	/// \param __a
4003	/// A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  /* The builtin expects a long long pointer as the destination. */
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
4009	#endif
4010
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes the cache line that contains \a __p and invalidates it in every
/// cache of the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
/// A pointer to the memory location that identifies the cache line to be
/// flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
/// instructions that precede this instruction and the load instructions
/// that follow it: all earlier loads are completed before any later load is
/// executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
/// instructions that precede this instruction and the load and store
/// instructions that follow it: all earlier memory accesses are completed
/// before any later memory access is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4052
4053	/// Converts 16-bit signed integers from both 128-bit integer vector
4054	/// operands into 8-bit signed integers, and packs the results into the
4055	/// destination. Positive values greater than 0x7F are saturated to 0x7F.
4056	/// Negative values less than 0x80 are saturated to 0x80.
4057	///
4058	/// \headerfile <x86intrin.h>
4059	///
4060	/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4061	///
4062	/// \param __a
4063	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4064	/// a signed integer and is converted to a 8-bit signed integer with
4065	/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4066	/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4067	/// written to the lower 64 bits of the result.
4068	/// \param __b
4069	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4070	/// a signed integer and is converted to a 8-bit signed integer with
4071	/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4072	/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4073	/// written to the higher 64 bits of the result.
4074	/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4075	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4076	__m128i __b) {
4077	return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4078	}
4079
4080	/// Converts 32-bit signed integers from both 128-bit integer vector
4081	/// operands into 16-bit signed integers, and packs the results into the
4082	/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4083	/// Negative values less than 0x8000 are saturated to 0x8000.
4084	///
4085	/// \headerfile <x86intrin.h>
4086	///
4087	/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4088	///
4089	/// \param __a
4090	/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4091	/// a signed integer and is converted to a 16-bit signed integer with
4092	/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4093	/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4094	/// are written to the lower 64 bits of the result.
4095	/// \param __b
4096	/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4097	/// a signed integer and is converted to a 16-bit signed integer with
4098	/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4099	/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4100	/// are written to the higher 64 bits of the result.
4101	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4102	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4103	__m128i __b) {
4104	return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4105	}
4106
4107	/// Converts 16-bit signed integers from both 128-bit integer vector
4108	/// operands into 8-bit unsigned integers, and packs the results into the
4109	/// destination. Values greater than 0xFF are saturated to 0xFF. Values less
4110	/// than 0x00 are saturated to 0x00.
4111	///
4112	/// \headerfile <x86intrin.h>
4113	///
4114	/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4115	///
4116	/// \param __a
4117	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4118	/// a signed integer and is converted to an 8-bit unsigned integer with
4119	/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4120	/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4121	/// written to the lower 64 bits of the result.
4122	/// \param __b
4123	/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4124	/// a signed integer and is converted to an 8-bit unsigned integer with
4125	/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4126	/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4127	/// written to the higher 64 bits of the result.
4128	/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4129	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4130	__m128i __b) {
4131	return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4132	}
4133
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
/// the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// The macro converts the extracted element to <c>unsigned short</c> before
/// widening it to <c>int</c>, so the upper bits of the result are zero
/// rather than a copy of the element's sign bit.
///
/// \param a
/// A 128-bit integer vector.
/// \param imm
/// An immediate value. Bits [2:0] selects values from \a a to be assigned
/// to bits[15:0] of the result. \n
/// 000: assign values from bits [15:0] of \a a. \n
/// 001: assign values from bits [31:16] of \a a. \n
/// 010: assign values from bits [47:32] of \a a. \n
/// 011: assign values from bits [63:48] of \a a. \n
/// 100: assign values from bits [79:64] of \a a. \n
/// 101: assign values from bits [95:80] of \a a. \n
/// 110: assign values from bits [111:96] of \a a. \n
/// 111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm)))
4163
/// Constructs a 128-bit integer vector by first making a copy of the
/// 128-bit integer vector parameter, and then inserting the lower 16 bits
/// of an integer parameter into the element selected by the immediate-value
/// parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
/// result and then one of the eight elements in the result is replaced by
/// the lower 16 bits of \a b.
/// \param b
/// An integer. The lower 16 bits of this parameter are written to the
/// element of the result selected by \a imm.
/// \param imm
/// An immediate value selecting which of the eight 16-bit elements of the
/// result receives the lower 16 bits of \a b. It is an element index
/// (passed as the index operand of the vector-set builtin), not a bit
/// offset.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm)))
4191
4192	/// Copies the values of the most significant bits from each 8-bit
4193	/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4194	/// value, zero-extends the value, and writes it to the destination.
4195	///
4196	/// \headerfile <x86intrin.h>
4197	///
4198	/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4199	///
4200	/// \param __a
4201	/// A 128-bit integer vector containing the values with bits to be extracted.
4202	/// \returns The most significant bits from each 8-bit element in \a __a,
4203	/// written to bits [15:0]. The other bits are assigned zeros.
4204	static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4205	return __builtin_ia32_pmovmskb128((__v16qi)__a);
4206	}
4207
/// Constructs a 128-bit integer vector by shuffling four 32-bit
/// elements of a 128-bit integer vector parameter, using the immediate-value
/// parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// The macro converts \a a to a 128-bit integer vector, reinterprets it as
/// [4 x i32], and converts \a imm to <c>int</c> before handing both to the
/// builtin.
///
/// \param a
/// A 128-bit integer vector containing the values to be copied.
/// \param imm
/// An immediate value containing an 8-bit value specifying which elements to
/// copy from \a a. The destinations within the 128-bit destination are
/// assigned values as follows: \n
/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
/// Bit value assignments: \n
/// 00: assign values from bits [31:0] of \a a. \n
/// 01: assign values from bits [63:32] of \a a. \n
/// 10: assign values from bits [95:64] of \a a. \n
/// 11: assign values from bits [127:96] of \a a. \n
/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
/// <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4241
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
/// value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// The macro converts \a a to a 128-bit integer vector, reinterprets it as
/// [8 x i16], and converts \a imm to <c>int</c> before handing both to the
/// builtin.
///
/// \param a
/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
/// [127:64] of the result.
/// \param imm
/// An 8-bit immediate value specifying which elements to copy from \a a. \n
/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
/// Bit value assignments: \n
/// 00: assign values from bits [15:0] of \a a. \n
/// 01: assign values from bits [31:16] of \a a. \n
/// 10: assign values from bits [47:32] of \a a. \n
/// 11: assign values from bits [63:48] of \a a. \n
/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
/// <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4274
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
/// value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// The macro converts \a a to a 128-bit integer vector, reinterprets it as
/// [8 x i16], and converts \a imm to <c>int</c> before handing both to the
/// builtin.
///
/// \param a
/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
/// [63:0] of the result.
/// \param imm
/// An 8-bit immediate value specifying which elements to copy from \a a. \n
/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
/// Bit value assignments: \n
/// 00: assign values from bits [79:64] of \a a. \n
/// 01: assign values from bits [95:80] of \a a. \n
/// 10: assign values from bits [111:96] of \a a. \n
/// 11: assign values from bits [127:112] of \a a. \n
/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
/// <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4307
4308	/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4309	/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4310	///
4311	/// \headerfile <x86intrin.h>
4312	///
4313	/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4314	/// instruction.
4315	///
4316	/// \param __a
/// A 128-bit vector of [16 x i8]. \n
4318	/// Bits [71:64] are written to bits [7:0] of the result. \n
4319	/// Bits [79:72] are written to bits [23:16] of the result. \n
4320	/// Bits [87:80] are written to bits [39:32] of the result. \n
4321	/// Bits [95:88] are written to bits [55:48] of the result. \n
4322	/// Bits [103:96] are written to bits [71:64] of the result. \n
4323	/// Bits [111:104] are written to bits [87:80] of the result. \n
4324	/// Bits [119:112] are written to bits [103:96] of the result. \n
4325	/// Bits [127:120] are written to bits [119:112] of the result.
4326	/// \param __b
4327	/// A 128-bit vector of [16 x i8]. \n
4328	/// Bits [71:64] are written to bits [15:8] of the result. \n
4329	/// Bits [79:72] are written to bits [31:24] of the result. \n
4330	/// Bits [87:80] are written to bits [47:40] of the result. \n
4331	/// Bits [95:88] are written to bits [63:56] of the result. \n
4332	/// Bits [103:96] are written to bits [79:72] of the result. \n
4333	/// Bits [111:104] are written to bits [95:88] of the result. \n
4334	/// Bits [119:112] are written to bits [111:104] of the result. \n
4335	/// Bits [127:120] are written to bits [127:120] of the result.
4336	/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4337	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4338	__m128i __b) {
4339	return (__m128i)__builtin_shufflevector(
4340	(__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4341	16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4342	}
4343
4344	/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4345	/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4346	///
4347	/// \headerfile <x86intrin.h>
4348	///
4349	/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4350	/// instruction.
4351	///
4352	/// \param __a
/// A 128-bit vector of [8 x i16]. \n
4354	/// Bits [79:64] are written to bits [15:0] of the result. \n
4355	/// Bits [95:80] are written to bits [47:32] of the result. \n
4356	/// Bits [111:96] are written to bits [79:64] of the result. \n
4357	/// Bits [127:112] are written to bits [111:96] of the result.
4358	/// \param __b
/// A 128-bit vector of [8 x i16]. \n
4360	/// Bits [79:64] are written to bits [31:16] of the result. \n
4361	/// Bits [95:80] are written to bits [63:48] of the result. \n
4362	/// Bits [111:96] are written to bits [95:80] of the result. \n
4363	/// Bits [127:112] are written to bits [127:112] of the result.
4364	/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4365	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4366	__m128i __b) {
4367	return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4368	8 + 5, 6, 8 + 6, 7, 8 + 7);
4369	}
4370
4371	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4372	/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4373	///
4374	/// \headerfile <x86intrin.h>
4375	///
4376	/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4377	/// instruction.
4378	///
4379	/// \param __a
4380	/// A 128-bit vector of [4 x i32]. \n
4381	/// Bits [95:64] are written to bits [31:0] of the destination. \n
4382	/// Bits [127:96] are written to bits [95:64] of the destination.
4383	/// \param __b
4384	/// A 128-bit vector of [4 x i32]. \n
/// Bits [95:64] are written to bits [63:32] of the destination. \n
4386	/// Bits [127:96] are written to bits [127:96] of the destination.
4387	/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4388	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4389	__m128i __b) {
4390	return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4391	4 + 3);
4392	}
4393
4394	/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4395	/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4396	///
4397	/// \headerfile <x86intrin.h>
4398	///
4399	/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4400	/// instruction.
4401	///
4402	/// \param __a
4403	/// A 128-bit vector of [2 x i64]. \n
4404	/// Bits [127:64] are written to bits [63:0] of the destination.
4405	/// \param __b
4406	/// A 128-bit vector of [2 x i64]. \n
4407	/// Bits [127:64] are written to bits [127:64] of the destination.
4408	/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4409	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4410	__m128i __b) {
4411	return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4412	}
4413
4414	/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4415	/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4416	///
4417	/// \headerfile <x86intrin.h>
4418	///
4419	/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4420	/// instruction.
4421	///
4422	/// \param __a
4423	/// A 128-bit vector of [16 x i8]. \n
4424	/// Bits [7:0] are written to bits [7:0] of the result. \n
4425	/// Bits [15:8] are written to bits [23:16] of the result. \n
4426	/// Bits [23:16] are written to bits [39:32] of the result. \n
4427	/// Bits [31:24] are written to bits [55:48] of the result. \n
4428	/// Bits [39:32] are written to bits [71:64] of the result. \n
4429	/// Bits [47:40] are written to bits [87:80] of the result. \n
4430	/// Bits [55:48] are written to bits [103:96] of the result. \n
4431	/// Bits [63:56] are written to bits [119:112] of the result.
4432	/// \param __b
/// A 128-bit vector of [16 x i8]. \n
4434	/// Bits [7:0] are written to bits [15:8] of the result. \n
4435	/// Bits [15:8] are written to bits [31:24] of the result. \n
4436	/// Bits [23:16] are written to bits [47:40] of the result. \n
4437	/// Bits [31:24] are written to bits [63:56] of the result. \n
4438	/// Bits [39:32] are written to bits [79:72] of the result. \n
4439	/// Bits [47:40] are written to bits [95:88] of the result. \n
4440	/// Bits [55:48] are written to bits [111:104] of the result. \n
4441	/// Bits [63:56] are written to bits [127:120] of the result.
4442	/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4443	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4444	__m128i __b) {
4445	return (__m128i)__builtin_shufflevector(
4446	(__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4447	16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4448	}
4449
4450	/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4451	/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4452	/// [8 x i16].
4453	///
4454	/// \headerfile <x86intrin.h>
4455	///
4456	/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4457	/// instruction.
4458	///
4459	/// \param __a
/// A 128-bit vector of [8 x i16]. \n
4461	/// Bits [15:0] are written to bits [15:0] of the result. \n
4462	/// Bits [31:16] are written to bits [47:32] of the result. \n
4463	/// Bits [47:32] are written to bits [79:64] of the result. \n
4464	/// Bits [63:48] are written to bits [111:96] of the result.
4465	/// \param __b
/// A 128-bit vector of [8 x i16]. \n
4467	/// Bits [15:0] are written to bits [31:16] of the result. \n
4468	/// Bits [31:16] are written to bits [63:48] of the result. \n
4469	/// Bits [47:32] are written to bits [95:80] of the result. \n
4470	/// Bits [63:48] are written to bits [127:112] of the result.
4471	/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4472	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4473	__m128i __b) {
4474	return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4475	8 + 1, 2, 8 + 2, 3, 8 + 3);
4476	}
4477
4478	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4479	/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4480	///
4481	/// \headerfile <x86intrin.h>
4482	///
4483	/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4484	/// instruction.
4485	///
4486	/// \param __a
4487	/// A 128-bit vector of [4 x i32]. \n
4488	/// Bits [31:0] are written to bits [31:0] of the destination. \n
4489	/// Bits [63:32] are written to bits [95:64] of the destination.
4490	/// \param __b
4491	/// A 128-bit vector of [4 x i32]. \n
4492	/// Bits [31:0] are written to bits [64:32] of the destination. \n
4493	/// Bits [63:32] are written to bits [127:96] of the destination.
4494	/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4495	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4496	__m128i __b) {
4497	return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4498	4 + 1);
4499	}
4500
4501	/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4502	/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4503	///
4504	/// \headerfile <x86intrin.h>
4505	///
4506	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4507	/// instruction.
4508	///
4509	/// \param __a
4510	/// A 128-bit vector of [2 x i64]. \n
4511	/// Bits [63:0] are written to bits [63:0] of the destination. \n
4512	/// \param __b
4513	/// A 128-bit vector of [2 x i64]. \n
4514	/// Bits [63:0] are written to bits [127:64] of the destination. \n
4515	/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4516	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4517	__m128i __b) {
4518	return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4519	}
4520
4521	/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4522	/// integer.
4523	///
4524	/// \headerfile <x86intrin.h>
4525	///
4526	/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4527	///
4528	/// \param __a
4529	/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4530	/// destination.
4531	/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4532	static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4533	return (__m64)__a[0];
4534	}
4535
4536	/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4537	/// upper bits.
4538	///
4539	/// \headerfile <x86intrin.h>
4540	///
4541	/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4542	///
4543	/// \param __a
4544	/// A 64-bit value.
4545	/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4546	/// the operand. The upper 64 bits are assigned zeros.
4547	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4548	return __extension__(__m128i)(__v2di){(long long)__a, 0};
4549	}
4550
4551	/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4552	/// integer vector, zeroing the upper bits.
4553	///
4554	/// \headerfile <x86intrin.h>
4555	///
4556	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4557	///
4558	/// \param __a
4559	/// A 128-bit integer vector operand. The lower 64 bits are moved to the
4560	/// destination.
4561	/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4562	/// the operand. The upper 64 bits are assigned zeros.
4563	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4564	return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4565	}
4566
4567	/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4568	/// [2 x double] and interleaves them into a 128-bit vector of [2 x
4569	/// double].
4570	///
4571	/// \headerfile <x86intrin.h>
4572	///
4573	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4574	///
4575	/// \param __a
4576	/// A 128-bit vector of [2 x double]. \n
4577	/// Bits [127:64] are written to bits [63:0] of the destination.
4578	/// \param __b
4579	/// A 128-bit vector of [2 x double]. \n
4580	/// Bits [127:64] are written to bits [127:64] of the destination.
4581	/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4582	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4583	__m128d __b) {
4584	return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4585	}
4586
4587	/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4588	/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4589	/// double].
4590	///
4591	/// \headerfile <x86intrin.h>
4592	///
4593	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4594	///
4595	/// \param __a
4596	/// A 128-bit vector of [2 x double]. \n
4597	/// Bits [63:0] are written to bits [63:0] of the destination.
4598	/// \param __b
4599	/// A 128-bit vector of [2 x double]. \n
4600	/// Bits [63:0] are written to bits [127:64] of the destination.
4601	/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4602	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4603	__m128d __b) {
4604	return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4605	}
4606
4607	/// Extracts the sign bits of the double-precision values in the 128-bit
4608	/// vector of [2 x double], zero-extends the value, and writes it to the
4609	/// low-order bits of the destination.
4610	///
4611	/// \headerfile <x86intrin.h>
4612	///
4613	/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4614	///
4615	/// \param __a
4616	/// A 128-bit vector of [2 x double] containing the values with sign bits to
4617	/// be extracted.
4618	/// \returns The sign bits from each of the double-precision elements in \a __a,
4619	/// written to bits [1:0]. The remaining bits are assigned values of zero.
4620	static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4621	return __builtin_ia32_movmskpd((__v2df)__a);
4622	}
4623
/// Builds a 128-bit floating-point vector of [2 x double] by picking one
///    element from each of two 128-bit vectors of [2 x double], as directed
///    by an immediate-value selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double]; source of the lower result element.
/// \param b
///    A 128-bit vector of [2 x double]; source of the upper result element.
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: The \c _MM_SHUFFLE2 macro can be used to build the mask;
///    <c>_MM_SHUFFLE2(b1, b0)</c> yields a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd(                                             \
      (__v2df)(__m128d)(a),                                                    \
      (__v2df)(__m128d)(b),                                                    \
      (int)(i)))
4654
4655	/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4656	/// floating-point vector of [4 x float].
4657	///
4658	/// \headerfile <x86intrin.h>
4659	///
4660	/// This intrinsic has no corresponding instruction.
4661	///
4662	/// \param __a
4663	/// A 128-bit floating-point vector of [2 x double].
4664	/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4665	/// bitwise pattern as the parameter.
4666	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4667	return (__m128)__a;
4668	}
4669
4670	/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4671	/// integer vector.
4672	///
4673	/// \headerfile <x86intrin.h>
4674	///
4675	/// This intrinsic has no corresponding instruction.
4676	///
4677	/// \param __a
4678	/// A 128-bit floating-point vector of [2 x double].
4679	/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4680	/// parameter.
4681	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4682	return (__m128i)__a;
4683	}
4684
4685	/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4686	/// floating-point vector of [2 x double].
4687	///
4688	/// \headerfile <x86intrin.h>
4689	///
4690	/// This intrinsic has no corresponding instruction.
4691	///
4692	/// \param __a
4693	/// A 128-bit floating-point vector of [4 x float].
4694	/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4695	/// bitwise pattern as the parameter.
4696	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4697	return (__m128d)__a;
4698	}
4699
4700	/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4701	/// integer vector.
4702	///
4703	/// \headerfile <x86intrin.h>
4704	///
4705	/// This intrinsic has no corresponding instruction.
4706	///
4707	/// \param __a
4708	/// A 128-bit floating-point vector of [4 x float].
4709	/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4710	/// parameter.
4711	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4712	return (__m128i)__a;
4713	}
4714
4715	/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4716	/// of [4 x float].
4717	///
4718	/// \headerfile <x86intrin.h>
4719	///
4720	/// This intrinsic has no corresponding instruction.
4721	///
4722	/// \param __a
4723	/// A 128-bit integer vector.
4724	/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725	/// bitwise pattern as the parameter.
4726	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4727	return (__m128)__a;
4728	}
4729
4730	/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4731	/// of [2 x double].
4732	///
4733	/// \headerfile <x86intrin.h>
4734	///
4735	/// This intrinsic has no corresponding instruction.
4736	///
4737	/// \param __a
4738	/// A 128-bit integer vector.
4739	/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4740	/// bitwise pattern as the parameter.
4741	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4742	return (__m128d)__a;
4743	}
4744
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} /* extern "C" */
#endif

/* Remove the function-attribute helper macros so they do not stay defined
   past this point. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
4763
/// Builds a 2-bit selector of the form <c>[x, y]</c>: \a x is placed in
///    bit 1 and \a y in bit 0. Intended for use as the immediate operand of
///    \c _mm_shuffle_pd.
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4765
/* Denormals-are-zero mode control. The mode is a single flag (0x0040) in the
   control/status value accessed through _mm_getcsr() / _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Reads the current mode: yields _MM_DENORMALS_ZERO_ON or
   _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Sets the mode to x (one of the two values above): clears the mode bit in
   the current control/status value, ORs in x, and writes the result back,
   leaving all other bits unchanged. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4774
4775	#endif /* __EMMINTRIN_H */
4776

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/emmintrin.h