xmmintrin.h source code [clang/lib/Headers/xmmintrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9
10	#ifndef __XMMINTRIN_H
11	#define __XMMINTRIN_H
12
13	#if !defined(__i386__) && !defined(__x86_64__)
14	#error "This header is only meant to be used on x86 and x64 architecture"
15	#endif
16
17	#include <mmintrin.h>
18
/* Element-typed 128-bit helper vectors (signed int, unsigned int, float). */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* 128-bit vector of floats declared with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same 128-bit float vector, but declared with an alignment of 1. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
27
28	/* This header should only be included in a hosted environment as it depends on
29	* a standard library to provide allocation routines. */
30	#if __STDC_HOSTED__
31	#include <mm_malloc.h>
32	#endif
33
/* Attribute sets applied to the functions defined in this file: the plain
 * variant targets "sse" with a minimum vector width of 128, the _MMX variant
 * additionally targets "mmx" and uses a minimum vector width of 64. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
41
42	/// Adds the 32-bit float values in the low-order bits of the operands.
43	///
44	/// \headerfile <x86intrin.h>
45	///
46	/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
47	///
48	/// \param __a
49	/// A 128-bit vector of [4 x float] containing one of the source operands.
50	/// The lower 32 bits of this operand are used in the calculation.
51	/// \param __b
52	/// A 128-bit vector of [4 x float] containing one of the source operands.
53	/// The lower 32 bits of this operand are used in the calculation.
54	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
55	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
56	/// the upper 96 bits of the first source operand.
57	static __inline__ __m128 __DEFAULT_FN_ATTRS
58	_mm_add_ss(__m128 __a, __m128 __b)
59	{
60	__a[0] += __b[0];
61	return __a;
62	}
63
64	/// Adds two 128-bit vectors of [4 x float], and returns the results of
65	/// the addition.
66	///
67	/// \headerfile <x86intrin.h>
68	///
69	/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
70	///
71	/// \param __a
72	/// A 128-bit vector of [4 x float] containing one of the source operands.
73	/// \param __b
74	/// A 128-bit vector of [4 x float] containing one of the source operands.
75	/// \returns A 128-bit vector of [4 x float] containing the sums of both
76	/// operands.
77	static __inline__ __m128 __DEFAULT_FN_ATTRS
78	_mm_add_ps(__m128 __a, __m128 __b)
79	{
80	return (__m128)((__v4sf)__a + (__v4sf)__b);
81	}
82
83	/// Subtracts the 32-bit float value in the low-order bits of the second
84	/// operand from the corresponding value in the first operand.
85	///
86	/// \headerfile <x86intrin.h>
87	///
88	/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
89	///
90	/// \param __a
91	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
92	/// of this operand are used in the calculation.
93	/// \param __b
94	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
95	/// bits of this operand are used in the calculation.
96	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
97	/// difference of the lower 32 bits of both operands. The upper 96 bits are
98	/// copied from the upper 96 bits of the first source operand.
99	static __inline__ __m128 __DEFAULT_FN_ATTRS
100	_mm_sub_ss(__m128 __a, __m128 __b)
101	{
102	__a[0] -= __b[0];
103	return __a;
104	}
105
106	/// Subtracts each of the values of the second operand from the first
107	/// operand, both of which are 128-bit vectors of [4 x float] and returns
108	/// the results of the subtraction.
109	///
110	/// \headerfile <x86intrin.h>
111	///
112	/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
113	///
114	/// \param __a
115	/// A 128-bit vector of [4 x float] containing the minuend.
116	/// \param __b
117	/// A 128-bit vector of [4 x float] containing the subtrahend.
118	/// \returns A 128-bit vector of [4 x float] containing the differences between
119	/// both operands.
120	static __inline__ __m128 __DEFAULT_FN_ATTRS
121	_mm_sub_ps(__m128 __a, __m128 __b)
122	{
123	return (__m128)((__v4sf)__a - (__v4sf)__b);
124	}
125
126	/// Multiplies two 32-bit float values in the low-order bits of the
127	/// operands.
128	///
129	/// \headerfile <x86intrin.h>
130	///
131	/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
132	///
133	/// \param __a
134	/// A 128-bit vector of [4 x float] containing one of the source operands.
135	/// The lower 32 bits of this operand are used in the calculation.
136	/// \param __b
137	/// A 128-bit vector of [4 x float] containing one of the source operands.
138	/// The lower 32 bits of this operand are used in the calculation.
139	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
140	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
141	/// bits of the first source operand.
142	static __inline__ __m128 __DEFAULT_FN_ATTRS
143	_mm_mul_ss(__m128 __a, __m128 __b)
144	{
145	__a[0] *= __b[0];
146	return __a;
147	}
148
149	/// Multiplies two 128-bit vectors of [4 x float] and returns the
150	/// results of the multiplication.
151	///
152	/// \headerfile <x86intrin.h>
153	///
154	/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
155	///
156	/// \param __a
157	/// A 128-bit vector of [4 x float] containing one of the source operands.
158	/// \param __b
159	/// A 128-bit vector of [4 x float] containing one of the source operands.
160	/// \returns A 128-bit vector of [4 x float] containing the products of both
161	/// operands.
162	static __inline__ __m128 __DEFAULT_FN_ATTRS
163	_mm_mul_ps(__m128 __a, __m128 __b)
164	{
165	return (__m128)((__v4sf)__a * (__v4sf)__b);
166	}
167
168	/// Divides the value in the low-order 32 bits of the first operand by
169	/// the corresponding value in the second operand.
170	///
171	/// \headerfile <x86intrin.h>
172	///
173	/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
174	///
175	/// \param __a
176	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
177	/// bits of this operand are used in the calculation.
178	/// \param __b
179	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
180	/// of this operand are used in the calculation.
181	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
182	/// lower 32 bits of both operands. The upper 96 bits are copied from the
183	/// upper 96 bits of the first source operand.
184	static __inline__ __m128 __DEFAULT_FN_ATTRS
185	_mm_div_ss(__m128 __a, __m128 __b)
186	{
187	__a[0] /= __b[0];
188	return __a;
189	}
190
191	/// Divides two 128-bit vectors of [4 x float].
192	///
193	/// \headerfile <x86intrin.h>
194	///
195	/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
196	///
197	/// \param __a
198	/// A 128-bit vector of [4 x float] containing the dividend.
199	/// \param __b
200	/// A 128-bit vector of [4 x float] containing the divisor.
201	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
202	/// operands.
203	static __inline__ __m128 __DEFAULT_FN_ATTRS
204	_mm_div_ps(__m128 __a, __m128 __b)
205	{
206	return (__m128)((__v4sf)__a / (__v4sf)__b);
207	}
208
209	/// Calculates the square root of the value stored in the low-order bits
210	/// of a 128-bit vector of [4 x float].
211	///
212	/// \headerfile <x86intrin.h>
213	///
214	/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
215	///
216	/// \param __a
217	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
218	/// used in the calculation.
219	/// \returns A 128-bit vector of [4 x float] containing the square root of the
220	/// value in the low-order bits of the operand.
221	static __inline__ __m128 __DEFAULT_FN_ATTRS
222	_mm_sqrt_ss(__m128 __a)
223	{
224	return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
225	}
226
227	/// Calculates the square roots of the values stored in a 128-bit vector
228	/// of [4 x float].
229	///
230	/// \headerfile <x86intrin.h>
231	///
232	/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
233	///
234	/// \param __a
235	/// A 128-bit vector of [4 x float].
236	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
237	/// values in the operand.
238	static __inline__ __m128 __DEFAULT_FN_ATTRS
239	_mm_sqrt_ps(__m128 __a)
240	{
241	return __builtin_ia32_sqrtps((__v4sf)__a);
242	}
243
244	/// Calculates the approximate reciprocal of the value stored in the
245	/// low-order bits of a 128-bit vector of [4 x float].
246	///
247	/// \headerfile <x86intrin.h>
248	///
249	/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
250	///
251	/// \param __a
252	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
253	/// used in the calculation.
254	/// \returns A 128-bit vector of [4 x float] containing the approximate
255	/// reciprocal of the value in the low-order bits of the operand.
256	static __inline__ __m128 __DEFAULT_FN_ATTRS
257	_mm_rcp_ss(__m128 __a)
258	{
259	return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
260	}
261
262	/// Calculates the approximate reciprocals of the values stored in a
263	/// 128-bit vector of [4 x float].
264	///
265	/// \headerfile <x86intrin.h>
266	///
267	/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
268	///
269	/// \param __a
270	/// A 128-bit vector of [4 x float].
271	/// \returns A 128-bit vector of [4 x float] containing the approximate
272	/// reciprocals of the values in the operand.
273	static __inline__ __m128 __DEFAULT_FN_ATTRS
274	_mm_rcp_ps(__m128 __a)
275	{
276	return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
277	}
278
279	/// Calculates the approximate reciprocal of the square root of the value
280	/// stored in the low-order bits of a 128-bit vector of [4 x float].
281	///
282	/// \headerfile <x86intrin.h>
283	///
284	/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
285	///
286	/// \param __a
287	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
288	/// used in the calculation.
289	/// \returns A 128-bit vector of [4 x float] containing the approximate
290	/// reciprocal of the square root of the value in the low-order bits of the
291	/// operand.
292	static __inline__ __m128 __DEFAULT_FN_ATTRS
293	_mm_rsqrt_ss(__m128 __a)
294	{
295	return __builtin_ia32_rsqrtss((__v4sf)__a);
296	}
297
298	/// Calculates the approximate reciprocals of the square roots of the
299	/// values stored in a 128-bit vector of [4 x float].
300	///
301	/// \headerfile <x86intrin.h>
302	///
303	/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
304	///
305	/// \param __a
306	/// A 128-bit vector of [4 x float].
307	/// \returns A 128-bit vector of [4 x float] containing the approximate
308	/// reciprocals of the square roots of the values in the operand.
309	static __inline__ __m128 __DEFAULT_FN_ATTRS
310	_mm_rsqrt_ps(__m128 __a)
311	{
312	return __builtin_ia32_rsqrtps((__v4sf)__a);
313	}
314
315	/// Compares two 32-bit float values in the low-order bits of both
316	/// operands and returns the lesser value in the low-order bits of the
317	/// vector of [4 x float].
318	///
319	/// \headerfile <x86intrin.h>
320	///
321	/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
322	///
323	/// \param __a
324	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
325	/// 32 bits of this operand are used in the comparison.
326	/// \param __b
327	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
328	/// 32 bits of this operand are used in the comparison.
329	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
330	/// minimum value between both operands. The upper 96 bits are copied from
331	/// the upper 96 bits of the first source operand.
332	static __inline__ __m128 __DEFAULT_FN_ATTRS
333	_mm_min_ss(__m128 __a, __m128 __b)
334	{
335	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
336	}
337
338	/// Compares two 128-bit vectors of [4 x float] and returns the lesser
339	/// of each pair of values.
340	///
341	/// \headerfile <x86intrin.h>
342	///
343	/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
344	///
345	/// \param __a
346	/// A 128-bit vector of [4 x float] containing one of the operands.
347	/// \param __b
348	/// A 128-bit vector of [4 x float] containing one of the operands.
349	/// \returns A 128-bit vector of [4 x float] containing the minimum values
350	/// between both operands.
351	static __inline__ __m128 __DEFAULT_FN_ATTRS
352	_mm_min_ps(__m128 __a, __m128 __b)
353	{
354	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
355	}
356
357	/// Compares two 32-bit float values in the low-order bits of both
358	/// operands and returns the greater value in the low-order bits of a 128-bit
359	/// vector of [4 x float].
360	///
361	/// \headerfile <x86intrin.h>
362	///
363	/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
364	///
365	/// \param __a
366	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
367	/// 32 bits of this operand are used in the comparison.
368	/// \param __b
369	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
370	/// 32 bits of this operand are used in the comparison.
371	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
372	/// maximum value between both operands. The upper 96 bits are copied from
373	/// the upper 96 bits of the first source operand.
374	static __inline__ __m128 __DEFAULT_FN_ATTRS
375	_mm_max_ss(__m128 __a, __m128 __b)
376	{
377	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
378	}
379
380	/// Compares two 128-bit vectors of [4 x float] and returns the greater
381	/// of each pair of values.
382	///
383	/// \headerfile <x86intrin.h>
384	///
385	/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
386	///
387	/// \param __a
388	/// A 128-bit vector of [4 x float] containing one of the operands.
389	/// \param __b
390	/// A 128-bit vector of [4 x float] containing one of the operands.
391	/// \returns A 128-bit vector of [4 x float] containing the maximum values
392	/// between both operands.
393	static __inline__ __m128 __DEFAULT_FN_ATTRS
394	_mm_max_ps(__m128 __a, __m128 __b)
395	{
396	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
397	}
398
399	/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
400	///
401	/// \headerfile <x86intrin.h>
402	///
403	/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
404	///
405	/// \param __a
406	/// A 128-bit vector containing one of the source operands.
407	/// \param __b
408	/// A 128-bit vector containing one of the source operands.
409	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
410	/// values between both operands.
411	static __inline__ __m128 __DEFAULT_FN_ATTRS
412	_mm_and_ps(__m128 __a, __m128 __b)
413	{
414	return (__m128)((__v4su)__a & (__v4su)__b);
415	}
416
417	/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
418	/// the one's complement of the values contained in the first source
419	/// operand.
420	///
421	/// \headerfile <x86intrin.h>
422	///
423	/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
424	///
425	/// \param __a
426	/// A 128-bit vector of [4 x float] containing the first source operand. The
427	/// one's complement of this value is used in the bitwise AND.
428	/// \param __b
429	/// A 128-bit vector of [4 x float] containing the second source operand.
430	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
431	/// one's complement of the first operand and the values in the second
432	/// operand.
433	static __inline__ __m128 __DEFAULT_FN_ATTRS
434	_mm_andnot_ps(__m128 __a, __m128 __b)
435	{
436	return (__m128)(~(__v4su)__a & (__v4su)__b);
437	}
438
439	/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
440	///
441	/// \headerfile <x86intrin.h>
442	///
443	/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
444	///
445	/// \param __a
446	/// A 128-bit vector of [4 x float] containing one of the source operands.
447	/// \param __b
448	/// A 128-bit vector of [4 x float] containing one of the source operands.
449	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
450	/// values between both operands.
451	static __inline__ __m128 __DEFAULT_FN_ATTRS
452	_mm_or_ps(__m128 __a, __m128 __b)
453	{
454	return (__m128)((__v4su)__a \| (__v4su)__b);
455	}
456
457	/// Performs a bitwise exclusive OR of two 128-bit vectors of
458	/// [4 x float].
459	///
460	/// \headerfile <x86intrin.h>
461	///
462	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
463	///
464	/// \param __a
465	/// A 128-bit vector of [4 x float] containing one of the source operands.
466	/// \param __b
467	/// A 128-bit vector of [4 x float] containing one of the source operands.
468	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
469	/// of the values between both operands.
470	static __inline__ __m128 __DEFAULT_FN_ATTRS
471	_mm_xor_ps(__m128 __a, __m128 __b)
472	{
473	return (__m128)((__v4su)__a ^ (__v4su)__b);
474	}
475
476	/// Compares two 32-bit float values in the low-order bits of both
477	/// operands for equality and returns the result of the comparison in the
478	/// low-order bits of a vector [4 x float].
479	///
480	/// \headerfile <x86intrin.h>
481	///
482	/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
483	///
484	/// \param __a
485	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
486	/// 32 bits of this operand are used in the comparison.
487	/// \param __b
488	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
489	/// 32 bits of this operand are used in the comparison.
490	/// \returns A 128-bit vector of [4 x float] containing the comparison results
491	/// in the low-order bits.
492	static __inline__ __m128 __DEFAULT_FN_ATTRS
493	_mm_cmpeq_ss(__m128 __a, __m128 __b)
494	{
495	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
496	}
497
498	/// Compares each of the corresponding 32-bit float values of the
499	/// 128-bit vectors of [4 x float] for equality.
500	///
501	/// \headerfile <x86intrin.h>
502	///
503	/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
504	///
505	/// \param __a
506	/// A 128-bit vector of [4 x float].
507	/// \param __b
508	/// A 128-bit vector of [4 x float].
509	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
510	static __inline__ __m128 __DEFAULT_FN_ATTRS
511	_mm_cmpeq_ps(__m128 __a, __m128 __b)
512	{
513	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
514	}
515
516	/// Compares two 32-bit float values in the low-order bits of both
517	/// operands to determine if the value in the first operand is less than the
518	/// corresponding value in the second operand and returns the result of the
519	/// comparison in the low-order bits of a vector of [4 x float].
520	///
521	/// \headerfile <x86intrin.h>
522	///
523	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
524	///
525	/// \param __a
526	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
527	/// 32 bits of this operand are used in the comparison.
528	/// \param __b
529	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
530	/// 32 bits of this operand are used in the comparison.
531	/// \returns A 128-bit vector of [4 x float] containing the comparison results
532	/// in the low-order bits.
533	static __inline__ __m128 __DEFAULT_FN_ATTRS
534	_mm_cmplt_ss(__m128 __a, __m128 __b)
535	{
536	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
537	}
538
539	/// Compares each of the corresponding 32-bit float values of the
540	/// 128-bit vectors of [4 x float] to determine if the values in the first
541	/// operand are less than those in the second operand.
542	///
543	/// \headerfile <x86intrin.h>
544	///
545	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
546	///
547	/// \param __a
548	/// A 128-bit vector of [4 x float].
549	/// \param __b
550	/// A 128-bit vector of [4 x float].
551	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
552	static __inline__ __m128 __DEFAULT_FN_ATTRS
553	_mm_cmplt_ps(__m128 __a, __m128 __b)
554	{
555	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
556	}
557
558	/// Compares two 32-bit float values in the low-order bits of both
559	/// operands to determine if the value in the first operand is less than or
560	/// equal to the corresponding value in the second operand and returns the
561	/// result of the comparison in the low-order bits of a vector of
562	/// [4 x float].
563	///
564	/// \headerfile <x86intrin.h>
565	///
566	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
567	///
568	/// \param __a
569	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
570	/// 32 bits of this operand are used in the comparison.
571	/// \param __b
572	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
573	/// 32 bits of this operand are used in the comparison.
574	/// \returns A 128-bit vector of [4 x float] containing the comparison results
575	/// in the low-order bits.
576	static __inline__ __m128 __DEFAULT_FN_ATTRS
577	_mm_cmple_ss(__m128 __a, __m128 __b)
578	{
579	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
580	}
581
582	/// Compares each of the corresponding 32-bit float values of the
583	/// 128-bit vectors of [4 x float] to determine if the values in the first
584	/// operand are less than or equal to those in the second operand.
585	///
586	/// \headerfile <x86intrin.h>
587	///
588	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
589	///
590	/// \param __a
591	/// A 128-bit vector of [4 x float].
592	/// \param __b
593	/// A 128-bit vector of [4 x float].
594	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
595	static __inline__ __m128 __DEFAULT_FN_ATTRS
596	_mm_cmple_ps(__m128 __a, __m128 __b)
597	{
598	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
599	}
600
601	/// Compares two 32-bit float values in the low-order bits of both
602	/// operands to determine if the value in the first operand is greater than
603	/// the corresponding value in the second operand and returns the result of
604	/// the comparison in the low-order bits of a vector of [4 x float].
605	///
606	/// \headerfile <x86intrin.h>
607	///
608	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
609	///
610	/// \param __a
611	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
612	/// 32 bits of this operand are used in the comparison.
613	/// \param __b
614	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
615	/// 32 bits of this operand are used in the comparison.
616	/// \returns A 128-bit vector of [4 x float] containing the comparison results
617	/// in the low-order bits.
618	static __inline__ __m128 __DEFAULT_FN_ATTRS
619	_mm_cmpgt_ss(__m128 __a, __m128 __b)
620	{
621	return (__m128)__builtin_shufflevector((__v4sf)__a,
622	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
623	4, 1, 2, 3);
624	}
625
626	/// Compares each of the corresponding 32-bit float values of the
627	/// 128-bit vectors of [4 x float] to determine if the values in the first
628	/// operand are greater than those in the second operand.
629	///
630	/// \headerfile <x86intrin.h>
631	///
632	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
633	///
634	/// \param __a
635	/// A 128-bit vector of [4 x float].
636	/// \param __b
637	/// A 128-bit vector of [4 x float].
638	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
639	static __inline__ __m128 __DEFAULT_FN_ATTRS
640	_mm_cmpgt_ps(__m128 __a, __m128 __b)
641	{
642	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
643	}
644
645	/// Compares two 32-bit float values in the low-order bits of both
646	/// operands to determine if the value in the first operand is greater than
647	/// or equal to the corresponding value in the second operand and returns
648	/// the result of the comparison in the low-order bits of a vector of
649	/// [4 x float].
650	///
651	/// \headerfile <x86intrin.h>
652	///
653	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
654	///
655	/// \param __a
656	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
657	/// 32 bits of this operand are used in the comparison.
658	/// \param __b
659	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
660	/// 32 bits of this operand are used in the comparison.
661	/// \returns A 128-bit vector of [4 x float] containing the comparison results
662	/// in the low-order bits.
663	static __inline__ __m128 __DEFAULT_FN_ATTRS
664	_mm_cmpge_ss(__m128 __a, __m128 __b)
665	{
666	return (__m128)__builtin_shufflevector((__v4sf)__a,
667	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
668	4, 1, 2, 3);
669	}
670
671	/// Compares each of the corresponding 32-bit float values of the
672	/// 128-bit vectors of [4 x float] to determine if the values in the first
673	/// operand are greater than or equal to those in the second operand.
674	///
675	/// \headerfile <x86intrin.h>
676	///
677	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
678	///
679	/// \param __a
680	/// A 128-bit vector of [4 x float].
681	/// \param __b
682	/// A 128-bit vector of [4 x float].
683	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
684	static __inline__ __m128 __DEFAULT_FN_ATTRS
685	_mm_cmpge_ps(__m128 __a, __m128 __b)
686	{
687	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
688	}
689
690	/// Compares two 32-bit float values in the low-order bits of both
691	/// operands for inequality and returns the result of the comparison in the
692	/// low-order bits of a vector of [4 x float].
693	///
694	/// \headerfile <x86intrin.h>
695	///
696	/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
697	/// instructions.
698	///
699	/// \param __a
700	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
701	/// 32 bits of this operand are used in the comparison.
702	/// \param __b
703	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
704	/// 32 bits of this operand are used in the comparison.
705	/// \returns A 128-bit vector of [4 x float] containing the comparison results
706	/// in the low-order bits.
707	static __inline__ __m128 __DEFAULT_FN_ATTRS
708	_mm_cmpneq_ss(__m128 __a, __m128 __b)
709	{
710	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
711	}
712
713	/// Compares each of the corresponding 32-bit float values of the
714	/// 128-bit vectors of [4 x float] for inequality.
715	///
716	/// \headerfile <x86intrin.h>
717	///
718	/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
719	/// instructions.
720	///
721	/// \param __a
722	/// A 128-bit vector of [4 x float].
723	/// \param __b
724	/// A 128-bit vector of [4 x float].
725	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
726	static __inline__ __m128 __DEFAULT_FN_ATTRS
727	_mm_cmpneq_ps(__m128 __a, __m128 __b)
728	{
729	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
730	}
731
732	/// Compares two 32-bit float values in the low-order bits of both
733	/// operands to determine if the value in the first operand is not less than
734	/// the corresponding value in the second operand and returns the result of
735	/// the comparison in the low-order bits of a vector of [4 x float].
736	///
737	/// \headerfile <x86intrin.h>
738	///
739	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
740	/// instructions.
741	///
742	/// \param __a
743	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
744	/// 32 bits of this operand are used in the comparison.
745	/// \param __b
746	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
747	/// 32 bits of this operand are used in the comparison.
748	/// \returns A 128-bit vector of [4 x float] containing the comparison results
749	/// in the low-order bits.
750	static __inline__ __m128 __DEFAULT_FN_ATTRS
751	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
752	{
753	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
754	}
755
756	/// Compares each of the corresponding 32-bit float values of the
757	/// 128-bit vectors of [4 x float] to determine if the values in the first
758	/// operand are not less than those in the second operand.
759	///
760	/// \headerfile <x86intrin.h>
761	///
762	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
763	/// instructions.
764	///
765	/// \param __a
766	/// A 128-bit vector of [4 x float].
767	/// \param __b
768	/// A 128-bit vector of [4 x float].
769	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
770	static __inline__ __m128 __DEFAULT_FN_ATTRS
771	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
772	{
773	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
774	}
775
776	/// Compares two 32-bit float values in the low-order bits of both
777	/// operands to determine if the value in the first operand is not less than
778	/// or equal to the corresponding value in the second operand and returns
779	/// the result of the comparison in the low-order bits of a vector of
780	/// [4 x float].
781	///
782	/// \headerfile <x86intrin.h>
783	///
784	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
785	/// instructions.
786	///
787	/// \param __a
788	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
789	/// 32 bits of this operand are used in the comparison.
790	/// \param __b
791	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
792	/// 32 bits of this operand are used in the comparison.
793	/// \returns A 128-bit vector of [4 x float] containing the comparison results
794	/// in the low-order bits.
795	static __inline__ __m128 __DEFAULT_FN_ATTRS
796	_mm_cmpnle_ss(__m128 __a, __m128 __b)
797	{
798	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
799	}
800
801	/// Compares each of the corresponding 32-bit float values of the
802	/// 128-bit vectors of [4 x float] to determine if the values in the first
803	/// operand are not less than or equal to those in the second operand.
804	///
805	/// \headerfile <x86intrin.h>
806	///
807	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
808	/// instructions.
809	///
810	/// \param __a
811	/// A 128-bit vector of [4 x float].
812	/// \param __b
813	/// A 128-bit vector of [4 x float].
814	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
815	static __inline__ __m128 __DEFAULT_FN_ATTRS
816	_mm_cmpnle_ps(__m128 __a, __m128 __b)
817	{
818	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
819	}
820
821	/// Compares two 32-bit float values in the low-order bits of both
822	/// operands to determine if the value in the first operand is not greater
823	/// than the corresponding value in the second operand and returns the
824	/// result of the comparison in the low-order bits of a vector of
825	/// [4 x float].
826	///
827	/// \headerfile <x86intrin.h>
828	///
829	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
830	/// instructions.
831	///
832	/// \param __a
833	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
834	/// 32 bits of this operand are used in the comparison.
835	/// \param __b
836	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
837	/// 32 bits of this operand are used in the comparison.
838	/// \returns A 128-bit vector of [4 x float] containing the comparison results
839	/// in the low-order bits.
840	static __inline__ __m128 __DEFAULT_FN_ATTRS
841	_mm_cmpngt_ss(__m128 __a, __m128 __b)
842	{
843	return (__m128)__builtin_shufflevector((__v4sf)__a,
844	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
845	4, 1, 2, 3);
846	}
847
848	/// Compares each of the corresponding 32-bit float values of the
849	/// 128-bit vectors of [4 x float] to determine if the values in the first
850	/// operand are not greater than those in the second operand.
851	///
852	/// \headerfile <x86intrin.h>
853	///
854	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
855	/// instructions.
856	///
857	/// \param __a
858	/// A 128-bit vector of [4 x float].
859	/// \param __b
860	/// A 128-bit vector of [4 x float].
861	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
862	static __inline__ __m128 __DEFAULT_FN_ATTRS
863	_mm_cmpngt_ps(__m128 __a, __m128 __b)
864	{
865	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
866	}
867
868	/// Compares two 32-bit float values in the low-order bits of both
869	/// operands to determine if the value in the first operand is not greater
870	/// than or equal to the corresponding value in the second operand and
871	/// returns the result of the comparison in the low-order bits of a vector
872	/// of [4 x float].
873	///
874	/// \headerfile <x86intrin.h>
875	///
876	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
877	/// instructions.
878	///
879	/// \param __a
880	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
881	/// 32 bits of this operand are used in the comparison.
882	/// \param __b
883	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
884	/// 32 bits of this operand are used in the comparison.
885	/// \returns A 128-bit vector of [4 x float] containing the comparison results
886	/// in the low-order bits.
887	static __inline__ __m128 __DEFAULT_FN_ATTRS
888	_mm_cmpnge_ss(__m128 __a, __m128 __b)
889	{
890	return (__m128)__builtin_shufflevector((__v4sf)__a,
891	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
892	4, 1, 2, 3);
893	}
894
895	/// Compares each of the corresponding 32-bit float values of the
896	/// 128-bit vectors of [4 x float] to determine if the values in the first
897	/// operand are not greater than or equal to those in the second operand.
898	///
899	/// \headerfile <x86intrin.h>
900	///
901	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
902	/// instructions.
903	///
904	/// \param __a
905	/// A 128-bit vector of [4 x float].
906	/// \param __b
907	/// A 128-bit vector of [4 x float].
908	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
909	static __inline__ __m128 __DEFAULT_FN_ATTRS
910	_mm_cmpnge_ps(__m128 __a, __m128 __b)
911	{
912	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
913	}
914
915	/// Compares two 32-bit float values in the low-order bits of both
916	/// operands to determine if the value in the first operand is ordered with
917	/// respect to the corresponding value in the second operand and returns the
918	/// result of the comparison in the low-order bits of a vector of
919	/// [4 x float].
920	///
921	/// \headerfile <x86intrin.h>
922	///
923	/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
924	/// instructions.
925	///
926	/// \param __a
927	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
928	/// 32 bits of this operand are used in the comparison.
929	/// \param __b
930	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
931	/// 32 bits of this operand are used in the comparison.
932	/// \returns A 128-bit vector of [4 x float] containing the comparison results
933	/// in the low-order bits.
934	static __inline__ __m128 __DEFAULT_FN_ATTRS
935	_mm_cmpord_ss(__m128 __a, __m128 __b)
936	{
937	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
938	}
939
940	/// Compares each of the corresponding 32-bit float values of the
941	/// 128-bit vectors of [4 x float] to determine if the values in the first
942	/// operand are ordered with respect to those in the second operand.
943	///
944	/// \headerfile <x86intrin.h>
945	///
946	/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
947	/// instructions.
948	///
949	/// \param __a
950	/// A 128-bit vector of [4 x float].
951	/// \param __b
952	/// A 128-bit vector of [4 x float].
953	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
954	static __inline__ __m128 __DEFAULT_FN_ATTRS
955	_mm_cmpord_ps(__m128 __a, __m128 __b)
956	{
957	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
958	}
959
960	/// Compares two 32-bit float values in the low-order bits of both
961	/// operands to determine if the value in the first operand is unordered
962	/// with respect to the corresponding value in the second operand and
963	/// returns the result of the comparison in the low-order bits of a vector
964	/// of [4 x float].
965	///
966	/// \headerfile <x86intrin.h>
967	///
968	/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
969	/// instructions.
970	///
971	/// \param __a
972	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
973	/// 32 bits of this operand are used in the comparison.
974	/// \param __b
975	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
976	/// 32 bits of this operand are used in the comparison.
977	/// \returns A 128-bit vector of [4 x float] containing the comparison results
978	/// in the low-order bits.
979	static __inline__ __m128 __DEFAULT_FN_ATTRS
980	_mm_cmpunord_ss(__m128 __a, __m128 __b)
981	{
982	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
983	}
984
985	/// Compares each of the corresponding 32-bit float values of the
986	/// 128-bit vectors of [4 x float] to determine if the values in the first
987	/// operand are unordered with respect to those in the second operand.
988	///
989	/// \headerfile <x86intrin.h>
990	///
991	/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
992	/// instructions.
993	///
994	/// \param __a
995	/// A 128-bit vector of [4 x float].
996	/// \param __b
997	/// A 128-bit vector of [4 x float].
998	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
999	static __inline__ __m128 __DEFAULT_FN_ATTRS
1000	_mm_cmpunord_ps(__m128 __a, __m128 __b)
1001	{
1002	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1003	}
1004
1005	/// Compares two 32-bit float values in the low-order bits of both
1006	/// operands for equality and returns the result of the comparison.
1007	///
1008	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1009	///
1010	/// \headerfile <x86intrin.h>
1011	///
1012	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1013	/// instructions.
1014	///
1015	/// \param __a
1016	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1017	/// used in the comparison.
1018	/// \param __b
1019	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1020	/// used in the comparison.
1021	/// \returns An integer containing the comparison results. If either of the
1022	/// two lower 32-bit values is NaN, 0 is returned.
1023	static __inline__ int __DEFAULT_FN_ATTRS
1024	_mm_comieq_ss(__m128 __a, __m128 __b)
1025	{
1026	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1027	}
1028
1029	/// Compares two 32-bit float values in the low-order bits of both
1030	/// operands to determine if the first operand is less than the second
1031	/// operand and returns the result of the comparison.
1032	///
1033	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1034	///
1035	/// \headerfile <x86intrin.h>
1036	///
1037	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1038	/// instructions.
1039	///
1040	/// \param __a
1041	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1042	/// used in the comparison.
1043	/// \param __b
1044	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1045	/// used in the comparison.
1046	/// \returns An integer containing the comparison results. If either of the two
1047	/// lower 32-bit values is NaN, 0 is returned.
1048	static __inline__ int __DEFAULT_FN_ATTRS
1049	_mm_comilt_ss(__m128 __a, __m128 __b)
1050	{
1051	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1052	}
1053
1054	/// Compares two 32-bit float values in the low-order bits of both
1055	/// operands to determine if the first operand is less than or equal to the
1056	/// second operand and returns the result of the comparison.
1057	///
1058	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1059	///
1060	/// \headerfile <x86intrin.h>
1061	///
1062	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1063	///
1064	/// \param __a
1065	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1066	/// used in the comparison.
1067	/// \param __b
1068	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1069	/// used in the comparison.
1070	/// \returns An integer containing the comparison results. If either of the two
1071	/// lower 32-bit values is NaN, 0 is returned.
1072	static __inline__ int __DEFAULT_FN_ATTRS
1073	_mm_comile_ss(__m128 __a, __m128 __b)
1074	{
1075	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1076	}
1077
1078	/// Compares two 32-bit float values in the low-order bits of both
1079	/// operands to determine if the first operand is greater than the second
1080	/// operand and returns the result of the comparison.
1081	///
1082	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1083	///
1084	/// \headerfile <x86intrin.h>
1085	///
1086	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1087	///
1088	/// \param __a
1089	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1090	/// used in the comparison.
1091	/// \param __b
1092	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093	/// used in the comparison.
1094	/// \returns An integer containing the comparison results. If either of the
1095	/// two lower 32-bit values is NaN, 0 is returned.
1096	static __inline__ int __DEFAULT_FN_ATTRS
1097	_mm_comigt_ss(__m128 __a, __m128 __b)
1098	{
1099	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1100	}
1101
1102	/// Compares two 32-bit float values in the low-order bits of both
1103	/// operands to determine if the first operand is greater than or equal to
1104	/// the second operand and returns the result of the comparison.
1105	///
1106	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1107	///
1108	/// \headerfile <x86intrin.h>
1109	///
1110	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1111	///
1112	/// \param __a
1113	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114	/// used in the comparison.
1115	/// \param __b
1116	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1117	/// used in the comparison.
1118	/// \returns An integer containing the comparison results. If either of the two
1119	/// lower 32-bit values is NaN, 0 is returned.
1120	static __inline__ int __DEFAULT_FN_ATTRS
1121	_mm_comige_ss(__m128 __a, __m128 __b)
1122	{
1123	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1124	}
1125
1126	/// Compares two 32-bit float values in the low-order bits of both
1127	/// operands to determine if the first operand is not equal to the second
1128	/// operand and returns the result of the comparison.
1129	///
1130	/// If either of the two lower 32-bit values is NaN, 1 is returned.
1131	///
1132	/// \headerfile <x86intrin.h>
1133	///
1134	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1135	///
1136	/// \param __a
1137	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1138	/// used in the comparison.
1139	/// \param __b
1140	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1141	/// used in the comparison.
1142	/// \returns An integer containing the comparison results. If either of the
1143	/// two lower 32-bit values is NaN, 1 is returned.
1144	static __inline__ int __DEFAULT_FN_ATTRS
1145	_mm_comineq_ss(__m128 __a, __m128 __b)
1146	{
1147	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1148	}
1149
1150	/// Performs an unordered comparison of two 32-bit float values using
1151	/// the low-order bits of both operands to determine equality and returns
1152	/// the result of the comparison.
1153	///
1154	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1155	///
1156	/// \headerfile <x86intrin.h>
1157	///
1158	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1159	///
1160	/// \param __a
1161	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1162	/// used in the comparison.
1163	/// \param __b
1164	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1165	/// used in the comparison.
1166	/// \returns An integer containing the comparison results. If either of the two
1167	/// lower 32-bit values is NaN, 0 is returned.
1168	static __inline__ int __DEFAULT_FN_ATTRS
1169	_mm_ucomieq_ss(__m128 __a, __m128 __b)
1170	{
1171	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1172	}
1173
1174	/// Performs an unordered comparison of two 32-bit float values using
1175	/// the low-order bits of both operands to determine if the first operand is
1176	/// less than the second operand and returns the result of the comparison.
1177	///
1178	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1179	///
1180	/// \headerfile <x86intrin.h>
1181	///
1182	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1183	///
1184	/// \param __a
1185	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1186	/// used in the comparison.
1187	/// \param __b
1188	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1189	/// used in the comparison.
1190	/// \returns An integer containing the comparison results. If either of the two
1191	/// lower 32-bit values is NaN, 0 is returned.
1192	static __inline__ int __DEFAULT_FN_ATTRS
1193	_mm_ucomilt_ss(__m128 __a, __m128 __b)
1194	{
1195	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1196	}
1197
1198	/// Performs an unordered comparison of two 32-bit float values using
1199	/// the low-order bits of both operands to determine if the first operand is
1200	/// less than or equal to the second operand and returns the result of the
1201	/// comparison.
1202	///
1203	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1204	///
1205	/// \headerfile <x86intrin.h>
1206	///
1207	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1208	///
1209	/// \param __a
1210	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1211	/// used in the comparison.
1212	/// \param __b
1213	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214	/// used in the comparison.
1215	/// \returns An integer containing the comparison results. If either of the two
1216	/// lower 32-bit values is NaN, 0 is returned.
1217	static __inline__ int __DEFAULT_FN_ATTRS
1218	_mm_ucomile_ss(__m128 __a, __m128 __b)
1219	{
1220	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1221	}
1222
1223	/// Performs an unordered comparison of two 32-bit float values using
1224	/// the low-order bits of both operands to determine if the first operand is
1225	/// greater than the second operand and returns the result of the
1226	/// comparison.
1227	///
1228	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1229	///
1230	/// \headerfile <x86intrin.h>
1231	///
1232	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1233	///
1234	/// \param __a
1235	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1236	/// used in the comparison.
1237	/// \param __b
1238	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1239	/// used in the comparison.
1240	/// \returns An integer containing the comparison results. If either of the two
1241	/// lower 32-bit values is NaN, 0 is returned.
1242	static __inline__ int __DEFAULT_FN_ATTRS
1243	_mm_ucomigt_ss(__m128 __a, __m128 __b)
1244	{
1245	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1246	}
1247
1248	/// Performs an unordered comparison of two 32-bit float values using
1249	/// the low-order bits of both operands to determine if the first operand is
1250	/// greater than or equal to the second operand and returns the result of
1251	/// the comparison.
1252	///
1253	/// If either of the two lower 32-bit values is NaN, 0 is returned.
1254	///
1255	/// \headerfile <x86intrin.h>
1256	///
1257	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1258	///
1259	/// \param __a
1260	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261	/// used in the comparison.
1262	/// \param __b
1263	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1264	/// used in the comparison.
1265	/// \returns An integer containing the comparison results. If either of the two
1266	/// lower 32-bit values is NaN, 0 is returned.
1267	static __inline__ int __DEFAULT_FN_ATTRS
1268	_mm_ucomige_ss(__m128 __a, __m128 __b)
1269	{
1270	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1271	}
1272
1273	/// Performs an unordered comparison of two 32-bit float values using
1274	/// the low-order bits of both operands to determine inequality and returns
1275	/// the result of the comparison.
1276	///
1277	/// If either of the two lower 32-bit values is NaN, 1 is returned.
1278	///
1279	/// \headerfile <x86intrin.h>
1280	///
1281	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1282	///
1283	/// \param __a
1284	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285	/// used in the comparison.
1286	/// \param __b
1287	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1288	/// used in the comparison.
1289	/// \returns An integer containing the comparison results. If either of the two
1290	/// lower 32-bit values is NaN, 1 is returned.
1291	static __inline__ int __DEFAULT_FN_ATTRS
1292	_mm_ucomineq_ss(__m128 __a, __m128 __b)
1293	{
1294	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1295	}
1296
1297	/// Converts a float value contained in the lower 32 bits of a vector of
1298	/// [4 x float] into a 32-bit integer.
1299	///
1300	/// \headerfile <x86intrin.h>
1301	///
1302	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1303	/// instructions.
1304	///
1305	/// \param __a
1306	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1307	/// used in the conversion.
1308	/// \returns A 32-bit integer containing the converted value.
1309	static __inline__ int __DEFAULT_FN_ATTRS
1310	_mm_cvtss_si32(__m128 __a)
1311	{
1312	return __builtin_ia32_cvtss2si((__v4sf)__a);
1313	}
1314
1315	/// Converts a float value contained in the lower 32 bits of a vector of
1316	/// [4 x float] into a 32-bit integer.
1317	///
1318	/// \headerfile <x86intrin.h>
1319	///
1320	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1321	/// instructions.
1322	///
1323	/// \param __a
1324	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1325	/// used in the conversion.
1326	/// \returns A 32-bit integer containing the converted value.
1327	static __inline__ int __DEFAULT_FN_ATTRS
1328	_mm_cvt_ss2si(__m128 __a)
1329	{
1330	return _mm_cvtss_si32(__a);
1331	}
1332
1333	#ifdef __x86_64__
1334
1335	/// Converts a float value contained in the lower 32 bits of a vector of
1336	/// [4 x float] into a 64-bit integer.
1337	///
1338	/// \headerfile <x86intrin.h>
1339	///
1340	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1341	/// instructions.
1342	///
1343	/// \param __a
1344	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1345	/// used in the conversion.
1346	/// \returns A 64-bit integer containing the converted value.
1347	static __inline__ long long __DEFAULT_FN_ATTRS
1348	_mm_cvtss_si64(__m128 __a)
1349	{
1350	return __builtin_ia32_cvtss2si64((__v4sf)__a);
1351	}
1352
1353	#endif
1354
1355	/// Converts two low-order float values in a 128-bit vector of
1356	/// [4 x float] into a 64-bit vector of [2 x i32].
1357	///
1358	/// \headerfile <x86intrin.h>
1359	///
1360	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1361	///
1362	/// \param __a
1363	/// A 128-bit vector of [4 x float].
1364	/// \returns A 64-bit integer vector containing the converted values.
1365	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1366	_mm_cvtps_pi32(__m128 __a)
1367	{
1368	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1369	}
1370
1371	/// Converts two low-order float values in a 128-bit vector of
1372	/// [4 x float] into a 64-bit vector of [2 x i32].
1373	///
1374	/// \headerfile <x86intrin.h>
1375	///
1376	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1377	///
1378	/// \param __a
1379	/// A 128-bit vector of [4 x float].
1380	/// \returns A 64-bit integer vector containing the converted values.
1381	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1382	_mm_cvt_ps2pi(__m128 __a)
1383	{
1384	return _mm_cvtps_pi32(__a);
1385	}
1386
1387	/// Converts a float value contained in the lower 32 bits of a vector of
1388	/// [4 x float] into a 32-bit integer, truncating the result when it is
1389	/// inexact.
1390	///
1391	/// \headerfile <x86intrin.h>
1392	///
1393	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1394	/// instructions.
1395	///
1396	/// \param __a
1397	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1398	/// used in the conversion.
1399	/// \returns A 32-bit integer containing the converted value.
1400	static __inline__ int __DEFAULT_FN_ATTRS
1401	_mm_cvttss_si32(__m128 __a)
1402	{
1403	return __builtin_ia32_cvttss2si((__v4sf)__a);
1404	}
1405
1406	/// Converts a float value contained in the lower 32 bits of a vector of
1407	/// [4 x float] into a 32-bit integer, truncating the result when it is
1408	/// inexact.
1409	///
1410	/// \headerfile <x86intrin.h>
1411	///
1412	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1413	/// instructions.
1414	///
1415	/// \param __a
1416	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1417	/// used in the conversion.
1418	/// \returns A 32-bit integer containing the converted value.
1419	static __inline__ int __DEFAULT_FN_ATTRS
1420	_mm_cvtt_ss2si(__m128 __a)
1421	{
1422	return _mm_cvttss_si32(__a);
1423	}
1424
1425	#ifdef __x86_64__
1426	/// Converts a float value contained in the lower 32 bits of a vector of
1427	/// [4 x float] into a 64-bit integer, truncating the result when it is
1428	/// inexact.
1429	///
1430	/// \headerfile <x86intrin.h>
1431	///
1432	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1433	/// instructions.
1434	///
1435	/// \param __a
1436	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1437	/// used in the conversion.
1438	/// \returns A 64-bit integer containing the converted value.
1439	static __inline__ long long __DEFAULT_FN_ATTRS
1440	_mm_cvttss_si64(__m128 __a)
1441	{
1442	return __builtin_ia32_cvttss2si64((__v4sf)__a);
1443	}
1444	#endif
1445
1446	/// Converts two low-order float values in a 128-bit vector of
1447	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1448	/// when it is inexact.
1449	///
1450	/// \headerfile <x86intrin.h>
1451	///
1452	/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1453	/// instructions.
1454	///
1455	/// \param __a
1456	/// A 128-bit vector of [4 x float].
1457	/// \returns A 64-bit integer vector containing the converted values.
1458	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1459	_mm_cvttps_pi32(__m128 __a)
1460	{
1461	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1462	}
1463
1464	/// Converts two low-order float values in a 128-bit vector of [4 x
1465	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
1466	/// is inexact.
1467	///
1468	/// \headerfile <x86intrin.h>
1469	///
1470	/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1471	///
1472	/// \param __a
1473	/// A 128-bit vector of [4 x float].
1474	/// \returns A 64-bit integer vector containing the converted values.
1475	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1476	_mm_cvtt_ps2pi(__m128 __a)
1477	{
1478	return _mm_cvttps_pi32(__a);
1479	}
1480
1481	/// Converts a 32-bit signed integer value into a floating point value
1482	/// and writes it to the lower 32 bits of the destination. The remaining
1483	/// higher order elements of the destination vector are copied from the
1484	/// corresponding elements in the first operand.
1485	///
1486	/// \headerfile <x86intrin.h>
1487	///
1488	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1489	///
1490	/// \param __a
1491	/// A 128-bit vector of [4 x float].
1492	/// \param __b
1493	/// A 32-bit signed integer operand containing the value to be converted.
1494	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1495	/// converted value of the second operand. The upper 96 bits are copied from
1496	/// the upper 96 bits of the first operand.
1497	static __inline__ __m128 __DEFAULT_FN_ATTRS
1498	_mm_cvtsi32_ss(__m128 __a, int __b)
1499	{
1500	__a[0] = __b;
1501	return __a;
1502	}
1503
1504	/// Converts a 32-bit signed integer value into a floating point value
1505	/// and writes it to the lower 32 bits of the destination. The remaining
1506	/// higher order elements of the destination are copied from the
1507	/// corresponding elements in the first operand.
1508	///
1509	/// \headerfile <x86intrin.h>
1510	///
1511	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1512	///
1513	/// \param __a
1514	/// A 128-bit vector of [4 x float].
1515	/// \param __b
1516	/// A 32-bit signed integer operand containing the value to be converted.
1517	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1518	/// converted value of the second operand. The upper 96 bits are copied from
1519	/// the upper 96 bits of the first operand.
1520	static __inline__ __m128 __DEFAULT_FN_ATTRS
1521	_mm_cvt_si2ss(__m128 __a, int __b)
1522	{
1523	return _mm_cvtsi32_ss(__a, __b);
1524	}
1525
1526	#ifdef __x86_64__
1527
1528	/// Converts a 64-bit signed integer value into a floating point value
1529	/// and writes it to the lower 32 bits of the destination. The remaining
1530	/// higher order elements of the destination are copied from the
1531	/// corresponding elements in the first operand.
1532	///
1533	/// \headerfile <x86intrin.h>
1534	///
1535	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1536	///
1537	/// \param __a
1538	/// A 128-bit vector of [4 x float].
1539	/// \param __b
1540	/// A 64-bit signed integer operand containing the value to be converted.
1541	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1542	/// converted value of the second operand. The upper 96 bits are copied from
1543	/// the upper 96 bits of the first operand.
1544	static __inline__ __m128 __DEFAULT_FN_ATTRS
1545	_mm_cvtsi64_ss(__m128 __a, long long __b)
1546	{
1547	__a[0] = __b;
1548	return __a;
1549	}
1550
1551	#endif
1552
1553	/// Converts two elements of a 64-bit vector of [2 x i32] into two
1554	/// floating point values and writes them to the lower 64-bits of the
1555	/// destination. The remaining higher order elements of the destination are
1556	/// copied from the corresponding elements in the first operand.
1557	///
1558	/// \headerfile <x86intrin.h>
1559	///
1560	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1561	///
1562	/// \param __a
1563	/// A 128-bit vector of [4 x float].
1564	/// \param __b
1565	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1566	/// and written to the corresponding low-order elements in the destination.
1567	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1568	/// converted value of the second operand. The upper 64 bits are copied from
1569	/// the upper 64 bits of the first operand.
1570	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1571	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1572	{
1573	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1574	}
1575
1576	/// Converts two elements of a 64-bit vector of [2 x i32] into two
1577	/// floating point values and writes them to the lower 64-bits of the
1578	/// destination. The remaining higher order elements of the destination are
1579	/// copied from the corresponding elements in the first operand.
1580	///
1581	/// \headerfile <x86intrin.h>
1582	///
1583	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1584	///
1585	/// \param __a
1586	/// A 128-bit vector of [4 x float].
1587	/// \param __b
1588	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1589	/// and written to the corresponding low-order elements in the destination.
1590	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1591	/// converted value from the second operand. The upper 64 bits are copied
1592	/// from the upper 64 bits of the first operand.
1593	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1594	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1595	{
1596	return _mm_cvtpi32_ps(__a, __b);
1597	}
1598
1599	/// Extracts a float value contained in the lower 32 bits of a vector of
1600	/// [4 x float].
1601	///
1602	/// \headerfile <x86intrin.h>
1603	///
1604	/// This intrinsic has no corresponding instruction.
1605	///
1606	/// \param __a
1607	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1608	/// used in the extraction.
1609	/// \returns A 32-bit float containing the extracted value.
1610	static __inline__ float __DEFAULT_FN_ATTRS
1611	_mm_cvtss_f32(__m128 __a)
1612	{
1613	return __a[0];
1614	}
1615
1616	/// Loads two packed float values from the address \a __p into the
1617	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1618	/// are copied from the low-order bits of the first operand.
1619	///
1620	/// \headerfile <x86intrin.h>
1621	///
1622	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1623	///
1624	/// \param __a
1625	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1626	/// of the destination.
1627	/// \param __p
1628	/// A pointer to two packed float values. Bits [63:0] are written to bits
1629	/// [127:64] of the destination.
1630	/// \returns A 128-bit vector of [4 x float] containing the moved values.
1631	static __inline__ __m128 __DEFAULT_FN_ATTRS
1632	_mm_loadh_pi(__m128 __a, const __m64 *__p)
1633	{
1634	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1635	struct __mm_loadh_pi_struct {
1636	__mm_loadh_pi_v2f32 __u;
1637	} __attribute__((__packed__, __may_alias__));
1638	__mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1639	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1640	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1641	}
1642
1643	/// Loads two packed float values from the address \a __p into the
1644	/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1645	/// are copied from the high-order bits of the first operand.
1646	///
1647	/// \headerfile <x86intrin.h>
1648	///
1649	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1650	///
1651	/// \param __a
1652	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1653	/// [127:64] of the destination.
1654	/// \param __p
1655	/// A pointer to two packed float values. Bits [63:0] are written to bits
1656	/// [63:0] of the destination.
1657	/// \returns A 128-bit vector of [4 x float] containing the moved values.
1658	static __inline__ __m128 __DEFAULT_FN_ATTRS
1659	_mm_loadl_pi(__m128 __a, const __m64 *__p)
1660	{
1661	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1662	struct __mm_loadl_pi_struct {
1663	__mm_loadl_pi_v2f32 __u;
1664	} __attribute__((__packed__, __may_alias__));
1665	__mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1666	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1667	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1668	}
1669
1670	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1671	/// 32 bits of the vector are initialized with the single-precision
1672	/// floating-point value loaded from a specified memory location. The upper
1673	/// 96 bits are set to zero.
1674	///
1675	/// \headerfile <x86intrin.h>
1676	///
1677	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1678	///
1679	/// \param __p
1680	/// A pointer to a 32-bit memory location containing a single-precision
1681	/// floating-point value.
1682	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1683	/// lower 32 bits contain the value loaded from the memory location. The
1684	/// upper 96 bits are set to zero.
1685	static __inline__ __m128 __DEFAULT_FN_ATTRS
1686	_mm_load_ss(const float *__p)
1687	{
1688	struct __mm_load_ss_struct {
1689	float __u;
1690	} __attribute__((__packed__, __may_alias__));
1691	float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1692	return __extension__ (__m128){ __u, 0, 0, 0 };
1693	}
1694
1695	/// Loads a 32-bit float value and duplicates it to all four vector
1696	/// elements of a 128-bit vector of [4 x float].
1697	///
1698	/// \headerfile <x86intrin.h>
1699	///
1700	/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1701	/// instruction.
1702	///
1703	/// \param __p
1704	/// A pointer to a float value to be loaded and duplicated.
1705	/// \returns A 128-bit vector of [4 x float] containing the loaded and
1706	/// duplicated values.
1707	static __inline__ __m128 __DEFAULT_FN_ATTRS
1708	_mm_load1_ps(const float *__p)
1709	{
1710	struct __mm_load1_ps_struct {
1711	float __u;
1712	} __attribute__((__packed__, __may_alias__));
1713	float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1714	return __extension__ (__m128){ __u, __u, __u, __u };
1715	}
1716
1717	#define _mm_load_ps1(p) _mm_load1_ps(p)
1718
1719	/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1720	/// memory location.
1721	///
1722	/// \headerfile <x86intrin.h>
1723	///
1724	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1725	///
1726	/// \param __p
1727	/// A pointer to a 128-bit memory location. The address of the memory
1728	/// location has to be 128-bit aligned.
1729	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1730	static __inline__ __m128 __DEFAULT_FN_ATTRS
1731	_mm_load_ps(const float *__p)
1732	{
1733	return (const __m128)__p;
1734	}
1735
1736	/// Loads a 128-bit floating-point vector of [4 x float] from an
1737	/// unaligned memory location.
1738	///
1739	/// \headerfile <x86intrin.h>
1740	///
1741	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1742	///
1743	/// \param __p
1744	/// A pointer to a 128-bit memory location. The address of the memory
1745	/// location does not have to be aligned.
1746	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1747	static __inline__ __m128 __DEFAULT_FN_ATTRS
1748	_mm_loadu_ps(const float *__p)
1749	{
1750	struct __loadu_ps {
1751	__m128_u __v;
1752	} __attribute__((__packed__, __may_alias__));
1753	return ((const struct __loadu_ps*)__p)->__v;
1754	}
1755
1756	/// Loads four packed float values, in reverse order, from an aligned
1757	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1758	///
1759	/// \headerfile <x86intrin.h>
1760	///
1761	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1762	/// instruction.
1763	///
1764	/// \param __p
1765	/// A pointer to a 128-bit memory location. The address of the memory
1766	/// location has to be 128-bit aligned.
1767	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1768	/// in reverse order.
1769	static __inline__ __m128 __DEFAULT_FN_ATTRS
1770	_mm_loadr_ps(const float *__p)
1771	{
1772	__m128 __a = _mm_load_ps(__p);
1773	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1774	}
1775
1776	/// Create a 128-bit vector of [4 x float] with undefined values.
1777	///
1778	/// \headerfile <x86intrin.h>
1779	///
1780	/// This intrinsic has no corresponding instruction.
1781	///
1782	/// \returns A 128-bit vector of [4 x float] containing undefined values.
1783	static __inline__ __m128 __DEFAULT_FN_ATTRS
1784	_mm_undefined_ps(void)
1785	{
1786	return (__m128)__builtin_ia32_undef128();
1787	}
1788
1789	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1790	/// 32 bits of the vector are initialized with the specified single-precision
1791	/// floating-point value. The upper 96 bits are set to zero.
1792	///
1793	/// \headerfile <x86intrin.h>
1794	///
1795	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1796	///
1797	/// \param __w
1798	/// A single-precision floating-point value used to initialize the lower 32
1799	/// bits of the result.
1800	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1801	/// lower 32 bits contain the value provided in the source operand. The
1802	/// upper 96 bits are set to zero.
1803	static __inline__ __m128 __DEFAULT_FN_ATTRS
1804	_mm_set_ss(float __w)
1805	{
1806	return __extension__ (__m128){ __w, 0, 0, 0 };
1807	}
1808
1809	/// Constructs a 128-bit floating-point vector of [4 x float], with each
1810	/// of the four single-precision floating-point vector elements set to the
1811	/// specified single-precision floating-point value.
1812	///
1813	/// \headerfile <x86intrin.h>
1814	///
1815	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1816	///
1817	/// \param __w
1818	/// A single-precision floating-point value used to initialize each vector
1819	/// element of the result.
1820	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1821	static __inline__ __m128 __DEFAULT_FN_ATTRS
1822	_mm_set1_ps(float __w)
1823	{
1824	return __extension__ (__m128){ __w, __w, __w, __w };
1825	}
1826
1827	/* Microsoft specific. */
1828	/// Constructs a 128-bit floating-point vector of [4 x float], with each
1829	/// of the four single-precision floating-point vector elements set to the
1830	/// specified single-precision floating-point value.
1831	///
1832	/// \headerfile <x86intrin.h>
1833	///
1834	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1835	///
1836	/// \param __w
1837	/// A single-precision floating-point value used to initialize each vector
1838	/// element of the result.
1839	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1840	static __inline__ __m128 __DEFAULT_FN_ATTRS
1841	_mm_set_ps1(float __w)
1842	{
1843	return _mm_set1_ps(__w);
1844	}
1845
1846	/// Constructs a 128-bit floating-point vector of [4 x float]
1847	/// initialized with the specified single-precision floating-point values.
1848	///
1849	/// \headerfile <x86intrin.h>
1850	///
1851	/// This intrinsic is a utility function and does not correspond to a specific
1852	/// instruction.
1853	///
1854	/// \param __z
1855	/// A single-precision floating-point value used to initialize bits [127:96]
1856	/// of the result.
1857	/// \param __y
1858	/// A single-precision floating-point value used to initialize bits [95:64]
1859	/// of the result.
1860	/// \param __x
1861	/// A single-precision floating-point value used to initialize bits [63:32]
1862	/// of the result.
1863	/// \param __w
1864	/// A single-precision floating-point value used to initialize bits [31:0]
1865	/// of the result.
1866	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1867	static __inline__ __m128 __DEFAULT_FN_ATTRS
1868	_mm_set_ps(float __z, float __y, float __x, float __w)
1869	{
1870	return __extension__ (__m128){ __w, __x, __y, __z };
1871	}
1872
1873	/// Constructs a 128-bit floating-point vector of [4 x float],
1874	/// initialized in reverse order with the specified 32-bit single-precision
1875	/// float-point values.
1876	///
1877	/// \headerfile <x86intrin.h>
1878	///
1879	/// This intrinsic is a utility function and does not correspond to a specific
1880	/// instruction.
1881	///
1882	/// \param __z
1883	/// A single-precision floating-point value used to initialize bits [31:0]
1884	/// of the result.
1885	/// \param __y
1886	/// A single-precision floating-point value used to initialize bits [63:32]
1887	/// of the result.
1888	/// \param __x
1889	/// A single-precision floating-point value used to initialize bits [95:64]
1890	/// of the result.
1891	/// \param __w
1892	/// A single-precision floating-point value used to initialize bits [127:96]
1893	/// of the result.
1894	/// \returns An initialized 128-bit floating-point vector of [4 x float].
1895	static __inline__ __m128 __DEFAULT_FN_ATTRS
1896	_mm_setr_ps(float __z, float __y, float __x, float __w)
1897	{
1898	return __extension__ (__m128){ __z, __y, __x, __w };
1899	}
1900
1901	/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1902	/// to zero.
1903	///
1904	/// \headerfile <x86intrin.h>
1905	///
1906	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1907	///
1908	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1909	/// all elements set to zero.
1910	static __inline__ __m128 __DEFAULT_FN_ATTRS
1911	_mm_setzero_ps(void)
1912	{
1913	return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
1914	}
1915
1916	/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1917	/// memory location.
1918	///
1919	/// \headerfile <x86intrin.h>
1920	///
1921	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1922	///
1923	/// \param __p
1924	/// A pointer to a 64-bit memory location.
1925	/// \param __a
1926	/// A 128-bit vector of [4 x float] containing the values to be stored.
1927	static __inline__ void __DEFAULT_FN_ATTRS
1928	_mm_storeh_pi(__m64 *__p, __m128 __a)
1929	{
1930	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1931	struct __mm_storeh_pi_struct {
1932	__mm_storeh_pi_v2f32 __u;
1933	} __attribute__((__packed__, __may_alias__));
1934	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1935	}
1936
1937	/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1938	/// memory location.
1939	///
1940	/// \headerfile <x86intrin.h>
1941	///
1942	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1943	///
1944	/// \param __p
1945	/// A pointer to a memory location that will receive the float values.
1946	/// \param __a
1947	/// A 128-bit vector of [4 x float] containing the values to be stored.
1948	static __inline__ void __DEFAULT_FN_ATTRS
1949	_mm_storel_pi(__m64 *__p, __m128 __a)
1950	{
1951	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1952	struct __mm_storeh_pi_struct {
1953	__mm_storeh_pi_v2f32 __u;
1954	} __attribute__((__packed__, __may_alias__));
1955	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1956	}
1957
1958	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1959	/// memory location.
1960	///
1961	/// \headerfile <x86intrin.h>
1962	///
1963	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1964	///
1965	/// \param __p
1966	/// A pointer to a 32-bit memory location.
1967	/// \param __a
1968	/// A 128-bit vector of [4 x float] containing the value to be stored.
1969	static __inline__ void __DEFAULT_FN_ATTRS
1970	_mm_store_ss(float *__p, __m128 __a)
1971	{
1972	struct __mm_store_ss_struct {
1973	float __u;
1974	} __attribute__((__packed__, __may_alias__));
1975	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1976	}
1977
1978	/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1979	/// location.
1980	///
1981	/// \headerfile <x86intrin.h>
1982	///
1983	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1984	///
1985	/// \param __p
1986	/// A pointer to a 128-bit memory location. The address of the memory
1987	/// location does not have to be aligned.
1988	/// \param __a
1989	/// A 128-bit vector of [4 x float] containing the values to be stored.
1990	static __inline__ void __DEFAULT_FN_ATTRS
1991	_mm_storeu_ps(float *__p, __m128 __a)
1992	{
1993	struct __storeu_ps {
1994	__m128_u __v;
1995	} __attribute__((__packed__, __may_alias__));
1996	((struct __storeu_ps*)__p)->__v = __a;
1997	}
1998
1999	/// Stores a 128-bit vector of [4 x float] into an aligned memory
2000	/// location.
2001	///
2002	/// \headerfile <x86intrin.h>
2003	///
2004	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2005	///
2006	/// \param __p
2007	/// A pointer to a 128-bit memory location. The address of the memory
2008	/// location has to be 16-byte aligned.
2009	/// \param __a
2010	/// A 128-bit vector of [4 x float] containing the values to be stored.
2011	static __inline__ void __DEFAULT_FN_ATTRS
2012	_mm_store_ps(float *__p, __m128 __a)
2013	{
2014	(__m128)__p = __a;
2015	}
2016
2017	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2018	/// four contiguous elements in an aligned memory location.
2019	///
2020	/// \headerfile <x86intrin.h>
2021	///
2022	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2023	/// instruction.
2024	///
2025	/// \param __p
2026	/// A pointer to a 128-bit memory location.
2027	/// \param __a
2028	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2029	/// of the four contiguous elements pointed by \a __p.
2030	static __inline__ void __DEFAULT_FN_ATTRS
2031	_mm_store1_ps(float *__p, __m128 __a)
2032	{
2033	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2034	_mm_store_ps(__p, __a);
2035	}
2036
2037	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2038	/// four contiguous elements in an aligned memory location.
2039	///
2040	/// \headerfile <x86intrin.h>
2041	///
2042	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2043	/// instruction.
2044	///
2045	/// \param __p
2046	/// A pointer to a 128-bit memory location.
2047	/// \param __a
2048	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2049	/// of the four contiguous elements pointed by \a __p.
2050	static __inline__ void __DEFAULT_FN_ATTRS
2051	_mm_store_ps1(float *__p, __m128 __a)
2052	{
2053	_mm_store1_ps(__p, __a);
2054	}
2055
2056	/// Stores float values from a 128-bit vector of [4 x float] to an
2057	/// aligned memory location in reverse order.
2058	///
2059	/// \headerfile <x86intrin.h>
2060	///
2061	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2062	/// instruction.
2063	///
2064	/// \param __p
2065	/// A pointer to a 128-bit memory location. The address of the memory
2066	/// location has to be 128-bit aligned.
2067	/// \param __a
2068	/// A 128-bit vector of [4 x float] containing the values to be stored.
2069	static __inline__ void __DEFAULT_FN_ATTRS
2070	_mm_storer_ps(float *__p, __m128 __a)
2071	{
2072	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2073	_mm_store_ps(__p, __a);
2074	}
2075
/* Prefetch hint selectors for _mm_prefetch, listed by increasing value.
   _mm_prefetch forwards bit 2 of the selector as the read/write argument of
   __builtin_prefetch and bits [1:0] as its locality argument. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2082
#ifndef _MSC_VER
/* FIXME: This has to be a macro because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is forwarded as the read/write argument of
///    __builtin_prefetch and bits [1:0] as its locality argument.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) & 0x4) >> 2, (sel) & 0x3))
#endif
2114
2115	/// Stores a 64-bit integer in the specified aligned memory location. To
2116	/// minimize caching, the data is flagged as non-temporal (unlikely to be
2117	/// used again soon).
2118	///
2119	/// \headerfile <x86intrin.h>
2120	///
2121	/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2122	///
2123	/// \param __p
2124	/// A pointer to an aligned memory location used to store the register value.
2125	/// \param __a
2126	/// A 64-bit integer containing the value to be stored.
2127	static __inline__ void __DEFAULT_FN_ATTRS_MMX
2128	_mm_stream_pi(void *__p, __m64 __a)
2129	{
2130	__builtin_ia32_movntq((__m64 *)__p, __a);
2131	}
2132
2133	/// Moves packed float values from a 128-bit vector of [4 x float] to a
2134	/// 128-bit aligned memory location. To minimize caching, the data is flagged
2135	/// as non-temporal (unlikely to be used again soon).
2136	///
2137	/// \headerfile <x86intrin.h>
2138	///
2139	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2140	///
2141	/// \param __p
2142	/// A pointer to a 128-bit aligned memory location that will receive the
2143	/// single-precision floating-point values.
2144	/// \param __a
2145	/// A 128-bit vector of [4 x float] containing the values to be moved.
2146	static __inline__ void __DEFAULT_FN_ATTRS
2147	_mm_stream_ps(void *__p, __m128 __a)
2148	{
2149	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2150	}
2151
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and the store instructions
///    that follow it, ensuring the system completes all previous stores
///    before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2170
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Each macro argument is parenthesized so the casts apply to the whole
   argument expression rather than only to its first operand. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(a), (int)(n)))
2193
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] are written. \n
///    1: Bits [31:16] are written. \n
///    2: Bits [47:32] are written. \n
///    3: Bits [63:48] are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Each macro argument is parenthesized so the casts apply to the whole
   argument expression rather than only to its first operand. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(a), (int)(d), (int)(n)))
2224
2225	/// Compares each of the corresponding packed 16-bit integer values of
2226	/// the 64-bit integer vectors, and writes the greater value to the
2227	/// corresponding bits in the destination.
2228	///
2229	/// \headerfile <x86intrin.h>
2230	///
2231	/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2232	///
2233	/// \param __a
2234	/// A 64-bit integer vector containing one of the source operands.
2235	/// \param __b
2236	/// A 64-bit integer vector containing one of the source operands.
2237	/// \returns A 64-bit integer vector containing the comparison results.
2238	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2239	_mm_max_pi16(__m64 __a, __m64 __b)
2240	{
2241	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2242	}
2243
2244	/// Compares each of the corresponding packed 8-bit unsigned integer
2245	/// values of the 64-bit integer vectors, and writes the greater value to the
2246	/// corresponding bits in the destination.
2247	///
2248	/// \headerfile <x86intrin.h>
2249	///
2250	/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2251	///
2252	/// \param __a
2253	/// A 64-bit integer vector containing one of the source operands.
2254	/// \param __b
2255	/// A 64-bit integer vector containing one of the source operands.
2256	/// \returns A 64-bit integer vector containing the comparison results.
2257	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2258	_mm_max_pu8(__m64 __a, __m64 __b)
2259	{
2260	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2261	}
2262
2263	/// Compares each of the corresponding packed 16-bit integer values of
2264	/// the 64-bit integer vectors, and writes the lesser value to the
2265	/// corresponding bits in the destination.
2266	///
2267	/// \headerfile <x86intrin.h>
2268	///
2269	/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2270	///
2271	/// \param __a
2272	/// A 64-bit integer vector containing one of the source operands.
2273	/// \param __b
2274	/// A 64-bit integer vector containing one of the source operands.
2275	/// \returns A 64-bit integer vector containing the comparison results.
2276	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2277	_mm_min_pi16(__m64 __a, __m64 __b)
2278	{
2279	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2280	}
2281
2282	/// Compares each of the corresponding packed 8-bit unsigned integer
2283	/// values of the 64-bit integer vectors, and writes the lesser value to the
2284	/// corresponding bits in the destination.
2285	///
2286	/// \headerfile <x86intrin.h>
2287	///
2288	/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2289	///
2290	/// \param __a
2291	/// A 64-bit integer vector containing one of the source operands.
2292	/// \param __b
2293	/// A 64-bit integer vector containing one of the source operands.
2294	/// \returns A 64-bit integer vector containing the comparison results.
2295	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2296	_mm_min_pu8(__m64 __a, __m64 __b)
2297	{
2298	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2299	}
2300
2301	/// Takes the most significant bit from each 8-bit element in a 64-bit
2302	/// integer vector to create an 8-bit mask value. Zero-extends the value to
2303	/// 32-bit integer and writes it to the destination.
2304	///
2305	/// \headerfile <x86intrin.h>
2306	///
2307	/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2308	///
2309	/// \param __a
2310	/// A 64-bit integer vector containing the values with bits to be extracted.
2311	/// \returns The most significant bit from each 8-bit element in \a __a,
2312	/// written to bits [7:0].
2313	static __inline__ int __DEFAULT_FN_ATTRS_MMX
2314	_mm_movemask_pi8(__m64 __a)
2315	{
2316	return __builtin_ia32_pmovmskb((__v8qi)__a);
2317	}
2318
2319	/// Multiplies packed 16-bit unsigned integer values and writes the
2320	/// high-order 16 bits of each 32-bit product to the corresponding bits in
2321	/// the destination.
2322	///
2323	/// \headerfile <x86intrin.h>
2324	///
2325	/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2326	///
2327	/// \param __a
2328	/// A 64-bit integer vector containing one of the source operands.
2329	/// \param __b
2330	/// A 64-bit integer vector containing one of the source operands.
2331	/// \returns A 64-bit integer vector containing the products of both operands.
2332	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2333	_mm_mulhi_pu16(__m64 __a, __m64 __b)
2334	{
2335	return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2336	}
2337
/// Shuffles the four 16-bit integers of a 64-bit integer vector into the
///    destination, as selected by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. Each 2-bit field of \a n selects the source for one
///    16-bit field of the destination: \n
///    Bits [1:0] select the value written to bits [15:0] of the
///    destination. \n
///    Bits [3:2] select the value written to bits [31:16] of the
///    destination. \n
///    Bits [5:4] select the value written to bits [47:32] of the
///    destination. \n
///    Bits [7:6] select the value written to bits [63:48] of the
///    destination. \n
///    Field value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2374
2375	/// Conditionally copies the values from each 8-bit element in the first
2376	/// 64-bit integer vector operand to the specified memory location, as
2377	/// specified by the most significant bit in the corresponding element in the
2378	/// second 64-bit integer vector operand.
2379	///
2380	/// To minimize caching, the data is flagged as non-temporal
2381	/// (unlikely to be used again soon).
2382	///
2383	/// \headerfile <x86intrin.h>
2384	///
2385	/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2386	///
2387	/// \param __d
2388	/// A 64-bit integer vector containing the values with elements to be copied.
2389	/// \param __n
2390	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2391	/// element determines whether the corresponding element in operand \a __d
2392	/// is copied. If the most significant bit of a given element is 1, the
2393	/// corresponding element in operand \a __d is copied.
2394	/// \param __p
2395	/// A pointer to a 64-bit memory location that will receive the conditionally
2396	/// copied integer values. The address of the memory location does not have
2397	/// to be aligned.
2398	static __inline__ void __DEFAULT_FN_ATTRS_MMX
2399	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2400	{
2401	__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2402	}
2403
2404	/// Computes the rounded averages of the packed unsigned 8-bit integer
2405	/// values and writes the averages to the corresponding bits in the
2406	/// destination.
2407	///
2408	/// \headerfile <x86intrin.h>
2409	///
2410	/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2411	///
2412	/// \param __a
2413	/// A 64-bit integer vector containing one of the source operands.
2414	/// \param __b
2415	/// A 64-bit integer vector containing one of the source operands.
2416	/// \returns A 64-bit integer vector containing the averages of both operands.
2417	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2418	_mm_avg_pu8(__m64 __a, __m64 __b)
2419	{
2420	return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2421	}
2422
2423	/// Computes the rounded averages of the packed unsigned 16-bit integer
2424	/// values and writes the averages to the corresponding bits in the
2425	/// destination.
2426	///
2427	/// \headerfile <x86intrin.h>
2428	///
2429	/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2430	///
2431	/// \param __a
2432	/// A 64-bit integer vector containing one of the source operands.
2433	/// \param __b
2434	/// A 64-bit integer vector containing one of the source operands.
2435	/// \returns A 64-bit integer vector containing the averages of both operands.
2436	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2437	_mm_avg_pu16(__m64 __a, __m64 __b)
2438	{
2439	return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2440	}
2441
2442	/// Subtracts the corresponding 8-bit unsigned integer values of the two
2443	/// 64-bit vector operands and computes the absolute value for each of the
2444	/// difference. Then sum of the 8 absolute differences is written to the
2445	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2446	///
2447	/// \headerfile <x86intrin.h>
2448	///
2449	/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2450	///
2451	/// \param __a
2452	/// A 64-bit integer vector containing one of the source operands.
2453	/// \param __b
2454	/// A 64-bit integer vector containing one of the source operands.
2455	/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2456	/// sets of absolute differences between both operands. The upper bits are
2457	/// cleared.
2458	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2459	_mm_sad_pu8(__m64 __a, __m64 __b)
2460	{
2461	return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2462	}
2463
#ifdef __cplusplus
extern "C" {
#endif

/// Reads the MXCSR register and returns its contents as a 32-bit unsigned
///    integer.
///
///    Several groups of macros are meant to be used with the returned value:
///    <ul>
///    <li>
///      Exception state flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. Convenience wrapper: _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. Convenience wrapper: _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. Convenience wrapper: _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      Convenience wrapper: _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. Convenience wrapper:
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, this expression is non-zero if an overflow exception has
///    been recorded:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    This expression yields the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);

/// Writes a 32-bit unsigned integer value to the MXCSR register.
///
///    Several groups of macros are meant to be used with this intrinsic:
///    <ul>
///    <li>
///      Exception state flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. Convenience wrapper: _MM_SET_EXCEPTION_STATE(x),
///      where x is one of these macros.
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. Convenience wrapper: _MM_SET_EXCEPTION_MASK(x),
///      where x is one of these macros.
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. Convenience wrapper: _MM_SET_ROUNDING_MODE(x),
///      where x is one of these macros.
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      Convenience wrapper: _MM_SET_FLUSH_ZERO_MODE(x), where x is one of
///      these macros.
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. Convenience wrapper:
///      _MM_SET_DENORMALS_ZERO_MODE(x), where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following statement causes subsequent floating-point
///    operations to round up:
///    \code
///      _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
///    \endcode
///    The wrapper clears the rounding field before inserting the new mode.
///    Merely OR-ing _MM_ROUND_UP into the value returned by _mm_getcsr()
///    selects round-up only when the field was previously _MM_ROUND_NEAREST;
///    with _MM_ROUND_DOWN already set the result is _MM_ROUND_TOWARD_ZERO.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);

#ifdef __cplusplus
} // extern "C"
#endif
2577
/// Builds a 128-bit vector of [4 x float] by picking four float values out
///    of the two 128-bit [4 x float] operands, as directed by an immediate
///    selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///      destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///      destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///      destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///      destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask)                                             \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a),                          \
                                 (__v4sf)(__m128)(b),                          \
                                 (int)(mask)))
2620
2621	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2622	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2623	///
2624	/// \headerfile <x86intrin.h>
2625	///
2626	/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2627	///
2628	/// \param __a
2629	/// A 128-bit vector of [4 x float]. \n
2630	/// Bits [95:64] are written to bits [31:0] of the destination. \n
2631	/// Bits [127:96] are written to bits [95:64] of the destination.
2632	/// \param __b
2633	/// A 128-bit vector of [4 x float].
2634	/// Bits [95:64] are written to bits [63:32] of the destination. \n
2635	/// Bits [127:96] are written to bits [127:96] of the destination.
2636	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2637	static __inline__ __m128 __DEFAULT_FN_ATTRS
2638	_mm_unpackhi_ps(__m128 __a, __m128 __b)
2639	{
2640	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2641	}
2642
2643	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2644	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2645	///
2646	/// \headerfile <x86intrin.h>
2647	///
2648	/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2649	///
2650	/// \param __a
2651	/// A 128-bit vector of [4 x float]. \n
2652	/// Bits [31:0] are written to bits [31:0] of the destination. \n
2653	/// Bits [63:32] are written to bits [95:64] of the destination.
2654	/// \param __b
2655	/// A 128-bit vector of [4 x float]. \n
2656	/// Bits [31:0] are written to bits [63:32] of the destination. \n
2657	/// Bits [63:32] are written to bits [127:96] of the destination.
2658	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2659	static __inline__ __m128 __DEFAULT_FN_ATTRS
2660	_mm_unpacklo_ps(__m128 __a, __m128 __b)
2661	{
2662	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2663	}
2664
2665	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2666	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2667	/// 96 bits are set to the upper 96 bits of the first parameter.
2668	///
2669	/// \headerfile <x86intrin.h>
2670	///
2671	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2672	/// instruction.
2673	///
2674	/// \param __a
2675	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2676	/// written to the upper 96 bits of the result.
2677	/// \param __b
2678	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2679	/// written to the lower 32 bits of the result.
2680	/// \returns A 128-bit floating-point vector of [4 x float].
2681	static __inline__ __m128 __DEFAULT_FN_ATTRS
2682	_mm_move_ss(__m128 __a, __m128 __b)
2683	{
2684	__a[0] = __b[0];
2685	return __a;
2686	}
2687
2688	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2689	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2690	/// 64 bits are set to the upper 64 bits of the first parameter.
2691	///
2692	/// \headerfile <x86intrin.h>
2693	///
2694	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2695	///
2696	/// \param __a
2697	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2698	/// written to the upper 64 bits of the result.
2699	/// \param __b
2700	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2701	/// written to the lower 64 bits of the result.
2702	/// \returns A 128-bit floating-point vector of [4 x float].
2703	static __inline__ __m128 __DEFAULT_FN_ATTRS
2704	_mm_movehl_ps(__m128 __a, __m128 __b)
2705	{
2706	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2707	}
2708
2709	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2710	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2711	/// 64 bits are set to the lower 64 bits of the second parameter.
2712	///
2713	/// \headerfile <x86intrin.h>
2714	///
2715	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2716	///
2717	/// \param __a
2718	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2719	/// written to the lower 64 bits of the result.
2720	/// \param __b
2721	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2722	/// written to the upper 64 bits of the result.
2723	/// \returns A 128-bit floating-point vector of [4 x float].
2724	static __inline__ __m128 __DEFAULT_FN_ATTRS
2725	_mm_movelh_ps(__m128 __a, __m128 __b)
2726	{
2727	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2728	}
2729
2730	/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2731	/// float].
2732	///
2733	/// \headerfile <x86intrin.h>
2734	///
2735	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2736	///
2737	/// \param __a
2738	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2739	/// from the corresponding elements in this operand.
2740	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2741	/// values from the operand.
2742	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2743	_mm_cvtpi16_ps(__m64 __a)
2744	{
2745	__m64 __b, __c;
2746	__m128 __r;
2747
2748	__b = _mm_setzero_si64();
2749	__b = _mm_cmpgt_pi16(__b, __a);
2750	__c = _mm_unpackhi_pi16(__a, __b);
2751	__r = _mm_setzero_ps();
2752	__r = _mm_cvtpi32_ps(__r, __c);
2753	__r = _mm_movelh_ps(__r, __r);
2754	__c = _mm_unpacklo_pi16(__a, __b);
2755	__r = _mm_cvtpi32_ps(__r, __c);
2756
2757	return __r;
2758	}
2759
2760	/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2761	/// 128-bit vector of [4 x float].
2762	///
2763	/// \headerfile <x86intrin.h>
2764	///
2765	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2766	///
2767	/// \param __a
2768	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2769	/// destination are copied from the corresponding elements in this operand.
2770	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2771	/// values from the operand.
2772	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2773	_mm_cvtpu16_ps(__m64 __a)
2774	{
2775	__m64 __b, __c;
2776	__m128 __r;
2777
2778	__b = _mm_setzero_si64();
2779	__c = _mm_unpackhi_pi16(__a, __b);
2780	__r = _mm_setzero_ps();
2781	__r = _mm_cvtpi32_ps(__r, __c);
2782	__r = _mm_movelh_ps(__r, __r);
2783	__c = _mm_unpacklo_pi16(__a, __b);
2784	__r = _mm_cvtpi32_ps(__r, __c);
2785
2786	return __r;
2787	}
2788
2789	/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2790	/// into a 128-bit vector of [4 x float].
2791	///
2792	/// \headerfile <x86intrin.h>
2793	///
2794	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2795	///
2796	/// \param __a
2797	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2798	/// from the corresponding lower 4 elements in this operand.
2799	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2800	/// values from the operand.
2801	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2802	_mm_cvtpi8_ps(__m64 __a)
2803	{
2804	__m64 __b;
2805
2806	__b = _mm_setzero_si64();
2807	__b = _mm_cmpgt_pi8(__b, __a);
2808	__b = _mm_unpacklo_pi8(__a, __b);
2809
2810	return _mm_cvtpi16_ps(__b);
2811	}
2812
2813	/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2814	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2815	///
2816	/// \headerfile <x86intrin.h>
2817	///
2818	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2819	///
2820	/// \param __a
2821	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2822	/// destination are copied from the corresponding lower 4 elements in this
2823	/// operand.
2824	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2825	/// values from the source operand.
2826	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2827	_mm_cvtpu8_ps(__m64 __a)
2828	{
2829	__m64 __b;
2830
2831	__b = _mm_setzero_si64();
2832	__b = _mm_unpacklo_pi8(__a, __b);
2833
2834	return _mm_cvtpi16_ps(__b);
2835	}
2836
2837	/// Converts the two 32-bit signed integer values from each 64-bit vector
2838	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2839	///
2840	/// \headerfile <x86intrin.h>
2841	///
2842	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2843	///
2844	/// \param __a
2845	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2846	/// copied from the elements in this operand.
2847	/// \param __b
2848	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2849	/// copied from the elements in this operand.
2850	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2851	/// copied and converted values from the first operand. The upper 64 bits
2852	/// contain the copied and converted values from the second operand.
2853	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2854	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2855	{
2856	__m128 __c;
2857
2858	__c = _mm_setzero_ps();
2859	__c = _mm_cvtpi32_ps(__c, __b);
2860	__c = _mm_movelh_ps(__c, __c);
2861
2862	return _mm_cvtpi32_ps(__c, __a);
2863	}
2864
2865	/// Converts each single-precision floating-point element of a 128-bit
2866	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2867	/// packs the results into a 64-bit integer vector of [4 x i16].
2868	///
2869	/// If the floating-point element is NaN or infinity, or if the
2870	/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2871	/// it is converted to 0x8000. Otherwise if the floating-point element is
2872	/// greater than 0x7FFF, it is converted to 0x7FFF.
2873	///
2874	/// \headerfile <x86intrin.h>
2875	///
2876	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2877	///
2878	/// \param __a
2879	/// A 128-bit floating-point vector of [4 x float].
2880	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2881	/// values.
2882	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2883	_mm_cvtps_pi16(__m128 __a)
2884	{
2885	__m64 __b, __c;
2886
2887	__b = _mm_cvtps_pi32(__a);
2888	__a = _mm_movehl_ps(__a, __a);
2889	__c = _mm_cvtps_pi32(__a);
2890
2891	return _mm_packs_pi32(__b, __c);
2892	}
2893
2894	/// Converts each single-precision floating-point element of a 128-bit
2895	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2896	/// packs the results into the lower 32 bits of a 64-bit integer vector of
2897	/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2898	///
2899	/// If the floating-point element is NaN or infinity, or if the
2900	/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2901	/// is converted to 0x80. Otherwise if the floating-point element is greater
2902	/// than 0x7F, it is converted to 0x7F.
2903	///
2904	/// \headerfile <x86intrin.h>
2905	///
2906	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2907	///
2908	/// \param __a
2909	/// 128-bit floating-point vector of [4 x float].
2910	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2911	/// converted values and the uppper 32 bits are set to zero.
2912	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2913	_mm_cvtps_pi8(__m128 __a)
2914	{
2915	__m64 __b, __c;
2916
2917	__b = _mm_cvtps_pi16(__a);
2918	__c = _mm_setzero_si64();
2919
2920	return _mm_packs_pi16(__b, __c);
2921	}
2922
2923	/// Extracts the sign bits from each single-precision floating-point
2924	/// element of a 128-bit floating-point vector of [4 x float] and returns the
2925	/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2926	/// to zero.
2927	///
2928	/// \headerfile <x86intrin.h>
2929	///
2930	/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2931	///
2932	/// \param __a
2933	/// A 128-bit floating-point vector of [4 x float].
2934	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2935	/// single-precision floating-point element of the parameter. Bits [31:4] are
2936	/// set to zero.
2937	static __inline__ int __DEFAULT_FN_ATTRS
2938	_mm_movemask_ps(__m128 __a)
2939	{
2940	return __builtin_ia32_movmskps((__v4sf)__a);
2941	}
2942
2943
/* Declaration attribute requesting 16-byte alignment. The reserved spelling
 * __aligned__ is used (as in this header's vector typedefs) so that a user
 * macro named `aligned` cannot interfere. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))

/* Builds the 8-bit selector used by the shuffle intrinsics from four 2-bit
 * element indices: z lands in bits [7:6], y in bits [5:4], x in bits [3:2]
 * and w in bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2947
/* MXCSR exception state flags; _MM_EXCEPT_MASK covers all six. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception masks; _MM_MASK_MASK covers all six. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding modes; _MM_ROUND_MASK covers the two-bit field. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero mode. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)
2973
/* Field readers: each returns the current MXCSR value restricted to one
 * group of bits, so the result can be compared against the matching
 * _MM_MASK_*, _MM_EXCEPT_*, _MM_FLUSH_ZERO_* or _MM_ROUND_* constants. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Field writers: each reads MXCSR, clears one group of bits, ORs in x and
 * writes the result back. All other MXCSR bits are preserved. x is not
 * masked, so it should only contain bits that belong to the field. */
#define _MM_SET_EXCEPTION_MASK(x)                                              \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x)                                               \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2983
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
 * __m128 arguments: on exit row0 holds the former element 0 of every row,
 * row1 the former element 1, and so on.
 *
 * Every argument must be a modifiable lvalue and is evaluated more than
 * once. The temporaries use reserved identifiers so that they cannot shadow
 * caller variables passed as arguments (for example rows named tmp0..tmp3). */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
  do {                                                                         \
    __m128 __mm_t0 = _mm_unpacklo_ps((row0), (row1));                          \
    __m128 __mm_t2 = _mm_unpacklo_ps((row2), (row3));                          \
    __m128 __mm_t1 = _mm_unpackhi_ps((row0), (row1));                          \
    __m128 __mm_t3 = _mm_unpackhi_ps((row2), (row3));                          \
    (row0) = _mm_movelh_ps(__mm_t0, __mm_t2);                                  \
    (row1) = _mm_movehl_ps(__mm_t2, __mm_t0);                                  \
    (row2) = _mm_movelh_ps(__mm_t1, __mm_t3);                                  \
    (row3) = _mm_movehl_ps(__mm_t3, __mm_t1);                                  \
  } while (0)
2996
/* Compatibility aliases: each _m_* name expands to the _mm_* intrinsic
 * listed beside it. */
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_pshufw   _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_psadbw   _mm_sad_pu8
/* NOTE(review): this maps only the bare identifier _m_ to _mm_; the
 * preprocessor does not rewrite identifier prefixes, so other _m_* names are
 * unaffected. Kept as-is for compatibility -- confirm it is still needed. */
#define _m_         _mm_
3012
/* The attribute helper macros are private to this header; remove them so
 * they do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
3015
3016	/* Ugly hack for backwards-compatibility (compatible with gcc) */
3017	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3018	#include <emmintrin.h>
3019	#endif
3020
3021	#endif /* __XMMINTRIN_H */
3022

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/xmmintrin.h