avxintrin.h source code [clang/lib/Headers/avxintrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9
10	#ifndef __IMMINTRIN_H
11	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12	#endif
13
14	#ifndef __AVXINTRIN_H
15	#define __AVXINTRIN_H
16
17	typedef double __v4df __attribute__ ((__vector_size__ (32)));
18	typedef float __v8sf __attribute__ ((__vector_size__ (32)));
19	typedef long long __v4di __attribute__ ((__vector_size__ (32)));
20	typedef int __v8si __attribute__ ((__vector_size__ (32)));
21	typedef short __v16hi __attribute__ ((__vector_size__ (32)));
22	typedef char __v32qi __attribute__ ((__vector_size__ (32)));
23
24	/* Unsigned types */
25	typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
26	typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
27	typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
28	typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
29
30	/* We need an explicitly signed variant for char. Note that this shouldn't
31	* appear in the interface though. */
32	typedef signed char __v32qs __attribute__((__vector_size__(32)));
33
34	typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
35	typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
36	typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
37
38	typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
39	typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
40	typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
42	#ifdef __SSE2__
43	/* Both _Float16 and __bf16 require SSE2 being enabled. */
44	typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45	typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46	typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48	typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49	typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50	#endif
51
52	/* Define the default attributes for the functions in this file. */
53	#define __DEFAULT_FN_ATTRS \
54	__attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
55	__min_vector_width__(256)))
56	#define __DEFAULT_FN_ATTRS128 \
57	__attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
58	__min_vector_width__(128)))
59
60	/* Arithmetic */
61	/// Adds two 256-bit vectors of [4 x double].
62	///
63	/// \headerfile <x86intrin.h>
64	///
65	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
66	///
67	/// \param __a
68	/// A 256-bit vector of [4 x double] containing one of the source operands.
69	/// \param __b
70	/// A 256-bit vector of [4 x double] containing one of the source operands.
71	/// \returns A 256-bit vector of [4 x double] containing the sums of both
72	/// operands.
73	static __inline __m256d __DEFAULT_FN_ATTRS
74	_mm256_add_pd(__m256d __a, __m256d __b)
75	{
76	return (__m256d)((__v4df)__a+(__v4df)__b);
77	}
78
79	/// Adds two 256-bit vectors of [8 x float].
80	///
81	/// \headerfile <x86intrin.h>
82	///
83	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
84	///
85	/// \param __a
86	/// A 256-bit vector of [8 x float] containing one of the source operands.
87	/// \param __b
88	/// A 256-bit vector of [8 x float] containing one of the source operands.
89	/// \returns A 256-bit vector of [8 x float] containing the sums of both
90	/// operands.
91	static __inline __m256 __DEFAULT_FN_ATTRS
92	_mm256_add_ps(__m256 __a, __m256 __b)
93	{
94	return (__m256)((__v8sf)__a+(__v8sf)__b);
95	}
96
97	/// Subtracts two 256-bit vectors of [4 x double].
98	///
99	/// \headerfile <x86intrin.h>
100	///
101	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
102	///
103	/// \param __a
104	/// A 256-bit vector of [4 x double] containing the minuend.
105	/// \param __b
106	/// A 256-bit vector of [4 x double] containing the subtrahend.
107	/// \returns A 256-bit vector of [4 x double] containing the differences between
108	/// both operands.
109	static __inline __m256d __DEFAULT_FN_ATTRS
110	_mm256_sub_pd(__m256d __a, __m256d __b)
111	{
112	return (__m256d)((__v4df)__a-(__v4df)__b);
113	}
114
115	/// Subtracts two 256-bit vectors of [8 x float].
116	///
117	/// \headerfile <x86intrin.h>
118	///
119	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
120	///
121	/// \param __a
122	/// A 256-bit vector of [8 x float] containing the minuend.
123	/// \param __b
124	/// A 256-bit vector of [8 x float] containing the subtrahend.
125	/// \returns A 256-bit vector of [8 x float] containing the differences between
126	/// both operands.
127	static __inline __m256 __DEFAULT_FN_ATTRS
128	_mm256_sub_ps(__m256 __a, __m256 __b)
129	{
130	return (__m256)((__v8sf)__a-(__v8sf)__b);
131	}
132
133	/// Adds the even-indexed values and subtracts the odd-indexed values of
134	/// two 256-bit vectors of [4 x double].
135	///
136	/// \headerfile <x86intrin.h>
137	///
138	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
139	///
140	/// \param __a
141	/// A 256-bit vector of [4 x double] containing the left source operand.
142	/// \param __b
143	/// A 256-bit vector of [4 x double] containing the right source operand.
144	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
145	/// and differences between both operands.
146	static __inline __m256d __DEFAULT_FN_ATTRS
147	_mm256_addsub_pd(__m256d __a, __m256d __b)
148	{
149	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
150	}
151
152	/// Adds the even-indexed values and subtracts the odd-indexed values of
153	/// two 256-bit vectors of [8 x float].
154	///
155	/// \headerfile <x86intrin.h>
156	///
157	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
158	///
159	/// \param __a
160	/// A 256-bit vector of [8 x float] containing the left source operand.
161	/// \param __b
162	/// A 256-bit vector of [8 x float] containing the right source operand.
163	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
164	/// differences between both operands.
165	static __inline __m256 __DEFAULT_FN_ATTRS
166	_mm256_addsub_ps(__m256 __a, __m256 __b)
167	{
168	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
169	}
170
171	/// Divides two 256-bit vectors of [4 x double].
172	///
173	/// \headerfile <x86intrin.h>
174	///
175	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
176	///
177	/// \param __a
178	/// A 256-bit vector of [4 x double] containing the dividend.
179	/// \param __b
180	/// A 256-bit vector of [4 x double] containing the divisor.
181	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
182	/// operands.
183	static __inline __m256d __DEFAULT_FN_ATTRS
184	_mm256_div_pd(__m256d __a, __m256d __b)
185	{
186	return (__m256d)((__v4df)__a/(__v4df)__b);
187	}
188
189	/// Divides two 256-bit vectors of [8 x float].
190	///
191	/// \headerfile <x86intrin.h>
192	///
193	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
194	///
195	/// \param __a
196	/// A 256-bit vector of [8 x float] containing the dividend.
197	/// \param __b
198	/// A 256-bit vector of [8 x float] containing the divisor.
199	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
200	/// operands.
201	static __inline __m256 __DEFAULT_FN_ATTRS
202	_mm256_div_ps(__m256 __a, __m256 __b)
203	{
204	return (__m256)((__v8sf)__a/(__v8sf)__b);
205	}
206
207	/// Compares two 256-bit vectors of [4 x double] and returns the greater
208	/// of each pair of values.
209	///
210	/// \headerfile <x86intrin.h>
211	///
212	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
213	///
214	/// \param __a
215	/// A 256-bit vector of [4 x double] containing one of the operands.
216	/// \param __b
217	/// A 256-bit vector of [4 x double] containing one of the operands.
218	/// \returns A 256-bit vector of [4 x double] containing the maximum values
219	/// between both operands.
220	static __inline __m256d __DEFAULT_FN_ATTRS
221	_mm256_max_pd(__m256d __a, __m256d __b)
222	{
223	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
224	}
225
226	/// Compares two 256-bit vectors of [8 x float] and returns the greater
227	/// of each pair of values.
228	///
229	/// \headerfile <x86intrin.h>
230	///
231	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
232	///
233	/// \param __a
234	/// A 256-bit vector of [8 x float] containing one of the operands.
235	/// \param __b
236	/// A 256-bit vector of [8 x float] containing one of the operands.
237	/// \returns A 256-bit vector of [8 x float] containing the maximum values
238	/// between both operands.
239	static __inline __m256 __DEFAULT_FN_ATTRS
240	_mm256_max_ps(__m256 __a, __m256 __b)
241	{
242	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
243	}
244
245	/// Compares two 256-bit vectors of [4 x double] and returns the lesser
246	/// of each pair of values.
247	///
248	/// \headerfile <x86intrin.h>
249	///
250	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
251	///
252	/// \param __a
253	/// A 256-bit vector of [4 x double] containing one of the operands.
254	/// \param __b
255	/// A 256-bit vector of [4 x double] containing one of the operands.
256	/// \returns A 256-bit vector of [4 x double] containing the minimum values
257	/// between both operands.
258	static __inline __m256d __DEFAULT_FN_ATTRS
259	_mm256_min_pd(__m256d __a, __m256d __b)
260	{
261	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
262	}
263
264	/// Compares two 256-bit vectors of [8 x float] and returns the lesser
265	/// of each pair of values.
266	///
267	/// \headerfile <x86intrin.h>
268	///
269	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
270	///
271	/// \param __a
272	/// A 256-bit vector of [8 x float] containing one of the operands.
273	/// \param __b
274	/// A 256-bit vector of [8 x float] containing one of the operands.
275	/// \returns A 256-bit vector of [8 x float] containing the minimum values
276	/// between both operands.
277	static __inline __m256 __DEFAULT_FN_ATTRS
278	_mm256_min_ps(__m256 __a, __m256 __b)
279	{
280	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
281	}
282
283	/// Multiplies two 256-bit vectors of [4 x double].
284	///
285	/// \headerfile <x86intrin.h>
286	///
287	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
288	///
289	/// \param __a
290	/// A 256-bit vector of [4 x double] containing one of the operands.
291	/// \param __b
292	/// A 256-bit vector of [4 x double] containing one of the operands.
293	/// \returns A 256-bit vector of [4 x double] containing the products of both
294	/// operands.
295	static __inline __m256d __DEFAULT_FN_ATTRS
296	_mm256_mul_pd(__m256d __a, __m256d __b)
297	{
298	return (__m256d)((__v4df)__a * (__v4df)__b);
299	}
300
301	/// Multiplies two 256-bit vectors of [8 x float].
302	///
303	/// \headerfile <x86intrin.h>
304	///
305	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
306	///
307	/// \param __a
308	/// A 256-bit vector of [8 x float] containing one of the operands.
309	/// \param __b
310	/// A 256-bit vector of [8 x float] containing one of the operands.
311	/// \returns A 256-bit vector of [8 x float] containing the products of both
312	/// operands.
313	static __inline __m256 __DEFAULT_FN_ATTRS
314	_mm256_mul_ps(__m256 __a, __m256 __b)
315	{
316	return (__m256)((__v8sf)__a * (__v8sf)__b);
317	}
318
319	/// Calculates the square roots of the values in a 256-bit vector of
320	/// [4 x double].
321	///
322	/// \headerfile <x86intrin.h>
323	///
324	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
325	///
326	/// \param __a
327	/// A 256-bit vector of [4 x double].
328	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
329	/// values in the operand.
330	static __inline __m256d __DEFAULT_FN_ATTRS
331	_mm256_sqrt_pd(__m256d __a)
332	{
333	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
334	}
335
336	/// Calculates the square roots of the values in a 256-bit vector of
337	/// [8 x float].
338	///
339	/// \headerfile <x86intrin.h>
340	///
341	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
342	///
343	/// \param __a
344	/// A 256-bit vector of [8 x float].
345	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
346	/// values in the operand.
347	static __inline __m256 __DEFAULT_FN_ATTRS
348	_mm256_sqrt_ps(__m256 __a)
349	{
350	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
351	}
352
353	/// Calculates the reciprocal square roots of the values in a 256-bit
354	/// vector of [8 x float].
355	///
356	/// \headerfile <x86intrin.h>
357	///
358	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
359	///
360	/// \param __a
361	/// A 256-bit vector of [8 x float].
362	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
363	/// roots of the values in the operand.
364	static __inline __m256 __DEFAULT_FN_ATTRS
365	_mm256_rsqrt_ps(__m256 __a)
366	{
367	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
368	}
369
370	/// Calculates the reciprocals of the values in a 256-bit vector of
371	/// [8 x float].
372	///
373	/// \headerfile <x86intrin.h>
374	///
375	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
376	///
377	/// \param __a
378	/// A 256-bit vector of [8 x float].
379	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
380	/// values in the operand.
381	static __inline __m256 __DEFAULT_FN_ATTRS
382	_mm256_rcp_ps(__m256 __a)
383	{
384	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
385	}
386
/// Rounds every element of a 256-bit vector of [4 x double] to an integer
///    value, using the rounding behavior selected by the operand \a M. The
///    results are returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that selects the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] holding the rounded values.
#define _mm256_round_pd(V, M)                                                  \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
418
/// Rounds every element of a 256-bit vector of [8 x float] to an integer
///    value, using the rounding behavior selected by the operand \a M. The
///    results are returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that selects the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] holding the rounded values.
#define _mm256_round_ps(V, M)                                                  \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
450
/// Rounds every element of a 256-bit vector of [4 x double] up to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values. Implemented as _mm256_round_pd with the
///    _MM_FROUND_CEIL rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] holding the rounded up values.
#define _mm256_ceil_pd(V)                                                      \
  _mm256_round_pd((V), _MM_FROUND_CEIL)
467
/// Rounds every element of a 256-bit vector of [4 x double] down to an
///    integer value. The results are returned as 64-bit double-precision
///    floating-point values. Implemented as _mm256_round_pd with the
///    _MM_FROUND_FLOOR rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] holding the rounded down
///    values.
#define _mm256_floor_pd(V)                                                     \
  _mm256_round_pd((V), _MM_FROUND_FLOOR)
485
/// Rounds every element of a 256-bit vector of [8 x float] up to an
///    integer value. The results are returned as floating-point values.
///    Implemented as _mm256_round_ps with the _MM_FROUND_CEIL rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] holding the rounded up values.
#define _mm256_ceil_ps(V)                                                      \
  _mm256_round_ps((V), _MM_FROUND_CEIL)
502
/// Rounds every element of a 256-bit vector of [8 x float] down to an
///    integer value. The results are returned as floating-point values.
///    Implemented as _mm256_round_ps with the _MM_FROUND_FLOOR rounding mode.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] holding the rounded down values.
#define _mm256_floor_ps(V)                                                     \
  _mm256_round_ps((V), _MM_FROUND_FLOOR)
519
520	/* Logical */
521	/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
522	///
523	/// \headerfile <x86intrin.h>
524	///
525	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
526	///
527	/// \param __a
528	/// A 256-bit vector of [4 x double] containing one of the source operands.
529	/// \param __b
530	/// A 256-bit vector of [4 x double] containing one of the source operands.
531	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
532	/// values between both operands.
533	static __inline __m256d __DEFAULT_FN_ATTRS
534	_mm256_and_pd(__m256d __a, __m256d __b)
535	{
536	return (__m256d)((__v4du)__a & (__v4du)__b);
537	}
538
539	/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
540	///
541	/// \headerfile <x86intrin.h>
542	///
543	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
544	///
545	/// \param __a
546	/// A 256-bit vector of [8 x float] containing one of the source operands.
547	/// \param __b
548	/// A 256-bit vector of [8 x float] containing one of the source operands.
549	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
550	/// values between both operands.
551	static __inline __m256 __DEFAULT_FN_ATTRS
552	_mm256_and_ps(__m256 __a, __m256 __b)
553	{
554	return (__m256)((__v8su)__a & (__v8su)__b);
555	}
556
557	/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
558	/// the one's complement of the values contained in the first source operand.
559	///
560	/// \headerfile <x86intrin.h>
561	///
562	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
563	///
564	/// \param __a
565	/// A 256-bit vector of [4 x double] containing the left source operand. The
566	/// one's complement of this value is used in the bitwise AND.
567	/// \param __b
568	/// A 256-bit vector of [4 x double] containing the right source operand.
569	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
570	/// values of the second operand and the one's complement of the first
571	/// operand.
572	static __inline __m256d __DEFAULT_FN_ATTRS
573	_mm256_andnot_pd(__m256d __a, __m256d __b)
574	{
575	return (__m256d)(~(__v4du)__a & (__v4du)__b);
576	}
577
578	/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
579	/// the one's complement of the values contained in the first source operand.
580	///
581	/// \headerfile <x86intrin.h>
582	///
583	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
584	///
585	/// \param __a
586	/// A 256-bit vector of [8 x float] containing the left source operand. The
587	/// one's complement of this value is used in the bitwise AND.
588	/// \param __b
589	/// A 256-bit vector of [8 x float] containing the right source operand.
590	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
591	/// values of the second operand and the one's complement of the first
592	/// operand.
593	static __inline __m256 __DEFAULT_FN_ATTRS
594	_mm256_andnot_ps(__m256 __a, __m256 __b)
595	{
596	return (__m256)(~(__v8su)__a & (__v8su)__b);
597	}
598
599	/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
600	///
601	/// \headerfile <x86intrin.h>
602	///
603	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
604	///
605	/// \param __a
606	/// A 256-bit vector of [4 x double] containing one of the source operands.
607	/// \param __b
608	/// A 256-bit vector of [4 x double] containing one of the source operands.
609	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
610	/// values between both operands.
611	static __inline __m256d __DEFAULT_FN_ATTRS
612	_mm256_or_pd(__m256d __a, __m256d __b)
613	{
614	return (__m256d)((__v4du)__a \| (__v4du)__b);
615	}
616
617	/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
618	///
619	/// \headerfile <x86intrin.h>
620	///
621	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
622	///
623	/// \param __a
624	/// A 256-bit vector of [8 x float] containing one of the source operands.
625	/// \param __b
626	/// A 256-bit vector of [8 x float] containing one of the source operands.
627	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
628	/// values between both operands.
629	static __inline __m256 __DEFAULT_FN_ATTRS
630	_mm256_or_ps(__m256 __a, __m256 __b)
631	{
632	return (__m256)((__v8su)__a \| (__v8su)__b);
633	}
634
635	/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
636	///
637	/// \headerfile <x86intrin.h>
638	///
639	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
640	///
641	/// \param __a
642	/// A 256-bit vector of [4 x double] containing one of the source operands.
643	/// \param __b
644	/// A 256-bit vector of [4 x double] containing one of the source operands.
645	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
646	/// values between both operands.
647	static __inline __m256d __DEFAULT_FN_ATTRS
648	_mm256_xor_pd(__m256d __a, __m256d __b)
649	{
650	return (__m256d)((__v4du)__a ^ (__v4du)__b);
651	}
652
653	/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
654	///
655	/// \headerfile <x86intrin.h>
656	///
657	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
658	///
659	/// \param __a
660	/// A 256-bit vector of [8 x float] containing one of the source operands.
661	/// \param __b
662	/// A 256-bit vector of [8 x float] containing one of the source operands.
663	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
664	/// values between both operands.
665	static __inline __m256 __DEFAULT_FN_ATTRS
666	_mm256_xor_ps(__m256 __a, __m256 __b)
667	{
668	return (__m256)((__v8su)__a ^ (__v8su)__b);
669	}
670
671	/* Horizontal arithmetic */
672	/// Horizontally adds the adjacent pairs of values contained in two
673	/// 256-bit vectors of [4 x double].
674	///
675	/// \headerfile <x86intrin.h>
676	///
677	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
678	///
679	/// \param __a
680	/// A 256-bit vector of [4 x double] containing one of the source operands.
681	/// The horizontal sums of the values are returned in the even-indexed
682	/// elements of a vector of [4 x double].
683	/// \param __b
684	/// A 256-bit vector of [4 x double] containing one of the source operands.
685	/// The horizontal sums of the values are returned in the odd-indexed
686	/// elements of a vector of [4 x double].
687	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
688	/// both operands.
689	static __inline __m256d __DEFAULT_FN_ATTRS
690	_mm256_hadd_pd(__m256d __a, __m256d __b)
691	{
692	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
693	}
694
695	/// Horizontally adds the adjacent pairs of values contained in two
696	/// 256-bit vectors of [8 x float].
697	///
698	/// \headerfile <x86intrin.h>
699	///
700	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
701	///
702	/// \param __a
703	/// A 256-bit vector of [8 x float] containing one of the source operands.
704	/// The horizontal sums of the values are returned in the elements with
705	/// index 0, 1, 4, 5 of a vector of [8 x float].
706	/// \param __b
707	/// A 256-bit vector of [8 x float] containing one of the source operands.
708	/// The horizontal sums of the values are returned in the elements with
709	/// index 2, 3, 6, 7 of a vector of [8 x float].
710	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
711	/// both operands.
712	static __inline __m256 __DEFAULT_FN_ATTRS
713	_mm256_hadd_ps(__m256 __a, __m256 __b)
714	{
715	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
716	}
717
718	/// Horizontally subtracts the adjacent pairs of values contained in two
719	/// 256-bit vectors of [4 x double].
720	///
721	/// \headerfile <x86intrin.h>
722	///
723	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
724	///
725	/// \param __a
726	/// A 256-bit vector of [4 x double] containing one of the source operands.
727	/// The horizontal differences between the values are returned in the
728	/// even-indexed elements of a vector of [4 x double].
729	/// \param __b
730	/// A 256-bit vector of [4 x double] containing one of the source operands.
731	/// The horizontal differences between the values are returned in the
732	/// odd-indexed elements of a vector of [4 x double].
733	/// \returns A 256-bit vector of [4 x double] containing the horizontal
734	/// differences of both operands.
735	static __inline __m256d __DEFAULT_FN_ATTRS
736	_mm256_hsub_pd(__m256d __a, __m256d __b)
737	{
738	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
739	}
740
741	/// Horizontally subtracts the adjacent pairs of values contained in two
742	/// 256-bit vectors of [8 x float].
743	///
744	/// \headerfile <x86intrin.h>
745	///
746	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
747	///
748	/// \param __a
749	/// A 256-bit vector of [8 x float] containing one of the source operands.
750	/// The horizontal differences between the values are returned in the
751	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
752	/// \param __b
753	/// A 256-bit vector of [8 x float] containing one of the source operands.
754	/// The horizontal differences between the values are returned in the
755	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
756	/// \returns A 256-bit vector of [8 x float] containing the horizontal
757	/// differences of both operands.
758	static __inline __m256 __DEFAULT_FN_ATTRS
759	_mm256_hsub_ps(__m256 __a, __m256 __b)
760	{
761	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
762	}
763
764	/* Vector permutations */
765	/// Copies the values in a 128-bit vector of [2 x double] as specified
766	/// by the 128-bit integer vector operand.
767	///
768	/// \headerfile <x86intrin.h>
769	///
770	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
771	///
772	/// \param __a
773	/// A 128-bit vector of [2 x double].
774	/// \param __c
775	/// A 128-bit integer vector operand specifying how the values are to be
776	/// copied. \n
777	/// Bit [1]: \n
778	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
779	/// vector. \n
780	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
781	/// returned vector. \n
782	/// Bit [65]: \n
783	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
784	/// returned vector. \n
785	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
786	/// returned vector.
787	/// \returns A 128-bit vector of [2 x double] containing the copied values.
788	static __inline __m128d __DEFAULT_FN_ATTRS128
789	_mm_permutevar_pd(__m128d __a, __m128i __c)
790	{
791	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
792	}
793
794	/// Copies the values in a 256-bit vector of [4 x double] as specified
795	/// by the 256-bit integer vector operand.
796	///
797	/// \headerfile <x86intrin.h>
798	///
799	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
800	///
801	/// \param __a
802	/// A 256-bit vector of [4 x double].
803	/// \param __c
804	/// A 256-bit integer vector operand specifying how the values are to be
805	/// copied. \n
806	/// Bit [1]: \n
807	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
808	/// vector. \n
809	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
810	/// returned vector. \n
811	/// Bit [65]: \n
812	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
813	/// returned vector. \n
814	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
815	/// returned vector. \n
816	/// Bit [129]: \n
817	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
818	/// returned vector. \n
819	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
820	/// returned vector. \n
821	/// Bit [193]: \n
822	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
823	/// returned vector. \n
824	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
825	/// returned vector.
826	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	/// \note As the list above shows, each result element is selected by bit 1
	/// (not bit 0) of the corresponding 64-bit element of \a __c, and it is
	/// always taken from the same 128-bit half of \a __a that it is written to.
827	static __inline __m256d __DEFAULT_FN_ATTRS
828	_mm256_permutevar_pd(__m256d __a, __m256i __c)
829	{
830	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
831	}
832
833	/// Copies the values stored in a 128-bit vector of [4 x float] as
834	/// specified by the 128-bit integer vector operand.
	///
835	/// \headerfile <x86intrin.h>
836	///
837	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
838	///
839	/// \param __a
840	/// A 128-bit vector of [4 x float].
841	/// \param __c
842	/// A 128-bit integer vector operand specifying how the values are to be
843	/// copied. \n
844	/// Bits [1:0]: \n
845	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
846	/// returned vector. \n
847	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
848	/// returned vector. \n
849	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
850	/// returned vector. \n
851	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
852	/// returned vector. \n
853	/// Bits [33:32]: \n
854	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
855	/// returned vector. \n
856	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
857	/// returned vector. \n
858	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
859	/// returned vector. \n
860	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
861	/// returned vector. \n
862	/// Bits [65:64]: \n
863	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
864	/// returned vector. \n
865	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
866	/// returned vector. \n
867	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
868	/// returned vector. \n
869	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
870	/// returned vector. \n
871	/// Bits [97:96]: \n
872	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
873	/// returned vector. \n
874	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
875	/// returned vector. \n
876	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
877	/// returned vector. \n
878	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
879	/// returned vector.
880	/// \returns A 128-bit vector of [4 x float] containing the copied values.
	/// \note As the list above shows, each result element is selected by the two
	/// low-order bits of the corresponding 32-bit element of \a __c.
881	static __inline __m128 __DEFAULT_FN_ATTRS128
882	_mm_permutevar_ps(__m128 __a, __m128i __c)
883	{
884	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
885	}
886
887	/// Copies the values stored in a 256-bit vector of [8 x float] as
888	/// specified by the 256-bit integer vector operand.
889	///
890	/// \headerfile <x86intrin.h>
891	///
892	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
893	///
894	/// \param __a
895	/// A 256-bit vector of [8 x float].
896	/// \param __c
897	/// A 256-bit integer vector operand specifying how the values are to be
898	/// copied. \n
899	/// Bits [1:0]: \n
900	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
901	/// returned vector. \n
902	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
903	/// returned vector. \n
904	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
905	/// returned vector. \n
906	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
907	/// returned vector. \n
908	/// Bits [33:32]: \n
909	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
910	/// returned vector. \n
911	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
912	/// returned vector. \n
913	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
914	/// returned vector. \n
915	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
916	/// returned vector. \n
917	/// Bits [65:64]: \n
918	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
919	/// returned vector. \n
920	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
921	/// returned vector. \n
922	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
923	/// returned vector. \n
924	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
925	/// returned vector. \n
926	/// Bits [97:96]: \n
927	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
928	/// returned vector. \n
929	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
930	/// returned vector. \n
931	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
932	/// returned vector. \n
933	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
934	/// returned vector. \n
935	/// Bits [129:128]: \n
936	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
937	/// returned vector. \n
938	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
939	/// returned vector. \n
940	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
941	/// returned vector. \n
942	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
943	/// returned vector. \n
944	/// Bits [161:160]: \n
945	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
946	/// returned vector. \n
947	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
948	/// returned vector. \n
949	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
950	/// returned vector. \n
951	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
952	/// returned vector. \n
953	/// Bits [193:192]: \n
954	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
955	/// returned vector. \n
956	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
957	/// returned vector. \n
958	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
959	/// returned vector. \n
960	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
961	/// returned vector. \n
962	/// Bits [225:224]: \n
963	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
964	/// returned vector. \n
965	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
966	/// returned vector. \n
967	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
968	/// returned vector. \n
969	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
970	/// returned vector.
971	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	/// \note As the list above shows, each result element is selected by the two
	/// low-order bits of the corresponding 32-bit element of \a __c, and it is
	/// always taken from the same 128-bit half of \a __a that it is written to.
972	static __inline __m256 __DEFAULT_FN_ATTRS
973	_mm256_permutevar_ps(__m256 __a, __m256i __c)
974	{
975	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
976	}
977
978	/// Copies the values in a 128-bit vector of [2 x double] as specified
979	/// by the immediate integer operand.
980	///
981	/// \headerfile <x86intrin.h>
982	///
983	/// \code
984	/// __m128d _mm_permute_pd(__m128d A, const int C);
985	/// \endcode
986	///
987	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
988	///
989	/// \param A
990	/// A 128-bit vector of [2 x double].
991	/// \param C
992	/// An immediate integer operand specifying how the values are to be
993	/// copied. \n
994	/// Bit [0]: \n
995	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
996	/// vector. \n
997	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
998	/// returned vector. \n
999	/// Bit [1]: \n
1000	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1001	/// returned vector. \n
1002	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1003	/// returned vector.
1004	/// \returns A 128-bit vector of [2 x double] containing the copied values.
	/// \note Implemented as a macro. \a A is cast to \c __m128d and then to
	/// \c __v2df, \a C is cast to \c int, and each argument appears exactly once
	/// in the expansion.
1005	#define _mm_permute_pd(A, C) \
1006	((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
1007
1008	/// Copies the values in a 256-bit vector of [4 x double] as specified by
1009	/// the immediate integer operand.
1010	///
1011	/// \headerfile <x86intrin.h>
1012	///
1013	/// \code
1014	/// __m256d _mm256_permute_pd(__m256d A, const int C);
1015	/// \endcode
1016	///
1017	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1018	///
1019	/// \param A
1020	/// A 256-bit vector of [4 x double].
1021	/// \param C
1022	/// An immediate integer operand specifying how the values are to be
1023	/// copied. \n
1024	/// Bit [0]: \n
1025	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1026	/// vector. \n
1027	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1028	/// returned vector. \n
1029	/// Bit [1]: \n
1030	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1031	/// returned vector. \n
1032	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1033	/// returned vector. \n
1034	/// Bit [2]: \n
1035	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1036	/// returned vector. \n
1037	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1038	/// returned vector. \n
1039	/// Bit [3]: \n
1040	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1041	/// returned vector. \n
1042	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1043	/// returned vector.
1044	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	/// \note As the list above shows, each selector bit only chooses between the
	/// two elements of the 128-bit half that the result element belongs to.
	/// \note Implemented as a macro. \a A is cast to \c __m256d and then to
	/// \c __v4df, \a C is cast to \c int, and each argument appears exactly once
	/// in the expansion.
1045	#define _mm256_permute_pd(A, C) \
1046	((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1047
1048	/// Copies the values in a 128-bit vector of [4 x float] as specified by
1049	/// the immediate integer operand.
1050	///
1051	/// \headerfile <x86intrin.h>
1052	///
1053	/// \code
1054	/// __m128 _mm_permute_ps(__m128 A, const int C);
1055	/// \endcode
1056	///
1057	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1058	///
1059	/// \param A
1060	/// A 128-bit vector of [4 x float].
1061	/// \param C
1062	/// An immediate integer operand specifying how the values are to be
1063	/// copied. \n
1064	/// Bits [1:0]: \n
1065	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1066	/// returned vector. \n
1067	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1068	/// returned vector. \n
1069	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1070	/// returned vector. \n
1071	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1072	/// returned vector. \n
1073	/// Bits [3:2]: \n
1074	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1075	/// returned vector. \n
1076	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1077	/// returned vector. \n
1078	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1079	/// returned vector. \n
1080	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1081	/// returned vector. \n
1082	/// Bits [5:4]: \n
1083	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1084	/// returned vector. \n
1085	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1086	/// returned vector. \n
1087	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1088	/// returned vector. \n
1089	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1090	/// returned vector. \n
1091	/// Bits [7:6]: \n
1092	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1093	/// returned vector. \n
1094	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1095	/// returned vector. \n
1096	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1097	/// returned vector. \n
1098	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1099	/// returned vector.
1100	/// \returns A 128-bit vector of [4 x float] containing the copied values.
	/// \note Implemented as a macro. \a A is cast to \c __m128 and then to
	/// \c __v4sf, \a C is cast to \c int, and each argument appears exactly once
	/// in the expansion.
1101	#define _mm_permute_ps(A, C) \
1102	((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1103
1104	/// Copies the values in a 256-bit vector of [8 x float] as specified by
1105	/// the immediate integer operand.
1106	///
1107	/// \headerfile <x86intrin.h>
1108	///
1109	/// \code
1110	/// __m256 _mm256_permute_ps(__m256 A, const int C);
1111	/// \endcode
1112	///
1113	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1114	///
1115	/// \param A
1116	/// A 256-bit vector of [8 x float].
1117	/// \param C
1118	/// An immediate integer operand specifying how the values are to be
1119	/// copied. \n
1120	/// Bits [1:0]: \n
1121	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1122	/// returned vector. \n
1123	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1124	/// returned vector. \n
1125	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1126	/// returned vector. \n
1127	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1128	/// returned vector. \n
1129	/// Bits [3:2]: \n
1130	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1131	/// returned vector. \n
1132	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1133	/// returned vector. \n
1134	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1135	/// returned vector. \n
1136	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1137	/// returned vector. \n
1138	/// Bits [5:4]: \n
1139	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1140	/// returned vector. \n
1141	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1142	/// returned vector. \n
1143	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1144	/// returned vector. \n
1145	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1146	/// returned vector. \n
1147	/// Bits [7:6]: \n
1148	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1149	/// returned vector. \n
1150	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1151	/// returned vector. \n
1152	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1153	/// returned vector. \n
1154	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1155	/// returned vector. \n
1156	/// Bits [1:0]: \n
1157	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1158	/// returned vector. \n
1159	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1160	/// returned vector. \n
1161	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1162	/// returned vector. \n
1163	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1164	/// returned vector. \n
1165	/// Bits [3:2]: \n
1166	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1167	/// returned vector. \n
1168	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1169	/// returned vector. \n
1170	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1171	/// returned vector. \n
1172	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1173	/// returned vector. \n
1174	/// Bits [5:4]: \n
1175	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1176	/// returned vector. \n
1177	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1178	/// returned vector. \n
1179	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1180	/// returned vector. \n
1181	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1182	/// returned vector. \n
1183	/// Bits [7:6]: \n
1184	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1185	/// returned vector. \n
1186	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1187	/// returned vector. \n
1188	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1189	/// returned vector. \n
1190	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1191	/// returned vector.
1192	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	/// \note The bit fields [1:0], [3:2], [5:4] and [7:6] of \a C each appear
	/// twice in the list above because the same field is applied to both
	/// 128-bit halves: the first occurrence describes result bits [127:0], the
	/// second describes result bits [255:128].
	/// \note Implemented as a macro. \a A is cast to \c __m256 and then to
	/// \c __v8sf, \a C is cast to \c int, and each argument appears exactly once
	/// in the expansion.
1193	#define _mm256_permute_ps(A, C) \
1194	((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1195
1196	/// Permutes 128-bit data values stored in two 256-bit vectors of
1197	/// [4 x double], as specified by the immediate integer operand.
1198	///
1199	/// \headerfile <x86intrin.h>
1200	///
1201	/// \code
1202	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1203	/// \endcode
1204	///
1205	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1206	///
1207	/// \param V1
1208	/// A 256-bit vector of [4 x double].
1209	/// \param V2
1210	/// A 256-bit vector of [4 x double].
1211	/// \param M
1212	/// An immediate integer operand specifying how the values are to be
1213	/// permuted. \n
1214	/// Bits [1:0]: \n
1215	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1216	/// destination. \n
1217	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1218	/// destination. \n
1219	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1220	/// destination. \n
1221	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1222	/// destination. \n
1223	/// Bits [5:4]: \n
1224	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1225	/// destination. \n
1226	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1227	/// destination. \n
1228	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1229	/// destination. \n
1230	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1231	/// destination.
1232	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	/// \note Bits [3:2] and [7:6] of \a M are not described by the list above.
	/// TODO(review): confirm against the VPERM2F128 instruction reference
	/// whether bits [3] and [7] zero the lower and upper 128 bits of the
	/// result, and document them here.
1233	#define _mm256_permute2f128_pd(V1, V2, M) \
1234	((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
1235	(__v4df)(__m256d)(V2), (int)(M)))
1236
1237	/// Permutes 128-bit data values stored in two 256-bit vectors of
1238	/// [8 x float], as specified by the immediate integer operand.
1239	///
1240	/// \headerfile <x86intrin.h>
1241	///
1242	/// \code
1243	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1244	/// \endcode
1245	///
1246	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1247	///
1248	/// \param V1
1249	/// A 256-bit vector of [8 x float].
1250	/// \param V2
1251	/// A 256-bit vector of [8 x float].
1252	/// \param M
1253	/// An immediate integer operand specifying how the values are to be
1254	/// permuted. \n
1255	/// Bits [1:0]: \n
1256	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1257	/// destination. \n
1258	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1259	/// destination. \n
1260	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1261	/// destination. \n
1262	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1263	/// destination. \n
1264	/// Bits [5:4]: \n
1265	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1266	/// destination. \n
1267	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1268	/// destination. \n
1269	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1270	/// destination. \n
1271	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1272	/// destination.
1273	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	/// \note Bits [3:2] and [7:6] of \a M are not described by the list above.
	/// TODO(review): confirm against the VPERM2F128 instruction reference
	/// whether bits [3] and [7] zero the lower and upper 128 bits of the
	/// result, and document them here.
1274	#define _mm256_permute2f128_ps(V1, V2, M) \
1275	((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
1276	(__v8sf)(__m256)(V2), (int)(M)))
1277
1278	/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1279	/// as specified by the immediate integer operand.
1280	///
1281	/// \headerfile <x86intrin.h>
1282	///
1283	/// \code
1284	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1285	/// \endcode
1286	///
1287	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1288	///
1289	/// \param V1
1290	/// A 256-bit integer vector.
1291	/// \param V2
1292	/// A 256-bit integer vector.
1293	/// \param M
1294	/// An immediate integer operand specifying how the values are to be copied. \n
1295	/// Bits [1:0]: \n
1296	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1297	/// destination. \n
1298	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1299	/// destination. \n
1300	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1301	/// destination. \n
1302	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1303	/// destination. \n
1304	/// Bits [5:4]: \n
1305	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1306	/// destination. \n
1307	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1308	/// destination. \n
1309	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1310	/// destination. \n
1311	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1312	/// destination.
1313	/// \returns A 256-bit integer vector containing the copied values.
	/// \note Bits [3:2] and [7:6] of \a M are not described by the list above.
	/// TODO(review): confirm against the VPERM2F128 instruction reference
	/// whether bits [3] and [7] zero the lower and upper 128 bits of the
	/// result, and document them here.
	/// \note Implemented as a macro. \a V1 and \a V2 are cast to \c __m256i and
	/// then passed to the builtin as \c __v8si; \a M is cast to \c int.
1314	#define _mm256_permute2f128_si256(V1, V2, M) \
1315	((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
1316	(__v8si)(__m256i)(V2), (int)(M)))
1317
1318	/* Vector Blend */
1319	/// Merges 64-bit double-precision data values stored in either of the
1320	/// two 256-bit vectors of [4 x double], as specified by the immediate
1321	/// integer operand.
1322	///
1323	/// \headerfile <x86intrin.h>
1324	///
1325	/// \code
1326	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1327	/// \endcode
1328	///
1329	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1330	///
1331	/// \param V1
1332	/// A 256-bit vector of [4 x double].
1333	/// \param V2
1334	/// A 256-bit vector of [4 x double].
1335	/// \param M
1336	/// An immediate integer operand, with mask bits [3:0] specifying how the
1337	/// values are to be copied. The position of the mask bit corresponds to the
1338	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1339	/// element in operand \a V1 is copied to the same position in the
1340	/// destination. When a mask bit is 1, the corresponding 64-bit element in
1341	/// operand \a V2 is copied to the same position in the destination.
1342	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	/// \note Implemented as a macro. \a V1 and \a V2 are cast to \c __m256d and
	/// then to \c __v4df, \a M is cast to \c int, and each argument appears
	/// exactly once in the expansion.
1343	#define _mm256_blend_pd(V1, V2, M) \
1344	((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
1345	(__v4df)(__m256d)(V2), (int)(M)))
1346
1347	/// Merges 32-bit single-precision data values stored in either of the
1348	/// two 256-bit vectors of [8 x float], as specified by the immediate
1349	/// integer operand.
1350	///
1351	/// \headerfile <x86intrin.h>
1352	///
1353	/// \code
1354	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1355	/// \endcode
1356	///
1357	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1358	///
1359	/// \param V1
1360	/// A 256-bit vector of [8 x float].
1361	/// \param V2
1362	/// A 256-bit vector of [8 x float].
1363	/// \param M
1364	/// An immediate integer operand, with mask bits [7:0] specifying how the
1365	/// values are to be copied. The position of the mask bit corresponds to the
1366	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1367	/// element in operand \a V1 is copied to the same position in the
1368	/// destination. When a mask bit is 1, the corresponding 32-bit element in
1369	/// operand \a V2 is copied to the same position in the destination.
1370	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	/// \note Implemented as a macro. \a V1 and \a V2 are cast to \c __m256 and
	/// then to \c __v8sf, \a M is cast to \c int, and each argument appears
	/// exactly once in the expansion.
1371	#define _mm256_blend_ps(V1, V2, M) \
1372	((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
1373	(__v8sf)(__m256)(V2), (int)(M)))
1374
1375	/// Merges 64-bit double-precision data values stored in either of the
1376	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1377	/// operand.
1378	///
1379	/// \headerfile <x86intrin.h>
1380	///
1381	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1382	///
1383	/// \param __a
1384	/// A 256-bit vector of [4 x double].
1385	/// \param __b
1386	/// A 256-bit vector of [4 x double].
1387	/// \param __c
1388	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1389	/// how the values are to be copied. The position of the mask bit corresponds
1390	/// to the most significant bit of a copied value. When a mask bit is 0, the
1391	/// corresponding 64-bit element in operand \a __a is copied to the same
1392	/// position in the destination. When a mask bit is 1, the corresponding
1393	/// 64-bit element in operand \a __b is copied to the same position in the
1394	/// destination.
1395	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	/// \note The mask \a __c is declared as \c __m256d and is passed to the
	/// builtin as \c __v4df, exactly like the data operands \a __a and \a __b.
1396	static __inline __m256d __DEFAULT_FN_ATTRS
1397	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1398	{
1399	return (__m256d)__builtin_ia32_blendvpd256(
1400	(__v4df)__a, (__v4df)__b, (__v4df)__c);
1401	}
1402
1403	/// Merges 32-bit single-precision data values stored in either of the
1404	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1405	/// operand.
1406	///
1407	/// \headerfile <x86intrin.h>
1408	///
1409	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1410	///
1411	/// \param __a
1412	/// A 256-bit vector of [8 x float].
1413	/// \param __b
1414	/// A 256-bit vector of [8 x float].
1415	/// \param __c
1416	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1417	/// and 31 specifying how the values are to be copied. The position of the
1418	/// mask bit corresponds to the most significant bit of a copied value. When
1419	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1420	/// copied to the same position in the destination. When a mask bit is 1, the
1421	/// corresponding 32-bit element in operand \a __b is copied to the same
1422	/// position in the destination.
1423	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	/// \note The mask \a __c is declared as \c __m256 and is passed to the
	/// builtin as \c __v8sf, exactly like the data operands \a __a and \a __b.
1424	static __inline __m256 __DEFAULT_FN_ATTRS
1425	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1426	{
1427	return (__m256)__builtin_ia32_blendvps256(
1428	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1429	}
1430
1431	/* Vector Dot Product */
1432	/// Computes two dot products in parallel, using the lower and upper
1433	/// halves of two [8 x float] vectors as input to the two computations, and
1434	/// returning the two dot products in the lower and upper halves of the
1435	/// [8 x float] result.
1436	///
1437	/// The immediate integer operand controls which input elements will
1438	/// contribute to the dot product, and where the final results are returned.
1439	/// In general, for each dot product, the four corresponding elements of the
1440	/// input vectors are multiplied; the first two and second two products are
1441	/// summed, then the two sums are added to form the final result.
1442	///
1443	/// \headerfile <x86intrin.h>
1444	///
1445	/// \code
1446	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1447	/// \endcode
1448	///
1449	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1450	///
1451	/// \param V1
1452	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1453	/// \param V2
1454	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1455	/// \param M
1456	/// An immediate integer argument. Bits [7:4] determine which elements of
1457	/// the input vectors are used, with bit [4] corresponding to the lowest
1458	/// element and bit [7] corresponding to the highest element of each [4 x
1459	/// float] subvector. If a bit is set, the corresponding elements from the
1460	/// two input vectors are used as an input for dot product; otherwise that
1461	/// input is treated as zero. Bits [3:0] determine which elements of the
1462	/// result will receive a copy of the final dot product, with bit [0]
1463	/// corresponding to the lowest element and bit [3] corresponding to the
1464	/// highest element of each [4 x float] subvector. If a bit is set, the dot
1465	/// product is returned in the corresponding element; otherwise that element
1466	/// is set to zero. The bitmask is applied in the same way to each of the
1467	/// two parallel dot product computations.
1468	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
	/// \note Implemented as a macro. \a V1 and \a V2 are cast to \c __m256 and
	/// then to \c __v8sf; \a M is passed to the builtin as written, with no cast
	/// applied to it.
1469	#define _mm256_dp_ps(V1, V2, M) \
1470	((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
1471	(__v8sf)(__m256)(V2), (M)))
1472
1473	/* Vector shuffle */
1474	/// Selects 8 float values from the 256-bit operands of [8 x float], as
1475	/// specified by the immediate value operand.
1476	///
1477	/// The four selected elements in each operand are copied to the destination
1478	/// according to the bits specified in the immediate operand. The selected
1479	/// elements from the first 256-bit operand are copied to bits [63:0] and
1480	/// bits [191:128] of the destination, and the selected elements from the
1481	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1482	/// the destination. For example, if bits [7:0] of the immediate operand
1483	/// contain a value of 0xFF, the 256-bit destination vector would contain the
1484	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1485	///
1486	/// \headerfile <x86intrin.h>
1487	///
1488	/// \code
1489	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1490	/// \endcode
1491	///
1492	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1493	///
1494	/// \param a
1495	/// A 256-bit vector of [8 x float]. The four selected elements in this
1496	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1497	/// according to the bits specified in the immediate operand.
1498	/// \param b
1499	/// A 256-bit vector of [8 x float]. The four selected elements in this
1500	/// operand are copied to bits [127:64] and bits [255:192] in the
1501	/// destination, according to the bits specified in the immediate operand.
1502	/// \param mask
1503	/// An immediate value containing an 8-bit value specifying which elements to
1504	/// copy from \a a and \a b. \n
1505	/// Bits [3:0] specify the values copied from operand \a a. \n
1506	/// Bits [7:4] specify the values copied from operand \a b. \n
1507	/// The destinations within the 256-bit destination are assigned values as
1508	/// follows, according to the bit value assignments described below: \n
1509	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1510	/// destination. \n
1511	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1512	/// destination. \n
1513	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1514	/// destination. \n
1515	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1516	/// the destination. \n
1517	/// Bit value assignments: \n
1518	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1519	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1520	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1521	/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1522	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1523	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1524	/// <c>[b6, b4, b2, b0]</c>.
1525	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
	/// \note Implemented as a macro. \a a and \a b are cast to \c __m256 and
	/// then to \c __v8sf, \a mask is cast to \c int, and each argument appears
	/// exactly once in the expansion.
1526	#define _mm256_shuffle_ps(a, b, mask) \
1527	((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1528	(__v8sf)(__m256)(b), (int)(mask)))
1529
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1575
/* Compare */
/* Comparison predicates for the immediate operand of the _mm_cmp_* and
 * _mm256_cmp_* macros. The values occupy bits [4:0]. Each value in the range
 * 0x10-0x1f names the same predicate as the value 0x10 below it, with the
 * signaling behavior inverted (e.g. _CMP_EQ_OQ 0x00 / _CMP_EQ_OS 0x10). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1609
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the \c _CMP_* macros in this header name these
///    values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1669
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the \c _CMP_* macros in this header name these
///    values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1729
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the \c _CMP_* macros in this header name these
///    values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1789
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the \c _CMP_* macros in this header name these
///    values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1849
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the \c _CMP_* macros in this header name these
///    values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1908
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use (the \c _CMP_* macros in this header name these
///    values): \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1967
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1989
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2012
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2035
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2059
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2085
2086
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2112
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2138
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2166
2167	/* Conversion */
2168	/// Converts a vector of [4 x i32] into a vector of [4 x double].
2169	///
2170	/// \headerfile <x86intrin.h>
2171	///
2172	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2173	///
2174	/// \param __a
2175	/// A 128-bit integer vector of [4 x i32].
2176	/// \returns A 256-bit vector of [4 x double] containing the converted values.
2177	static __inline __m256d __DEFAULT_FN_ATTRS
2178	_mm256_cvtepi32_pd(__m128i __a)
2179	{
2180	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2181	}
2182
2183	/// Converts a vector of [8 x i32] into a vector of [8 x float].
2184	///
2185	/// \headerfile <x86intrin.h>
2186	///
2187	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2188	///
2189	/// \param __a
2190	/// A 256-bit integer vector.
2191	/// \returns A 256-bit vector of [8 x float] containing the converted values.
2192	static __inline __m256 __DEFAULT_FN_ATTRS
2193	_mm256_cvtepi32_ps(__m256i __a)
2194	{
2195	return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2196	}
2197
2198	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2199	/// [4 x float].
2200	///
2201	/// \headerfile <x86intrin.h>
2202	///
2203	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2204	///
2205	/// \param __a
2206	/// A 256-bit vector of [4 x double].
2207	/// \returns A 128-bit vector of [4 x float] containing the converted values.
2208	static __inline __m128 __DEFAULT_FN_ATTRS
2209	_mm256_cvtpd_ps(__m256d __a)
2210	{
2211	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2212	}
2213
2214	/// Converts a vector of [8 x float] into a vector of [8 x i32].
2215	///
2216	/// \headerfile <x86intrin.h>
2217	///
2218	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2219	///
2220	/// \param __a
2221	/// A 256-bit vector of [8 x float].
2222	/// \returns A 256-bit integer vector containing the converted values.
2223	static __inline __m256i __DEFAULT_FN_ATTRS
2224	_mm256_cvtps_epi32(__m256 __a)
2225	{
2226	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2227	}
2228
2229	/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2230	/// x double].
2231	///
2232	/// \headerfile <x86intrin.h>
2233	///
2234	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2235	///
2236	/// \param __a
2237	/// A 128-bit vector of [4 x float].
2238	/// \returns A 256-bit vector of [4 x double] containing the converted values.
2239	static __inline __m256d __DEFAULT_FN_ATTRS
2240	_mm256_cvtps_pd(__m128 __a)
2241	{
2242	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2243	}
2244
2245	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2246	/// x i32], truncating the result by rounding towards zero when it is
2247	/// inexact.
2248	///
2249	/// \headerfile <x86intrin.h>
2250	///
2251	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2252	///
2253	/// \param __a
2254	/// A 256-bit vector of [4 x double].
2255	/// \returns A 128-bit integer vector containing the converted values.
2256	static __inline __m128i __DEFAULT_FN_ATTRS
2257	_mm256_cvttpd_epi32(__m256d __a)
2258	{
2259	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2260	}
2261
2262	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2263	/// x i32]. When a conversion is inexact, the value returned is rounded
2264	/// according to the rounding control bits in the MXCSR register.
2265	///
2266	/// \headerfile <x86intrin.h>
2267	///
2268	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2269	///
2270	/// \param __a
2271	/// A 256-bit vector of [4 x double].
2272	/// \returns A 128-bit integer vector containing the converted values.
2273	static __inline __m128i __DEFAULT_FN_ATTRS
2274	_mm256_cvtpd_epi32(__m256d __a)
2275	{
2276	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2277	}
2278
2279	/// Converts a vector of [8 x float] into a vector of [8 x i32],
2280	/// truncating the result by rounding towards zero when it is inexact.
2281	///
2282	/// \headerfile <x86intrin.h>
2283	///
2284	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2285	///
2286	/// \param __a
2287	/// A 256-bit vector of [8 x float].
2288	/// \returns A 256-bit integer vector containing the converted values.
2289	static __inline __m256i __DEFAULT_FN_ATTRS
2290	_mm256_cvttps_epi32(__m256 __a)
2291	{
2292	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2293	}
2294
2295	/// Returns the first element of the input vector of [4 x double].
2296	///
2297	/// \headerfile <x86intrin.h>
2298	///
2299	/// This intrinsic is a utility function and does not correspond to a specific
2300	/// instruction.
2301	///
2302	/// \param __a
2303	/// A 256-bit vector of [4 x double].
2304	/// \returns A 64 bit double containing the first element of the input vector.
2305	static __inline double __DEFAULT_FN_ATTRS
2306	_mm256_cvtsd_f64(__m256d __a)
2307	{
2308	return __a[0];
2309	}
2310
2311	/// Returns the first element of the input vector of [8 x i32].
2312	///
2313	/// \headerfile <x86intrin.h>
2314	///
2315	/// This intrinsic is a utility function and does not correspond to a specific
2316	/// instruction.
2317	///
2318	/// \param __a
2319	/// A 256-bit vector of [8 x i32].
2320	/// \returns A 32 bit integer containing the first element of the input vector.
2321	static __inline int __DEFAULT_FN_ATTRS
2322	_mm256_cvtsi256_si32(__m256i __a)
2323	{
2324	__v8si __b = (__v8si)__a;
2325	return __b[0];
2326	}
2327
2328	/// Returns the first element of the input vector of [8 x float].
2329	///
2330	/// \headerfile <x86intrin.h>
2331	///
2332	/// This intrinsic is a utility function and does not correspond to a specific
2333	/// instruction.
2334	///
2335	/// \param __a
2336	/// A 256-bit vector of [8 x float].
2337	/// \returns A 32 bit float containing the first element of the input vector.
2338	static __inline float __DEFAULT_FN_ATTRS
2339	_mm256_cvtss_f32(__m256 __a)
2340	{
2341	return __a[0];
2342	}
2343
2344	/* Vector replicate */
2345	/// Moves and duplicates odd-indexed values from a 256-bit vector of
2346	/// [8 x float] to float values in a 256-bit vector of [8 x float].
2347	///
2348	/// \headerfile <x86intrin.h>
2349	///
2350	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2351	///
2352	/// \param __a
2353	/// A 256-bit vector of [8 x float]. \n
2354	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2355	/// the return value. \n
2356	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2357	/// the return value. \n
2358	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2359	/// return value. \n
2360	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2361	/// return value.
2362	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2363	/// values.
2364	static __inline __m256 __DEFAULT_FN_ATTRS
2365	_mm256_movehdup_ps(__m256 __a)
2366	{
2367	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2368	}
2369
2370	/// Moves and duplicates even-indexed values from a 256-bit vector of
2371	/// [8 x float] to float values in a 256-bit vector of [8 x float].
2372	///
2373	/// \headerfile <x86intrin.h>
2374	///
2375	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2376	///
2377	/// \param __a
2378	/// A 256-bit vector of [8 x float]. \n
2379	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2380	/// the return value. \n
2381	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2382	/// the return value. \n
2383	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2384	/// return value. \n
2385	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2386	/// return value.
2387	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2388	/// values.
2389	static __inline __m256 __DEFAULT_FN_ATTRS
2390	_mm256_moveldup_ps(__m256 __a)
2391	{
2392	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2393	}
2394
2395	/// Moves and duplicates double-precision floating point values from a
2396	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2397	/// vector of [4 x double].
2398	///
2399	/// \headerfile <x86intrin.h>
2400	///
2401	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2402	///
2403	/// \param __a
2404	/// A 256-bit vector of [4 x double]. \n
2405	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2406	/// return value. \n
2407	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2408	/// the return value.
2409	/// \returns A 256-bit vector of [4 x double] containing the moved and
2410	/// duplicated values.
2411	static __inline __m256d __DEFAULT_FN_ATTRS
2412	_mm256_movedup_pd(__m256d __a)
2413	{
2414	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2415	}
2416
2417	/* Unpack and Interleave */
2418	/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2419	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2420	///
2421	/// \headerfile <x86intrin.h>
2422	///
2423	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2424	///
2425	/// \param __a
2426	/// A 256-bit floating-point vector of [4 x double]. \n
2427	/// Bits [127:64] are written to bits [63:0] of the return value. \n
2428	/// Bits [255:192] are written to bits [191:128] of the return value. \n
2429	/// \param __b
2430	/// A 256-bit floating-point vector of [4 x double]. \n
2431	/// Bits [127:64] are written to bits [127:64] of the return value. \n
2432	/// Bits [255:192] are written to bits [255:192] of the return value. \n
2433	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2434	static __inline __m256d __DEFAULT_FN_ATTRS
2435	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2436	{
2437	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2438	}
2439
2440	/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2441	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2442	///
2443	/// \headerfile <x86intrin.h>
2444	///
2445	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2446	///
2447	/// \param __a
2448	/// A 256-bit floating-point vector of [4 x double]. \n
2449	/// Bits [63:0] are written to bits [63:0] of the return value. \n
2450	/// Bits [191:128] are written to bits [191:128] of the return value.
2451	/// \param __b
2452	/// A 256-bit floating-point vector of [4 x double]. \n
2453	/// Bits [63:0] are written to bits [127:64] of the return value. \n
2454	/// Bits [191:128] are written to bits [255:192] of the return value. \n
2455	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2456	static __inline __m256d __DEFAULT_FN_ATTRS
2457	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2458	{
2459	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2460	}
2461
2462	/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2463	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2464	/// vector of [8 x float].
2465	///
2466	/// \headerfile <x86intrin.h>
2467	///
2468	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2469	///
2470	/// \param __a
2471	/// A 256-bit vector of [8 x float]. \n
2472	/// Bits [95:64] are written to bits [31:0] of the return value. \n
2473	/// Bits [127:96] are written to bits [95:64] of the return value. \n
2474	/// Bits [223:192] are written to bits [159:128] of the return value. \n
2475	/// Bits [255:224] are written to bits [223:192] of the return value.
2476	/// \param __b
2477	/// A 256-bit vector of [8 x float]. \n
2478	/// Bits [95:64] are written to bits [63:32] of the return value. \n
2479	/// Bits [127:96] are written to bits [127:96] of the return value. \n
2480	/// Bits [223:192] are written to bits [191:160] of the return value. \n
2481	/// Bits [255:224] are written to bits [255:224] of the return value.
2482	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2483	static __inline __m256 __DEFAULT_FN_ATTRS
2484	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2485	{
2486	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2487	}
2488
2489	/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2490	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2491	/// vector of [8 x float].
2492	///
2493	/// \headerfile <x86intrin.h>
2494	///
2495	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2496	///
2497	/// \param __a
2498	/// A 256-bit vector of [8 x float]. \n
2499	/// Bits [31:0] are written to bits [31:0] of the return value. \n
2500	/// Bits [63:32] are written to bits [95:64] of the return value. \n
2501	/// Bits [159:128] are written to bits [159:128] of the return value. \n
2502	/// Bits [191:160] are written to bits [223:192] of the return value.
2503	/// \param __b
2504	/// A 256-bit vector of [8 x float]. \n
2505	/// Bits [31:0] are written to bits [63:32] of the return value. \n
2506	/// Bits [63:32] are written to bits [127:96] of the return value. \n
2507	/// Bits [159:128] are written to bits [191:160] of the return value. \n
2508	/// Bits [191:160] are written to bits [255:224] of the return value.
2509	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2510	static __inline __m256 __DEFAULT_FN_ATTRS
2511	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2512	{
2513	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2514	}
2515
2516	/* Bit Test */
2517	/// Given two 128-bit floating-point vectors of [2 x double], perform an
2518	/// element-by-element comparison of the double-precision element in the
2519	/// first source vector and the corresponding element in the second source
2520	/// vector.
2521	///
2522	/// The EFLAGS register is updated as follows: \n
2523	/// If there is at least one pair of double-precision elements where the
2524	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2525	/// ZF flag is set to 1. \n
2526	/// If there is at least one pair of double-precision elements where the
2527	/// sign-bit of the first element is 0 and the sign-bit of the second element
2528	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2529	/// This intrinsic returns the value of the ZF flag.
2530	///
2531	/// \headerfile <x86intrin.h>
2532	///
2533	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2534	///
2535	/// \param __a
2536	/// A 128-bit vector of [2 x double].
2537	/// \param __b
2538	/// A 128-bit vector of [2 x double].
2539	/// \returns the ZF flag in the EFLAGS register.
2540	static __inline int __DEFAULT_FN_ATTRS128
2541	_mm_testz_pd(__m128d __a, __m128d __b)
2542	{
2543	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2544	}
2545
2546	/// Given two 128-bit floating-point vectors of [2 x double], perform an
2547	/// element-by-element comparison of the double-precision element in the
2548	/// first source vector and the corresponding element in the second source
2549	/// vector.
2550	///
2551	/// The EFLAGS register is updated as follows: \n
2552	/// If there is at least one pair of double-precision elements where the
2553	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2554	/// ZF flag is set to 1. \n
2555	/// If there is at least one pair of double-precision elements where the
2556	/// sign-bit of the first element is 0 and the sign-bit of the second element
2557	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2558	/// This intrinsic returns the value of the CF flag.
2559	///
2560	/// \headerfile <x86intrin.h>
2561	///
2562	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2563	///
2564	/// \param __a
2565	/// A 128-bit vector of [2 x double].
2566	/// \param __b
2567	/// A 128-bit vector of [2 x double].
2568	/// \returns the CF flag in the EFLAGS register.
2569	static __inline int __DEFAULT_FN_ATTRS128
2570	_mm_testc_pd(__m128d __a, __m128d __b)
2571	{
2572	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2573	}
2574
2575	/// Given two 128-bit floating-point vectors of [2 x double], perform an
2576	/// element-by-element comparison of the double-precision element in the
2577	/// first source vector and the corresponding element in the second source
2578	/// vector.
2579	///
2580	/// The EFLAGS register is updated as follows: \n
2581	/// If there is at least one pair of double-precision elements where the
2582	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2583	/// ZF flag is set to 1. \n
2584	/// If there is at least one pair of double-precision elements where the
2585	/// sign-bit of the first element is 0 and the sign-bit of the second element
2586	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2587	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2588	/// otherwise it returns 0.
2589	///
2590	/// \headerfile <x86intrin.h>
2591	///
2592	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2593	///
2594	/// \param __a
2595	/// A 128-bit vector of [2 x double].
2596	/// \param __b
2597	/// A 128-bit vector of [2 x double].
2598	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2599	static __inline int __DEFAULT_FN_ATTRS128
2600	_mm_testnzc_pd(__m128d __a, __m128d __b)
2601	{
2602	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2603	}
2604
2605	/// Given two 128-bit floating-point vectors of [4 x float], perform an
2606	/// element-by-element comparison of the single-precision element in the
2607	/// first source vector and the corresponding element in the second source
2608	/// vector.
2609	///
2610	/// The EFLAGS register is updated as follows: \n
2611	/// If there is at least one pair of single-precision elements where the
2612	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2613	/// ZF flag is set to 1. \n
2614	/// If there is at least one pair of single-precision elements where the
2615	/// sign-bit of the first element is 0 and the sign-bit of the second element
2616	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2617	/// This intrinsic returns the value of the ZF flag.
2618	///
2619	/// \headerfile <x86intrin.h>
2620	///
2621	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2622	///
2623	/// \param __a
2624	/// A 128-bit vector of [4 x float].
2625	/// \param __b
2626	/// A 128-bit vector of [4 x float].
2627	/// \returns the ZF flag.
2628	static __inline int __DEFAULT_FN_ATTRS128
2629	_mm_testz_ps(__m128 __a, __m128 __b)
2630	{
2631	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2632	}
2633
2634	/// Given two 128-bit floating-point vectors of [4 x float], perform an
2635	/// element-by-element comparison of the single-precision element in the
2636	/// first source vector and the corresponding element in the second source
2637	/// vector.
2638	///
2639	/// The EFLAGS register is updated as follows: \n
2640	/// If there is at least one pair of single-precision elements where the
2641	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2642	/// ZF flag is set to 1. \n
2643	/// If there is at least one pair of single-precision elements where the
2644	/// sign-bit of the first element is 0 and the sign-bit of the second element
2645	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2646	/// This intrinsic returns the value of the CF flag.
2647	///
2648	/// \headerfile <x86intrin.h>
2649	///
2650	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2651	///
2652	/// \param __a
2653	/// A 128-bit vector of [4 x float].
2654	/// \param __b
2655	/// A 128-bit vector of [4 x float].
2656	/// \returns the CF flag.
2657	static __inline int __DEFAULT_FN_ATTRS128
2658	_mm_testc_ps(__m128 __a, __m128 __b)
2659	{
2660	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2661	}
2662
2663	/// Given two 128-bit floating-point vectors of [4 x float], perform an
2664	/// element-by-element comparison of the single-precision element in the
2665	/// first source vector and the corresponding element in the second source
2666	/// vector.
2667	///
2668	/// The EFLAGS register is updated as follows: \n
2669	/// If there is at least one pair of single-precision elements where the
2670	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2671	/// ZF flag is set to 1. \n
2672	/// If there is at least one pair of single-precision elements where the
2673	/// sign-bit of the first element is 0 and the sign-bit of the second element
2674	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2675	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2676	/// otherwise it returns 0.
2677	///
2678	/// \headerfile <x86intrin.h>
2679	///
2680	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2681	///
2682	/// \param __a
2683	/// A 128-bit vector of [4 x float].
2684	/// \param __b
2685	/// A 128-bit vector of [4 x float].
2686	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2687	static __inline int __DEFAULT_FN_ATTRS128
2688	_mm_testnzc_ps(__m128 __a, __m128 __b)
2689	{
2690	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2691	}
2692
2693	/// Given two 256-bit floating-point vectors of [4 x double], perform an
2694	/// element-by-element comparison of the double-precision elements in the
2695	/// first source vector and the corresponding elements in the second source
2696	/// vector.
2697	///
2698	/// The EFLAGS register is updated as follows: \n
2699	/// If there is at least one pair of double-precision elements where the
2700	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2701	/// ZF flag is set to 1. \n
2702	/// If there is at least one pair of double-precision elements where the
2703	/// sign-bit of the first element is 0 and the sign-bit of the second element
2704	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2705	/// This intrinsic returns the value of the ZF flag.
2706	///
2707	/// \headerfile <x86intrin.h>
2708	///
2709	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2710	///
2711	/// \param __a
2712	/// A 256-bit vector of [4 x double].
2713	/// \param __b
2714	/// A 256-bit vector of [4 x double].
2715	/// \returns the ZF flag.
2716	static __inline int __DEFAULT_FN_ATTRS
2717	_mm256_testz_pd(__m256d __a, __m256d __b)
2718	{
2719	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2720	}
2721
2722	/// Given two 256-bit floating-point vectors of [4 x double], perform an
2723	/// element-by-element comparison of the double-precision elements in the
2724	/// first source vector and the corresponding elements in the second source
2725	/// vector.
2726	///
2727	/// The EFLAGS register is updated as follows: \n
2728	/// If there is at least one pair of double-precision elements where the
2729	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2730	/// ZF flag is set to 1. \n
2731	/// If there is at least one pair of double-precision elements where the
2732	/// sign-bit of the first element is 0 and the sign-bit of the second element
2733	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2734	/// This intrinsic returns the value of the CF flag.
2735	///
2736	/// \headerfile <x86intrin.h>
2737	///
2738	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2739	///
2740	/// \param __a
2741	/// A 256-bit vector of [4 x double].
2742	/// \param __b
2743	/// A 256-bit vector of [4 x double].
2744	/// \returns the CF flag.
2745	static __inline int __DEFAULT_FN_ATTRS
2746	_mm256_testc_pd(__m256d __a, __m256d __b)
2747	{
2748	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2749	}
2750
2751	/// Given two 256-bit floating-point vectors of [4 x double], perform an
2752	/// element-by-element comparison of the double-precision elements in the
2753	/// first source vector and the corresponding elements in the second source
2754	/// vector.
2755	///
2756	/// The EFLAGS register is updated as follows: \n
2757	/// If there is at least one pair of double-precision elements where the
2758	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2759	/// ZF flag is set to 1. \n
2760	/// If there is at least one pair of double-precision elements where the
2761	/// sign-bit of the first element is 0 and the sign-bit of the second element
2762	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2763	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2764	/// otherwise it returns 0.
2765	///
2766	/// \headerfile <x86intrin.h>
2767	///
2768	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2769	///
2770	/// \param __a
2771	/// A 256-bit vector of [4 x double].
2772	/// \param __b
2773	/// A 256-bit vector of [4 x double].
2774	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2775	static __inline int __DEFAULT_FN_ATTRS
2776	_mm256_testnzc_pd(__m256d __a, __m256d __b)
2777	{
2778	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2779	}
2780
2781	/// Given two 256-bit floating-point vectors of [8 x float], perform an
2782	/// element-by-element comparison of the single-precision element in the
2783	/// first source vector and the corresponding element in the second source
2784	/// vector.
2785	///
2786	/// The EFLAGS register is updated as follows: \n
2787	/// If there is at least one pair of single-precision elements where the
2788	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2789	/// ZF flag is set to 1. \n
2790	/// If there is at least one pair of single-precision elements where the
2791	/// sign-bit of the first element is 0 and the sign-bit of the second element
2792	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2793	/// This intrinsic returns the value of the ZF flag.
2794	///
2795	/// \headerfile <x86intrin.h>
2796	///
2797	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2798	///
2799	/// \param __a
2800	/// A 256-bit vector of [8 x float].
2801	/// \param __b
2802	/// A 256-bit vector of [8 x float].
2803	/// \returns the ZF flag.
2804	static __inline int __DEFAULT_FN_ATTRS
2805	_mm256_testz_ps(__m256 __a, __m256 __b)
2806	{
2807	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2808	}
2809
2810	/// Given two 256-bit floating-point vectors of [8 x float], perform an
2811	/// element-by-element comparison of the single-precision element in the
2812	/// first source vector and the corresponding element in the second source
2813	/// vector.
2814	///
2815	/// The EFLAGS register is updated as follows: \n
2816	/// If there is at least one pair of single-precision elements where the
2817	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2818	/// ZF flag is set to 1. \n
2819	/// If there is at least one pair of single-precision elements where the
2820	/// sign-bit of the first element is 0 and the sign-bit of the second element
2821	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2822	/// This intrinsic returns the value of the CF flag.
2823	///
2824	/// \headerfile <x86intrin.h>
2825	///
2826	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2827	///
2828	/// \param __a
2829	/// A 256-bit vector of [8 x float].
2830	/// \param __b
2831	/// A 256-bit vector of [8 x float].
2832	/// \returns the CF flag.
2833	static __inline int __DEFAULT_FN_ATTRS
2834	_mm256_testc_ps(__m256 __a, __m256 __b)
2835	{
2836	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2837	}
2838
2839	/// Given two 256-bit floating-point vectors of [8 x float], perform an
2840	/// element-by-element comparison of the single-precision elements in the
2841	/// first source vector and the corresponding elements in the second source
2842	/// vector.
2843	///
2844	/// The EFLAGS register is updated as follows: \n
2845	/// If there is at least one pair of single-precision elements where the
2846	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2847	/// ZF flag is set to 1. \n
2848	/// If there is at least one pair of single-precision elements where the
2849	/// sign-bit of the first element is 0 and the sign-bit of the second element
2850	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2851	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2852	/// otherwise it returns 0.
2853	///
2854	/// \headerfile <x86intrin.h>
2855	///
2856	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2857	///
2858	/// \param __a
2859	/// A 256-bit vector of [8 x float].
2860	/// \param __b
2861	/// A 256-bit vector of [8 x float].
2862	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2863	static __inline int __DEFAULT_FN_ATTRS
2864	_mm256_testnzc_ps(__m256 __a, __m256 __b)
2865	{
2866	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2867	}
2868
2869	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2870	/// of the two source vectors.
2871	///
2872	/// The EFLAGS register is updated as follows: \n
2873	/// If there is at least one pair of bits where both bits are 1, the ZF flag
2874	/// is set to 0. Otherwise the ZF flag is set to 1. \n
2875	/// If there is at least one pair of bits where the bit from the first source
2876	/// vector is 0 and the bit from the second source vector is 1, the CF flag
2877	/// is set to 0. Otherwise the CF flag is set to 1. \n
2878	/// This intrinsic returns the value of the ZF flag.
2879	///
2880	/// \headerfile <x86intrin.h>
2881	///
2882	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2883	///
2884	/// \param __a
2885	/// A 256-bit integer vector.
2886	/// \param __b
2887	/// A 256-bit integer vector.
2888	/// \returns the ZF flag.
2889	static __inline int __DEFAULT_FN_ATTRS
2890	_mm256_testz_si256(__m256i __a, __m256i __b)
2891	{
2892	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2893	}
2894
2895	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2896	/// of the two source vectors.
2897	///
2898	/// The EFLAGS register is updated as follows: \n
2899	/// If there is at least one pair of bits where both bits are 1, the ZF flag
2900	/// is set to 0. Otherwise the ZF flag is set to 1. \n
2901	/// If there is at least one pair of bits where the bit from the first source
2902	/// vector is 0 and the bit from the second source vector is 1, the CF flag
2903	/// is set to 0. Otherwise the CF flag is set to 1. \n
2904	/// This intrinsic returns the value of the CF flag.
2905	///
2906	/// \headerfile <x86intrin.h>
2907	///
2908	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2909	///
2910	/// \param __a
2911	/// A 256-bit integer vector.
2912	/// \param __b
2913	/// A 256-bit integer vector.
2914	/// \returns the CF flag.
2915	static __inline int __DEFAULT_FN_ATTRS
2916	_mm256_testc_si256(__m256i __a, __m256i __b)
2917	{
2918	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2919	}
2920
2921	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2922	/// of the two source vectors.
2923	///
2924	/// The EFLAGS register is updated as follows: \n
2925	/// If there is at least one pair of bits where both bits are 1, the ZF flag
2926	/// is set to 0. Otherwise the ZF flag is set to 1. \n
2927	/// If there is at least one pair of bits where the bit from the first source
2928	/// vector is 0 and the bit from the second source vector is 1, the CF flag
2929	/// is set to 0. Otherwise the CF flag is set to 1. \n
2930	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2931	/// otherwise it returns 0.
2932	///
2933	/// \headerfile <x86intrin.h>
2934	///
2935	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2936	///
2937	/// \param __a
2938	/// A 256-bit integer vector.
2939	/// \param __b
2940	/// A 256-bit integer vector.
2941	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2942	static __inline int __DEFAULT_FN_ATTRS
2943	_mm256_testnzc_si256(__m256i __a, __m256i __b)
2944	{
2945	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2946	}
2947
2948	/* Vector extract sign mask */
2949	/// Extracts the sign bits of double-precision floating point elements
2950	/// in a 256-bit vector of [4 x double] and writes them to the lower order
2951	/// bits of the return value.
2952	///
2953	/// \headerfile <x86intrin.h>
2954	///
2955	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2956	///
2957	/// \param __a
2958	/// A 256-bit vector of [4 x double] containing the double-precision
2959	/// floating point values with sign bits to be extracted.
2960	/// \returns The sign bits from the operand, written to bits [3:0].
2961	static __inline int __DEFAULT_FN_ATTRS
2962	_mm256_movemask_pd(__m256d __a)
2963	{
2964	return __builtin_ia32_movmskpd256((__v4df)__a);
2965	}
2966
2967	/// Extracts the sign bits of single-precision floating point elements
2968	/// in a 256-bit vector of [8 x float] and writes them to the lower order
2969	/// bits of the return value.
2970	///
2971	/// \headerfile <x86intrin.h>
2972	///
2973	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2974	///
2975	/// \param __a
2976	/// A 256-bit vector of [8 x float] containing the single-precision floating
2977	/// point values with sign bits to be extracted.
2978	/// \returns The sign bits from the operand, written to bits [7:0].
2979	static __inline int __DEFAULT_FN_ATTRS
2980	_mm256_movemask_ps(__m256 __a)
2981	{
2982	return __builtin_ia32_movmskps256((__v8sf)__a);
2983	}
2984
/* Vector zero */
/// Clears the full contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* Takes no operands and produces no value; the effect is entirely on
     the vector register file. */
  __builtin_ia32_vzeroall();
}
2996
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* Takes no operands and produces no value; the effect is entirely on
     the vector register file. */
  __builtin_ia32_vzeroupper();
}
3007
3008	/* Vector load with broadcast */
3009	/// Loads a scalar single-precision floating point value from the
3010	/// specified address pointed to by \a __a and broadcasts it to the elements
3011	/// of a [4 x float] vector.
3012	///
3013	/// \headerfile <x86intrin.h>
3014	///
3015	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3016	///
3017	/// \param __a
3018	/// The single-precision floating point value to be broadcast.
3019	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3020	/// equal to the broadcast value.
3021	static __inline __m128 __DEFAULT_FN_ATTRS128
3022	_mm_broadcast_ss(float const *__a)
3023	{
3024	struct __mm_broadcast_ss_struct {
3025	float __f;
3026	} __attribute__((__packed__, __may_alias__));
3027	float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3028	return __extension__ (__m128){ __f, __f, __f, __f };
3029	}
3030
3031	/// Loads a scalar double-precision floating point value from the
3032	/// specified address pointed to by \a __a and broadcasts it to the elements
3033	/// of a [4 x double] vector.
3034	///
3035	/// \headerfile <x86intrin.h>
3036	///
3037	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3038	///
3039	/// \param __a
3040	/// The double-precision floating point value to be broadcast.
3041	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3042	/// equal to the broadcast value.
3043	static __inline __m256d __DEFAULT_FN_ATTRS
3044	_mm256_broadcast_sd(double const *__a)
3045	{
3046	struct __mm256_broadcast_sd_struct {
3047	double __d;
3048	} __attribute__((__packed__, __may_alias__));
3049	double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3050	return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3051	}
3052
3053	/// Loads a scalar single-precision floating point value from the
3054	/// specified address pointed to by \a __a and broadcasts it to the elements
3055	/// of a [8 x float] vector.
3056	///
3057	/// \headerfile <x86intrin.h>
3058	///
3059	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3060	///
3061	/// \param __a
3062	/// The single-precision floating point value to be broadcast.
3063	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3064	/// equal to the broadcast value.
3065	static __inline __m256 __DEFAULT_FN_ATTRS
3066	_mm256_broadcast_ss(float const *__a)
3067	{
3068	struct __mm256_broadcast_ss_struct {
3069	float __f;
3070	} __attribute__((__packed__, __may_alias__));
3071	float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3072	return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3073	}
3074
3075	/// Loads the data from a 128-bit vector of [2 x double] from the
3076	/// specified address pointed to by \a __a and broadcasts it to 128-bit
3077	/// elements in a 256-bit vector of [4 x double].
3078	///
3079	/// \headerfile <x86intrin.h>
3080	///
3081	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3082	///
3083	/// \param __a
3084	/// The 128-bit vector of [2 x double] to be broadcast.
3085	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3086	/// equal to the broadcast value.
3087	static __inline __m256d __DEFAULT_FN_ATTRS
3088	_mm256_broadcast_pd(__m128d const *__a)
3089	{
3090	__m128d __b = _mm_loadu_pd((const double *)__a);
3091	return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3092	0, 1, 0, 1);
3093	}
3094
3095	/// Loads the data from a 128-bit vector of [4 x float] from the
3096	/// specified address pointed to by \a __a and broadcasts it to 128-bit
3097	/// elements in a 256-bit vector of [8 x float].
3098	///
3099	/// \headerfile <x86intrin.h>
3100	///
3101	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3102	///
3103	/// \param __a
3104	/// The 128-bit vector of [4 x float] to be broadcast.
3105	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3106	/// equal to the broadcast value.
3107	static __inline __m256 __DEFAULT_FN_ATTRS
3108	_mm256_broadcast_ps(__m128 const *__a)
3109	{
3110	__m128 __b = _mm_loadu_ps((const float *)__a);
3111	return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3112	0, 1, 2, 3, 0, 1, 2, 3);
3113	}
3114
3115	/* SIMD load ops */
3116	/// Loads 4 double-precision floating point values from a 32-byte aligned
3117	/// memory location pointed to by \a __p into a vector of [4 x double].
3118	///
3119	/// \headerfile <x86intrin.h>
3120	///
3121	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3122	///
3123	/// \param __p
3124	/// A 32-byte aligned pointer to a memory location containing
3125	/// double-precision floating point values.
3126	/// \returns A 256-bit vector of [4 x double] containing the moved values.
3127	static __inline __m256d __DEFAULT_FN_ATTRS
3128	_mm256_load_pd(double const *__p)
3129	{
3130	return (const __m256d )__p;
3131	}
3132
3133	/// Loads 8 single-precision floating point values from a 32-byte aligned
3134	/// memory location pointed to by \a __p into a vector of [8 x float].
3135	///
3136	/// \headerfile <x86intrin.h>
3137	///
3138	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3139	///
3140	/// \param __p
3141	/// A 32-byte aligned pointer to a memory location containing float values.
3142	/// \returns A 256-bit vector of [8 x float] containing the moved values.
3143	static __inline __m256 __DEFAULT_FN_ATTRS
3144	_mm256_load_ps(float const *__p)
3145	{
3146	return (const __m256 )__p;
3147	}
3148
3149	/// Loads 4 double-precision floating point values from an unaligned
3150	/// memory location pointed to by \a __p into a vector of [4 x double].
3151	///
3152	/// \headerfile <x86intrin.h>
3153	///
3154	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3155	///
3156	/// \param __p
3157	/// A pointer to a memory location containing double-precision floating
3158	/// point values.
3159	/// \returns A 256-bit vector of [4 x double] containing the moved values.
3160	static __inline __m256d __DEFAULT_FN_ATTRS
3161	_mm256_loadu_pd(double const *__p)
3162	{
3163	struct __loadu_pd {
3164	__m256d_u __v;
3165	} __attribute__((__packed__, __may_alias__));
3166	return ((const struct __loadu_pd*)__p)->__v;
3167	}
3168
3169	/// Loads 8 single-precision floating point values from an unaligned
3170	/// memory location pointed to by \a __p into a vector of [8 x float].
3171	///
3172	/// \headerfile <x86intrin.h>
3173	///
3174	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3175	///
3176	/// \param __p
3177	/// A pointer to a memory location containing single-precision floating
3178	/// point values.
3179	/// \returns A 256-bit vector of [8 x float] containing the moved values.
3180	static __inline __m256 __DEFAULT_FN_ATTRS
3181	_mm256_loadu_ps(float const *__p)
3182	{
3183	struct __loadu_ps {
3184	__m256_u __v;
3185	} __attribute__((__packed__, __may_alias__));
3186	return ((const struct __loadu_ps*)__p)->__v;
3187	}
3188
3189	/// Loads 256 bits of integer data from a 32-byte aligned memory
3190	/// location pointed to by \a __p into elements of a 256-bit integer vector.
3191	///
3192	/// \headerfile <x86intrin.h>
3193	///
3194	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3195	///
3196	/// \param __p
3197	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3198	/// values.
3199	/// \returns A 256-bit integer vector containing the moved values.
3200	static __inline __m256i __DEFAULT_FN_ATTRS
3201	_mm256_load_si256(__m256i const *__p)
3202	{
3203	return *__p;
3204	}
3205
3206	/// Loads 256 bits of integer data from an unaligned memory location
3207	/// pointed to by \a __p into a 256-bit integer vector.
3208	///
3209	/// \headerfile <x86intrin.h>
3210	///
3211	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3212	///
3213	/// \param __p
3214	/// A pointer to a 256-bit integer vector containing integer values.
3215	/// \returns A 256-bit integer vector containing the moved values.
3216	static __inline __m256i __DEFAULT_FN_ATTRS
3217	_mm256_loadu_si256(__m256i_u const *__p)
3218	{
3219	struct __loadu_si256 {
3220	__m256i_u __v;
3221	} __attribute__((__packed__, __may_alias__));
3222	return ((const struct __loadu_si256*)__p)->__v;
3223	}
3224
3225	/// Loads 256 bits of integer data from an unaligned memory location
3226	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3227	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3228	/// line boundary.
3229	///
3230	/// \headerfile <x86intrin.h>
3231	///
3232	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3233	///
3234	/// \param __p
3235	/// A pointer to a 256-bit integer vector containing integer values.
3236	/// \returns A 256-bit integer vector containing the moved values.
3237	static __inline __m256i __DEFAULT_FN_ATTRS
3238	_mm256_lddqu_si256(__m256i_u const *__p)
3239	{
3240	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3241	}
3242
3243	/* SIMD store ops */
3244	/// Stores double-precision floating point values from a 256-bit vector
3245	/// of [4 x double] to a 32-byte aligned memory location pointed to by
3246	/// \a __p.
3247	///
3248	/// \headerfile <x86intrin.h>
3249	///
3250	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3251	///
3252	/// \param __p
3253	/// A 32-byte aligned pointer to a memory location that will receive the
3254	/// double-precision floaing point values.
3255	/// \param __a
3256	/// A 256-bit vector of [4 x double] containing the values to be moved.
3257	static __inline void __DEFAULT_FN_ATTRS
3258	_mm256_store_pd(double *__p, __m256d __a)
3259	{
3260	(__m256d )__p = __a;
3261	}
3262
3263	/// Stores single-precision floating point values from a 256-bit vector
3264	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3265	///
3266	/// \headerfile <x86intrin.h>
3267	///
3268	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3269	///
3270	/// \param __p
3271	/// A 32-byte aligned pointer to a memory location that will receive the
3272	/// float values.
3273	/// \param __a
3274	/// A 256-bit vector of [8 x float] containing the values to be moved.
3275	static __inline void __DEFAULT_FN_ATTRS
3276	_mm256_store_ps(float *__p, __m256 __a)
3277	{
3278	(__m256 )__p = __a;
3279	}
3280
3281	/// Stores double-precision floating point values from a 256-bit vector
3282	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3283	///
3284	/// \headerfile <x86intrin.h>
3285	///
3286	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3287	///
3288	/// \param __p
3289	/// A pointer to a memory location that will receive the double-precision
3290	/// floating point values.
3291	/// \param __a
3292	/// A 256-bit vector of [4 x double] containing the values to be moved.
3293	static __inline void __DEFAULT_FN_ATTRS
3294	_mm256_storeu_pd(double *__p, __m256d __a)
3295	{
3296	struct __storeu_pd {
3297	__m256d_u __v;
3298	} __attribute__((__packed__, __may_alias__));
3299	((struct __storeu_pd*)__p)->__v = __a;
3300	}
3301
3302	/// Stores single-precision floating point values from a 256-bit vector
3303	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3304	///
3305	/// \headerfile <x86intrin.h>
3306	///
3307	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3308	///
3309	/// \param __p
3310	/// A pointer to a memory location that will receive the float values.
3311	/// \param __a
3312	/// A 256-bit vector of [8 x float] containing the values to be moved.
3313	static __inline void __DEFAULT_FN_ATTRS
3314	_mm256_storeu_ps(float *__p, __m256 __a)
3315	{
3316	struct __storeu_ps {
3317	__m256_u __v;
3318	} __attribute__((__packed__, __may_alias__));
3319	((struct __storeu_ps*)__p)->__v = __a;
3320	}
3321
3322	/// Stores integer values from a 256-bit integer vector to a 32-byte
3323	/// aligned memory location pointed to by \a __p.
3324	///
3325	/// \headerfile <x86intrin.h>
3326	///
3327	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3328	///
3329	/// \param __p
3330	/// A 32-byte aligned pointer to a memory location that will receive the
3331	/// integer values.
3332	/// \param __a
3333	/// A 256-bit integer vector containing the values to be moved.
3334	static __inline void __DEFAULT_FN_ATTRS
3335	_mm256_store_si256(__m256i *__p, __m256i __a)
3336	{
3337	*__p = __a;
3338	}
3339
3340	/// Stores integer values from a 256-bit integer vector to an unaligned
3341	/// memory location pointed to by \a __p.
3342	///
3343	/// \headerfile <x86intrin.h>
3344	///
3345	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3346	///
3347	/// \param __p
3348	/// A pointer to a memory location that will receive the integer values.
3349	/// \param __a
3350	/// A 256-bit integer vector containing the values to be moved.
3351	static __inline void __DEFAULT_FN_ATTRS
3352	_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3353	{
3354	struct __storeu_si256 {
3355	__m256i_u __v;
3356	} __attribute__((__packed__, __may_alias__));
3357	((struct __storeu_si256*)__p)->__v = __a;
3358	}
3359
3360	/* Conditional load ops */
3361	/// Conditionally loads double-precision floating point elements from a
3362	/// memory location pointed to by \a __p into a 128-bit vector of
3363	/// [2 x double], depending on the mask bits associated with each data
3364	/// element.
3365	///
3366	/// \headerfile <x86intrin.h>
3367	///
3368	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3369	///
3370	/// \param __p
3371	/// A pointer to a memory location that contains the double-precision
3372	/// floating point values.
3373	/// \param __m
3374	/// A 128-bit integer vector containing the mask. The most significant bit of
3375	/// each data element represents the mask bits. If a mask bit is zero, the
3376	/// corresponding value in the memory location is not loaded and the
3377	/// corresponding field in the return value is set to zero.
3378	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3379	static __inline __m128d __DEFAULT_FN_ATTRS128
3380	_mm_maskload_pd(double const *__p, __m128i __m)
3381	{
3382	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3383	}
3384
3385	/// Conditionally loads double-precision floating point elements from a
3386	/// memory location pointed to by \a __p into a 256-bit vector of
3387	/// [4 x double], depending on the mask bits associated with each data
3388	/// element.
3389	///
3390	/// \headerfile <x86intrin.h>
3391	///
3392	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3393	///
3394	/// \param __p
3395	/// A pointer to a memory location that contains the double-precision
3396	/// floating point values.
3397	/// \param __m
3398	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3399	/// significant bit of each quadword element represents the mask bits. If a
3400	/// mask bit is zero, the corresponding value in the memory location is not
3401	/// loaded and the corresponding field in the return value is set to zero.
3402	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3403	static __inline __m256d __DEFAULT_FN_ATTRS
3404	_mm256_maskload_pd(double const *__p, __m256i __m)
3405	{
3406	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3407	(__v4di)__m);
3408	}
3409
3410	/// Conditionally loads single-precision floating point elements from a
3411	/// memory location pointed to by \a __p into a 128-bit vector of
3412	/// [4 x float], depending on the mask bits associated with each data
3413	/// element.
3414	///
3415	/// \headerfile <x86intrin.h>
3416	///
3417	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3418	///
3419	/// \param __p
3420	/// A pointer to a memory location that contains the single-precision
3421	/// floating point values.
3422	/// \param __m
3423	/// A 128-bit integer vector containing the mask. The most significant bit of
3424	/// each data element represents the mask bits. If a mask bit is zero, the
3425	/// corresponding value in the memory location is not loaded and the
3426	/// corresponding field in the return value is set to zero.
3427	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3428	static __inline __m128 __DEFAULT_FN_ATTRS128
3429	_mm_maskload_ps(float const *__p, __m128i __m)
3430	{
3431	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3432	}
3433
3434	/// Conditionally loads single-precision floating point elements from a
3435	/// memory location pointed to by \a __p into a 256-bit vector of
3436	/// [8 x float], depending on the mask bits associated with each data
3437	/// element.
3438	///
3439	/// \headerfile <x86intrin.h>
3440	///
3441	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3442	///
3443	/// \param __p
3444	/// A pointer to a memory location that contains the single-precision
3445	/// floating point values.
3446	/// \param __m
3447	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3448	/// significant bit of each dword element represents the mask bits. If a mask
3449	/// bit is zero, the corresponding value in the memory location is not loaded
3450	/// and the corresponding field in the return value is set to zero.
3451	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3452	static __inline __m256 __DEFAULT_FN_ATTRS
3453	_mm256_maskload_ps(float const *__p, __m256i __m)
3454	{
3455	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3456	}
3457
3458	/* Conditional store ops */
3459	/// Moves single-precision floating point values from a 256-bit vector
3460	/// of [8 x float] to a memory location pointed to by \a __p, according to
3461	/// the specified mask.
3462	///
3463	/// \headerfile <x86intrin.h>
3464	///
3465	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3466	///
3467	/// \param __p
3468	/// A pointer to a memory location that will receive the float values.
3469	/// \param __m
3470	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3471	/// significant bit of each dword element in the mask vector represents the
3472	/// mask bits. If a mask bit is zero, the corresponding value from vector
3473	/// \a __a is not stored and the corresponding field in the memory location
3474	/// pointed to by \a __p is not changed.
3475	/// \param __a
3476	/// A 256-bit vector of [8 x float] containing the values to be stored.
3477	static __inline void __DEFAULT_FN_ATTRS
3478	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3479	{
3480	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3481	}
3482
3483	/// Moves double-precision values from a 128-bit vector of [2 x double]
3484	/// to a memory location pointed to by \a __p, according to the specified
3485	/// mask.
3486	///
3487	/// \headerfile <x86intrin.h>
3488	///
3489	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3490	///
3491	/// \param __p
3492	/// A pointer to a memory location that will receive the float values.
3493	/// \param __m
3494	/// A 128-bit integer vector containing the mask. The most significant bit of
3495	/// each field in the mask vector represents the mask bits. If a mask bit is
3496	/// zero, the corresponding value from vector \a __a is not stored and the
3497	/// corresponding field in the memory location pointed to by \a __p is not
3498	/// changed.
3499	/// \param __a
3500	/// A 128-bit vector of [2 x double] containing the values to be stored.
3501	static __inline void __DEFAULT_FN_ATTRS128
3502	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3503	{
3504	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3505	}
3506
3507	/// Moves double-precision values from a 256-bit vector of [4 x double]
3508	/// to a memory location pointed to by \a __p, according to the specified
3509	/// mask.
3510	///
3511	/// \headerfile <x86intrin.h>
3512	///
3513	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3514	///
3515	/// \param __p
3516	/// A pointer to a memory location that will receive the float values.
3517	/// \param __m
3518	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3519	/// significant bit of each quadword element in the mask vector represents
3520	/// the mask bits. If a mask bit is zero, the corresponding value from vector
3521	/// __a is not stored and the corresponding field in the memory location
3522	/// pointed to by \a __p is not changed.
3523	/// \param __a
3524	/// A 256-bit vector of [4 x double] containing the values to be stored.
3525	static __inline void __DEFAULT_FN_ATTRS
3526	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3527	{
3528	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3529	}
3530
3531	/// Moves single-precision floating point values from a 128-bit vector
3532	/// of [4 x float] to a memory location pointed to by \a __p, according to
3533	/// the specified mask.
3534	///
3535	/// \headerfile <x86intrin.h>
3536	///
3537	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3538	///
3539	/// \param __p
3540	/// A pointer to a memory location that will receive the float values.
3541	/// \param __m
3542	/// A 128-bit integer vector containing the mask. The most significant bit of
3543	/// each field in the mask vector represents the mask bits. If a mask bit is
3544	/// zero, the corresponding value from vector __a is not stored and the
3545	/// corresponding field in the memory location pointed to by \a __p is not
3546	/// changed.
3547	/// \param __a
3548	/// A 128-bit vector of [4 x float] containing the values to be stored.
3549	static __inline void __DEFAULT_FN_ATTRS128
3550	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3551	{
3552	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3553	}
3554
3555	/* Cacheability support ops */
3556	/// Moves integer data from a 256-bit integer vector to a 32-byte
3557	/// aligned memory location. To minimize caching, the data is flagged as
3558	/// non-temporal (unlikely to be used again soon).
3559	///
3560	/// \headerfile <x86intrin.h>
3561	///
3562	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3563	///
3564	/// \param __a
3565	/// A pointer to a 32-byte aligned memory location that will receive the
3566	/// integer values.
3567	/// \param __b
3568	/// A 256-bit integer vector containing the values to be moved.
3569	static __inline void __DEFAULT_FN_ATTRS
3570	_mm256_stream_si256(void *__a, __m256i __b)
3571	{
3572	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3573	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3574	}
3575
3576	/// Moves double-precision values from a 256-bit vector of [4 x double]
3577	/// to a 32-byte aligned memory location. To minimize caching, the data is
3578	/// flagged as non-temporal (unlikely to be used again soon).
3579	///
3580	/// \headerfile <x86intrin.h>
3581	///
3582	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3583	///
3584	/// \param __a
3585	/// A pointer to a 32-byte aligned memory location that will receive the
3586	/// double-precision floating-point values.
3587	/// \param __b
3588	/// A 256-bit vector of [4 x double] containing the values to be moved.
3589	static __inline void __DEFAULT_FN_ATTRS
3590	_mm256_stream_pd(void *__a, __m256d __b)
3591	{
3592	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3593	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3594	}
3595
3596	/// Moves single-precision floating point values from a 256-bit vector
3597	/// of [8 x float] to a 32-byte aligned memory location. To minimize
3598	/// caching, the data is flagged as non-temporal (unlikely to be used again
3599	/// soon).
3600	///
3601	/// \headerfile <x86intrin.h>
3602	///
3603	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3604	///
3605	/// \param __p
3606	/// A pointer to a 32-byte aligned memory location that will receive the
3607	/// single-precision floating point values.
3608	/// \param __a
3609	/// A 256-bit vector of [8 x float] containing the values to be moved.
3610	static __inline void __DEFAULT_FN_ATTRS
3611	_mm256_stream_ps(void *__p, __m256 __a)
3612	{
3613	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3614	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3615	}
3616
3617	/* Create vectors */
3618	/// Create a 256-bit vector of [4 x double] with undefined values.
3619	///
3620	/// \headerfile <x86intrin.h>
3621	///
3622	/// This intrinsic has no corresponding instruction.
3623	///
3624	/// \returns A 256-bit vector of [4 x double] containing undefined values.
3625	static __inline__ __m256d __DEFAULT_FN_ATTRS
3626	_mm256_undefined_pd(void)
3627	{
3628	return (__m256d)__builtin_ia32_undef256();
3629	}
3630
3631	/// Create a 256-bit vector of [8 x float] with undefined values.
3632	///
3633	/// \headerfile <x86intrin.h>
3634	///
3635	/// This intrinsic has no corresponding instruction.
3636	///
3637	/// \returns A 256-bit vector of [8 x float] containing undefined values.
3638	static __inline__ __m256 __DEFAULT_FN_ATTRS
3639	_mm256_undefined_ps(void)
3640	{
3641	return (__m256)__builtin_ia32_undef256();
3642	}
3643
3644	/// Create a 256-bit integer vector with undefined values.
3645	///
3646	/// \headerfile <x86intrin.h>
3647	///
3648	/// This intrinsic has no corresponding instruction.
3649	///
3650	/// \returns A 256-bit integer vector containing undefined values.
3651	static __inline__ __m256i __DEFAULT_FN_ATTRS
3652	_mm256_undefined_si256(void)
3653	{
3654	return (__m256i)__builtin_ia32_undef256();
3655	}
3656
3657	/// Constructs a 256-bit floating-point vector of [4 x double]
3658	/// initialized with the specified double-precision floating-point values.
3659	///
3660	/// \headerfile <x86intrin.h>
3661	///
3662	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3663	/// instruction.
3664	///
3665	/// \param __a
3666	/// A double-precision floating-point value used to initialize bits [255:192]
3667	/// of the result.
3668	/// \param __b
3669	/// A double-precision floating-point value used to initialize bits [191:128]
3670	/// of the result.
3671	/// \param __c
3672	/// A double-precision floating-point value used to initialize bits [127:64]
3673	/// of the result.
3674	/// \param __d
3675	/// A double-precision floating-point value used to initialize bits [63:0]
3676	/// of the result.
3677	/// \returns An initialized 256-bit floating-point vector of [4 x double].
3678	static __inline __m256d __DEFAULT_FN_ATTRS
3679	_mm256_set_pd(double __a, double __b, double __c, double __d)
3680	{
3681	return __extension__ (__m256d){ __d, __c, __b, __a };
3682	}
3683
3684	/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3685	/// with the specified single-precision floating-point values.
3686	///
3687	/// \headerfile <x86intrin.h>
3688	///
3689	/// This intrinsic is a utility function and does not correspond to a specific
3690	/// instruction.
3691	///
3692	/// \param __a
3693	/// A single-precision floating-point value used to initialize bits [255:224]
3694	/// of the result.
3695	/// \param __b
3696	/// A single-precision floating-point value used to initialize bits [223:192]
3697	/// of the result.
3698	/// \param __c
3699	/// A single-precision floating-point value used to initialize bits [191:160]
3700	/// of the result.
3701	/// \param __d
3702	/// A single-precision floating-point value used to initialize bits [159:128]
3703	/// of the result.
3704	/// \param __e
3705	/// A single-precision floating-point value used to initialize bits [127:96]
3706	/// of the result.
3707	/// \param __f
3708	/// A single-precision floating-point value used to initialize bits [95:64]
3709	/// of the result.
3710	/// \param __g
3711	/// A single-precision floating-point value used to initialize bits [63:32]
3712	/// of the result.
3713	/// \param __h
3714	/// A single-precision floating-point value used to initialize bits [31:0]
3715	/// of the result.
3716	/// \returns An initialized 256-bit floating-point vector of [8 x float].
3717	static __inline __m256 __DEFAULT_FN_ATTRS
3718	_mm256_set_ps(float __a, float __b, float __c, float __d,
3719	float __e, float __f, float __g, float __h)
3720	{
3721	return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3722	}
3723
3724	/// Constructs a 256-bit integer vector initialized with the specified
3725	/// 32-bit integral values.
3726	///
3727	/// \headerfile <x86intrin.h>
3728	///
3729	/// This intrinsic is a utility function and does not correspond to a specific
3730	/// instruction.
3731	///
3732	/// \param __i0
3733	/// A 32-bit integral value used to initialize bits [255:224] of the result.
3734	/// \param __i1
3735	/// A 32-bit integral value used to initialize bits [223:192] of the result.
3736	/// \param __i2
3737	/// A 32-bit integral value used to initialize bits [191:160] of the result.
3738	/// \param __i3
3739	/// A 32-bit integral value used to initialize bits [159:128] of the result.
3740	/// \param __i4
3741	/// A 32-bit integral value used to initialize bits [127:96] of the result.
3742	/// \param __i5
3743	/// A 32-bit integral value used to initialize bits [95:64] of the result.
3744	/// \param __i6
3745	/// A 32-bit integral value used to initialize bits [63:32] of the result.
3746	/// \param __i7
3747	/// A 32-bit integral value used to initialize bits [31:0] of the result.
3748	/// \returns An initialized 256-bit integer vector.
3749	static __inline __m256i __DEFAULT_FN_ATTRS
3750	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3751	int __i4, int __i5, int __i6, int __i7)
3752	{
3753	return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3754	}
3755
3756	/// Constructs a 256-bit integer vector initialized with the specified
3757	/// 16-bit integral values.
3758	///
3759	/// \headerfile <x86intrin.h>
3760	///
3761	/// This intrinsic is a utility function and does not correspond to a specific
3762	/// instruction.
3763	///
3764	/// \param __w15
3765	/// A 16-bit integral value used to initialize bits [255:240] of the result.
3766	/// \param __w14
3767	/// A 16-bit integral value used to initialize bits [239:224] of the result.
3768	/// \param __w13
3769	/// A 16-bit integral value used to initialize bits [223:208] of the result.
3770	/// \param __w12
3771	/// A 16-bit integral value used to initialize bits [207:192] of the result.
3772	/// \param __w11
3773	/// A 16-bit integral value used to initialize bits [191:176] of the result.
3774	/// \param __w10
3775	/// A 16-bit integral value used to initialize bits [175:160] of the result.
3776	/// \param __w09
3777	/// A 16-bit integral value used to initialize bits [159:144] of the result.
3778	/// \param __w08
3779	/// A 16-bit integral value used to initialize bits [143:128] of the result.
3780	/// \param __w07
3781	/// A 16-bit integral value used to initialize bits [127:112] of the result.
3782	/// \param __w06
3783	/// A 16-bit integral value used to initialize bits [111:96] of the result.
3784	/// \param __w05
3785	/// A 16-bit integral value used to initialize bits [95:80] of the result.
3786	/// \param __w04
3787	/// A 16-bit integral value used to initialize bits [79:64] of the result.
3788	/// \param __w03
3789	/// A 16-bit integral value used to initialize bits [63:48] of the result.
3790	/// \param __w02
3791	/// A 16-bit integral value used to initialize bits [47:32] of the result.
3792	/// \param __w01
3793	/// A 16-bit integral value used to initialize bits [31:16] of the result.
3794	/// \param __w00
3795	/// A 16-bit integral value used to initialize bits [15:0] of the result.
3796	/// \returns An initialized 256-bit integer vector.
3797	static __inline __m256i __DEFAULT_FN_ATTRS
3798	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3799	short __w11, short __w10, short __w09, short __w08,
3800	short __w07, short __w06, short __w05, short __w04,
3801	short __w03, short __w02, short __w01, short __w00)
3802	{ /* The literal lists element 0 (bits [15:0]) first, so arguments appear reversed. */
3803	return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3804	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3805	}
3806
3807	/// Constructs a 256-bit integer vector initialized with the specified
3808	/// 8-bit integral values.
3809	///
3810	/// \headerfile <x86intrin.h>
3811	///
3812	/// This intrinsic is a utility function and does not correspond to a specific
3813	/// instruction.
3814	///
3815	/// \param __b31
3816	/// An 8-bit integral value used to initialize bits [255:248] of the result.
3817	/// \param __b30
3818	/// An 8-bit integral value used to initialize bits [247:240] of the result.
3819	/// \param __b29
3820	/// An 8-bit integral value used to initialize bits [239:232] of the result.
3821	/// \param __b28
3822	/// An 8-bit integral value used to initialize bits [231:224] of the result.
3823	/// \param __b27
3824	/// An 8-bit integral value used to initialize bits [223:216] of the result.
3825	/// \param __b26
3826	/// An 8-bit integral value used to initialize bits [215:208] of the result.
3827	/// \param __b25
3828	/// An 8-bit integral value used to initialize bits [207:200] of the result.
3829	/// \param __b24
3830	/// An 8-bit integral value used to initialize bits [199:192] of the result.
3831	/// \param __b23
3832	/// An 8-bit integral value used to initialize bits [191:184] of the result.
3833	/// \param __b22
3834	/// An 8-bit integral value used to initialize bits [183:176] of the result.
3835	/// \param __b21
3836	/// An 8-bit integral value used to initialize bits [175:168] of the result.
3837	/// \param __b20
3838	/// An 8-bit integral value used to initialize bits [167:160] of the result.
3839	/// \param __b19
3840	/// An 8-bit integral value used to initialize bits [159:152] of the result.
3841	/// \param __b18
3842	/// An 8-bit integral value used to initialize bits [151:144] of the result.
3843	/// \param __b17
3844	/// An 8-bit integral value used to initialize bits [143:136] of the result.
3845	/// \param __b16
3846	/// An 8-bit integral value used to initialize bits [135:128] of the result.
3847	/// \param __b15
3848	/// An 8-bit integral value used to initialize bits [127:120] of the result.
3849	/// \param __b14
3850	/// An 8-bit integral value used to initialize bits [119:112] of the result.
3851	/// \param __b13
3852	/// An 8-bit integral value used to initialize bits [111:104] of the result.
3853	/// \param __b12
3854	/// An 8-bit integral value used to initialize bits [103:96] of the result.
3855	/// \param __b11
3856	/// An 8-bit integral value used to initialize bits [95:88] of the result.
3857	/// \param __b10
3858	/// An 8-bit integral value used to initialize bits [87:80] of the result.
3859	/// \param __b09
3860	/// An 8-bit integral value used to initialize bits [79:72] of the result.
3861	/// \param __b08
3862	/// An 8-bit integral value used to initialize bits [71:64] of the result.
3863	/// \param __b07
3864	/// An 8-bit integral value used to initialize bits [63:56] of the result.
3865	/// \param __b06
3866	/// An 8-bit integral value used to initialize bits [55:48] of the result.
3867	/// \param __b05
3868	/// An 8-bit integral value used to initialize bits [47:40] of the result.
3869	/// \param __b04
3870	/// An 8-bit integral value used to initialize bits [39:32] of the result.
3871	/// \param __b03
3872	/// An 8-bit integral value used to initialize bits [31:24] of the result.
3873	/// \param __b02
3874	/// An 8-bit integral value used to initialize bits [23:16] of the result.
3875	/// \param __b01
3876	/// An 8-bit integral value used to initialize bits [15:8] of the result.
3877	/// \param __b00
3878	/// An 8-bit integral value used to initialize bits [7:0] of the result.
3879	/// \returns An initialized 256-bit integer vector.
3880	static __inline __m256i __DEFAULT_FN_ATTRS
3881	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3882	char __b27, char __b26, char __b25, char __b24,
3883	char __b23, char __b22, char __b21, char __b20,
3884	char __b19, char __b18, char __b17, char __b16,
3885	char __b15, char __b14, char __b13, char __b12,
3886	char __b11, char __b10, char __b09, char __b08,
3887	char __b07, char __b06, char __b05, char __b04,
3888	char __b03, char __b02, char __b01, char __b00)
3889	{ /* The literal lists element 0 (bits [7:0]) first, so arguments appear reversed. */
3890	return __extension__ (__m256i)(__v32qi){
3891	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3892	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3893	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3894	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3895	};
3896	}
3897
3898	/// Constructs a 256-bit integer vector initialized with the specified
3899	/// 64-bit integral values.
3900	///
3901	/// \headerfile <x86intrin.h>
3902	///
3903	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3904	/// instruction.
3905	///
3906	/// \param __a
3907	/// A 64-bit integral value used to initialize bits [255:192] of the result.
3908	/// \param __b
3909	/// A 64-bit integral value used to initialize bits [191:128] of the result.
3910	/// \param __c
3911	/// A 64-bit integral value used to initialize bits [127:64] of the result.
3912	/// \param __d
3913	/// A 64-bit integral value used to initialize bits [63:0] of the result.
3914	/// \returns An initialized 256-bit integer vector.
3915	static __inline __m256i __DEFAULT_FN_ATTRS
3916	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3917	{
3918	return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; /* element 0 (bits [63:0]) first */
3919	}
3920
3921	/* Create vectors with elements in reverse order */
3922	/// Constructs a 256-bit floating-point vector of [4 x double],
3923	/// initialized in reverse order with the specified double-precision
3924	/// floating-point values.
3925	///
3926	/// \headerfile <x86intrin.h>
3927	///
3928	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3929	/// instruction.
3930	///
3931	/// \param __a
3932	/// A double-precision floating-point value used to initialize bits [63:0]
3933	/// of the result.
3934	/// \param __b
3935	/// A double-precision floating-point value used to initialize bits [127:64]
3936	/// of the result.
3937	/// \param __c
3938	/// A double-precision floating-point value used to initialize bits [191:128]
3939	/// of the result.
3940	/// \param __d
3941	/// A double-precision floating-point value used to initialize bits [255:192]
3942	/// of the result.
3943	/// \returns An initialized 256-bit floating-point vector of [4 x double].
3944	static __inline __m256d __DEFAULT_FN_ATTRS
3945	_mm256_setr_pd(double __a, double __b, double __c, double __d)
3946	{
3947	return _mm256_set_pd(__d, __c, __b, __a); /* same values, argument order reversed */
3948	}
3949
3950	/// Constructs a 256-bit floating-point vector of [8 x float],
3951	/// initialized in reverse order with the specified single-precision
3952	/// floating-point values.
3953	///
3954	/// \headerfile <x86intrin.h>
3955	///
3956	/// This intrinsic is a utility function and does not correspond to a specific
3957	/// instruction.
3958	///
3959	/// \param __a
3960	/// A single-precision floating-point value used to initialize bits [31:0]
3961	/// of the result.
3962	/// \param __b
3963	/// A single-precision floating-point value used to initialize bits [63:32]
3964	/// of the result.
3965	/// \param __c
3966	/// A single-precision floating-point value used to initialize bits [95:64]
3967	/// of the result.
3968	/// \param __d
3969	/// A single-precision floating-point value used to initialize bits [127:96]
3970	/// of the result.
3971	/// \param __e
3972	/// A single-precision floating-point value used to initialize bits [159:128]
3973	/// of the result.
3974	/// \param __f
3975	/// A single-precision floating-point value used to initialize bits [191:160]
3976	/// of the result.
3977	/// \param __g
3978	/// A single-precision floating-point value used to initialize bits [223:192]
3979	/// of the result.
3980	/// \param __h
3981	/// A single-precision floating-point value used to initialize bits [255:224]
3982	/// of the result.
3983	/// \returns An initialized 256-bit floating-point vector of [8 x float].
3984	static __inline __m256 __DEFAULT_FN_ATTRS
3985	_mm256_setr_ps(float __a, float __b, float __c, float __d,
3986	float __e, float __f, float __g, float __h)
3987	{
3988	return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); /* argument order reversed */
3989	}
3990
3991	/// Constructs a 256-bit integer vector, initialized in reverse order
3992	/// with the specified 32-bit integral values.
3993	///
3994	/// \headerfile <x86intrin.h>
3995	///
3996	/// This intrinsic is a utility function and does not correspond to a specific
3997	/// instruction.
3998	///
3999	/// \param __i0
4000	/// A 32-bit integral value used to initialize bits [31:0] of the result.
4001	/// \param __i1
4002	/// A 32-bit integral value used to initialize bits [63:32] of the result.
4003	/// \param __i2
4004	/// A 32-bit integral value used to initialize bits [95:64] of the result.
4005	/// \param __i3
4006	/// A 32-bit integral value used to initialize bits [127:96] of the result.
4007	/// \param __i4
4008	/// A 32-bit integral value used to initialize bits [159:128] of the result.
4009	/// \param __i5
4010	/// A 32-bit integral value used to initialize bits [191:160] of the result.
4011	/// \param __i6
4012	/// A 32-bit integral value used to initialize bits [223:192] of the result.
4013	/// \param __i7
4014	/// A 32-bit integral value used to initialize bits [255:224] of the result.
4015	/// \returns An initialized 256-bit integer vector.
4016	static __inline __m256i __DEFAULT_FN_ATTRS
4017	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4018	int __i4, int __i5, int __i6, int __i7)
4019	{
4020	return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); /* order reversed */
4021	}
4022
4023	/// Constructs a 256-bit integer vector, initialized in reverse order
4024	/// with the specified 16-bit integral values.
4025	///
4026	/// \headerfile <x86intrin.h>
4027	///
4028	/// This intrinsic is a utility function and does not correspond to a specific
4029	/// instruction.
4030	///
4031	/// \param __w15
4032	/// A 16-bit integral value used to initialize bits [15:0] of the result.
4033	/// \param __w14
4034	/// A 16-bit integral value used to initialize bits [31:16] of the result.
4035	/// \param __w13
4036	/// A 16-bit integral value used to initialize bits [47:32] of the result.
4037	/// \param __w12
4038	/// A 16-bit integral value used to initialize bits [63:48] of the result.
4039	/// \param __w11
4040	/// A 16-bit integral value used to initialize bits [79:64] of the result.
4041	/// \param __w10
4042	/// A 16-bit integral value used to initialize bits [95:80] of the result.
4043	/// \param __w09
4044	/// A 16-bit integral value used to initialize bits [111:96] of the result.
4045	/// \param __w08
4046	/// A 16-bit integral value used to initialize bits [127:112] of the result.
4047	/// \param __w07
4048	/// A 16-bit integral value used to initialize bits [143:128] of the result.
4049	/// \param __w06
4050	/// A 16-bit integral value used to initialize bits [159:144] of the result.
4051	/// \param __w05
4052	/// A 16-bit integral value used to initialize bits [175:160] of the result.
4053	/// \param __w04
4054	/// A 16-bit integral value used to initialize bits [191:176] of the result.
4055	/// \param __w03
4056	/// A 16-bit integral value used to initialize bits [207:192] of the result.
4057	/// \param __w02
4058	/// A 16-bit integral value used to initialize bits [223:208] of the result.
4059	/// \param __w01
4060	/// A 16-bit integral value used to initialize bits [239:224] of the result.
4061	/// \param __w00
4062	/// A 16-bit integral value used to initialize bits [255:240] of the result.
4063	/// \returns An initialized 256-bit integer vector.
4064	static __inline __m256i __DEFAULT_FN_ATTRS
4065	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4066	short __w11, short __w10, short __w09, short __w08,
4067	short __w07, short __w06, short __w05, short __w04,
4068	short __w03, short __w02, short __w01, short __w00)
4069	{ /* Reverse the order: the first argument (__w15) becomes the lowest element. */
4070	return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4071	__w04, __w05, __w06, __w07,
4072	__w08, __w09, __w10, __w11,
4073	__w12, __w13, __w14, __w15);
4074	}
4075
4076	/// Constructs a 256-bit integer vector, initialized in reverse order
4077	/// with the specified 8-bit integral values.
4078	///
4079	/// \headerfile <x86intrin.h>
4080	///
4081	/// This intrinsic is a utility function and does not correspond to a specific
4082	/// instruction.
4083	///
4084	/// \param __b31
4085	/// An 8-bit integral value used to initialize bits [7:0] of the result.
4086	/// \param __b30
4087	/// An 8-bit integral value used to initialize bits [15:8] of the result.
4088	/// \param __b29
4089	/// An 8-bit integral value used to initialize bits [23:16] of the result.
4090	/// \param __b28
4091	/// An 8-bit integral value used to initialize bits [31:24] of the result.
4092	/// \param __b27
4093	/// An 8-bit integral value used to initialize bits [39:32] of the result.
4094	/// \param __b26
4095	/// An 8-bit integral value used to initialize bits [47:40] of the result.
4096	/// \param __b25
4097	/// An 8-bit integral value used to initialize bits [55:48] of the result.
4098	/// \param __b24
4099	/// An 8-bit integral value used to initialize bits [63:56] of the result.
4100	/// \param __b23
4101	/// An 8-bit integral value used to initialize bits [71:64] of the result.
4102	/// \param __b22
4103	/// An 8-bit integral value used to initialize bits [79:72] of the result.
4104	/// \param __b21
4105	/// An 8-bit integral value used to initialize bits [87:80] of the result.
4106	/// \param __b20
4107	/// An 8-bit integral value used to initialize bits [95:88] of the result.
4108	/// \param __b19
4109	/// An 8-bit integral value used to initialize bits [103:96] of the result.
4110	/// \param __b18
4111	/// An 8-bit integral value used to initialize bits [111:104] of the result.
4112	/// \param __b17
4113	/// An 8-bit integral value used to initialize bits [119:112] of the result.
4114	/// \param __b16
4115	/// An 8-bit integral value used to initialize bits [127:120] of the result.
4116	/// \param __b15
4117	/// An 8-bit integral value used to initialize bits [135:128] of the result.
4118	/// \param __b14
4119	/// An 8-bit integral value used to initialize bits [143:136] of the result.
4120	/// \param __b13
4121	/// An 8-bit integral value used to initialize bits [151:144] of the result.
4122	/// \param __b12
4123	/// An 8-bit integral value used to initialize bits [159:152] of the result.
4124	/// \param __b11
4125	/// An 8-bit integral value used to initialize bits [167:160] of the result.
4126	/// \param __b10
4127	/// An 8-bit integral value used to initialize bits [175:168] of the result.
4128	/// \param __b09
4129	/// An 8-bit integral value used to initialize bits [183:176] of the result.
4130	/// \param __b08
4131	/// An 8-bit integral value used to initialize bits [191:184] of the result.
4132	/// \param __b07
4133	/// An 8-bit integral value used to initialize bits [199:192] of the result.
4134	/// \param __b06
4135	/// An 8-bit integral value used to initialize bits [207:200] of the result.
4136	/// \param __b05
4137	/// An 8-bit integral value used to initialize bits [215:208] of the result.
4138	/// \param __b04
4139	/// An 8-bit integral value used to initialize bits [223:216] of the result.
4140	/// \param __b03
4141	/// An 8-bit integral value used to initialize bits [231:224] of the result.
4142	/// \param __b02
4143	/// An 8-bit integral value used to initialize bits [239:232] of the result.
4144	/// \param __b01
4145	/// An 8-bit integral value used to initialize bits [247:240] of the result.
4146	/// \param __b00
4147	/// An 8-bit integral value used to initialize bits [255:248] of the result.
4148	/// \returns An initialized 256-bit integer vector.
4149	static __inline __m256i __DEFAULT_FN_ATTRS
4150	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4151	char __b27, char __b26, char __b25, char __b24,
4152	char __b23, char __b22, char __b21, char __b20,
4153	char __b19, char __b18, char __b17, char __b16,
4154	char __b15, char __b14, char __b13, char __b12,
4155	char __b11, char __b10, char __b09, char __b08,
4156	char __b07, char __b06, char __b05, char __b04,
4157	char __b03, char __b02, char __b01, char __b00)
4158	{ /* Reverse the order: the first argument (__b31) becomes the lowest element. */
4159	return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4160	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4161	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4162	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4163	}
4164
4165	/// Constructs a 256-bit integer vector, initialized in reverse order
4166	/// with the specified 64-bit integral values.
4167	///
4168	/// \headerfile <x86intrin.h>
4169	///
4170	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4171	/// instruction.
4172	///
4173	/// \param __a
4174	/// A 64-bit integral value used to initialize bits [63:0] of the result.
4175	/// \param __b
4176	/// A 64-bit integral value used to initialize bits [127:64] of the result.
4177	/// \param __c
4178	/// A 64-bit integral value used to initialize bits [191:128] of the result.
4179	/// \param __d
4180	/// A 64-bit integral value used to initialize bits [255:192] of the result.
4181	/// \returns An initialized 256-bit integer vector.
4182	static __inline __m256i __DEFAULT_FN_ATTRS
4183	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4184	{
4185	return _mm256_set_epi64x(__d, __c, __b, __a); /* __a ends up in bits [63:0] */
4186	}
4187
4188	/* Create vectors with repeated elements */
4189	/// Constructs a 256-bit floating-point vector of [4 x double], with each
4190	/// of the four double-precision floating-point vector elements set to the
4191	/// specified double-precision floating-point value.
4192	///
4193	/// \headerfile <x86intrin.h>
4194	///
4195	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4196	///
4197	/// \param __w
4198	/// A double-precision floating-point value used to initialize each vector
4199	/// element of the result.
4200	/// \returns An initialized 256-bit floating-point vector of [4 x double].
4201	static __inline __m256d __DEFAULT_FN_ATTRS
4202	_mm256_set1_pd(double __w)
4203	{
4204	return _mm256_set_pd(__w, __w, __w, __w); /* broadcast: same value in all four positions */
4205	}
4206
4207	/// Constructs a 256-bit floating-point vector of [8 x float], with each
4208	/// of the eight single-precision floating-point vector elements set to the
4209	/// specified single-precision floating-point value.
4210	///
4211	/// \headerfile <x86intrin.h>
4212	///
4213	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4214	/// instruction.
4215	///
4216	/// \param __w
4217	/// A single-precision floating-point value used to initialize each vector
4218	/// element of the result.
4219	/// \returns An initialized 256-bit floating-point vector of [8 x float].
4220	static __inline __m256 __DEFAULT_FN_ATTRS
4221	_mm256_set1_ps(float __w)
4222	{
4223	return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); /* broadcast to all eight */
4224	}
4225
4226	/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4227	/// 32-bit integral vector elements set to the specified 32-bit integral
4228	/// value.
4229	///
4230	/// \headerfile <x86intrin.h>
4231	///
4232	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4233	/// instruction.
4234	///
4235	/// \param __i
4236	/// A 32-bit integral value used to initialize each vector element of the
4237	/// result.
4238	/// \returns An initialized 256-bit integer vector of [8 x i32].
4239	static __inline __m256i __DEFAULT_FN_ATTRS
4240	_mm256_set1_epi32(int __i)
4241	{
4242	return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); /* broadcast to all eight */
4243	}
4244
4245	/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4246	/// 16-bit integral vector elements set to the specified 16-bit integral
4247	/// value.
4248	///
4249	/// \headerfile <x86intrin.h>
4250	///
4251	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4252	///
4253	/// \param __w
4254	/// A 16-bit integral value used to initialize each vector element of the
4255	/// result.
4256	/// \returns An initialized 256-bit integer vector of [16 x i16].
4257	static __inline __m256i __DEFAULT_FN_ATTRS
4258	_mm256_set1_epi16(short __w)
4259	{ /* Broadcast: __w is passed for each of the 16 element positions. */
4260	return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4261	__w, __w, __w, __w, __w, __w, __w, __w);
4262	}
4263
4264	/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4265	/// 8-bit integral vector elements set to the specified 8-bit integral value.
4266	///
4267	/// \headerfile <x86intrin.h>
4268	///
4269	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4270	///
4271	/// \param __b
4272	/// An 8-bit integral value used to initialize each vector element of the
4273	/// result.
4274	/// \returns An initialized 256-bit integer vector of [32 x i8].
4275	static __inline __m256i __DEFAULT_FN_ATTRS
4276	_mm256_set1_epi8(char __b)
4277	{ /* Broadcast: __b is passed for each of the 32 element positions. */
4278	return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4279	__b, __b, __b, __b, __b, __b, __b, __b,
4280	__b, __b, __b, __b, __b, __b, __b, __b,
4281	__b, __b, __b, __b, __b, __b, __b, __b);
4282	}
4283
4284	/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4285	/// 64-bit integral vector elements set to the specified 64-bit integral
4286	/// value.
4287	///
4288	/// \headerfile <x86intrin.h>
4289	///
4290	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4291	///
4292	/// \param __q
4293	/// A 64-bit integral value used to initialize each vector element of the
4294	/// result.
4295	/// \returns An initialized 256-bit integer vector of [4 x i64].
4296	static __inline __m256i __DEFAULT_FN_ATTRS
4297	_mm256_set1_epi64x(long long __q)
4298	{
4299	return _mm256_set_epi64x(__q, __q, __q, __q); /* broadcast: same value in all four positions */
4300	}
4301
4302	/* Create zeroed vectors */
4303	/// Constructs a 256-bit floating-point vector of [4 x double] with all
4304	/// vector elements initialized to zero.
4305	///
4306	/// \headerfile <x86intrin.h>
4307	///
4308	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4309	///
4310	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4311	static __inline __m256d __DEFAULT_FN_ATTRS
4312	_mm256_setzero_pd(void)
4313	{
4314	return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; /* one 0.0 per double element */
4315	}
4316
4317	/// Constructs a 256-bit floating-point vector of [8 x float] with all
4318	/// vector elements initialized to zero.
4319	///
4320	/// \headerfile <x86intrin.h>
4321	///
4322	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4323	///
4324	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4325	static __inline __m256 __DEFAULT_FN_ATTRS
4326	_mm256_setzero_ps(void)
4327	{ /* One 0.0f per float element. */
4328	return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4329	}
4330
4331	/// Constructs a 256-bit integer vector initialized to zero.
4332	///
4333	/// \headerfile <x86intrin.h>
4334	///
4335	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4336	///
4337	/// \returns A 256-bit integer vector initialized to zero.
4338	static __inline __m256i __DEFAULT_FN_ATTRS
4339	_mm256_setzero_si256(void)
4340	{
4341	return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; /* four zero 64-bit elements = 256 zero bits */
4342	}
4343
4344	/* Cast between vector types */
4345	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4346	/// floating-point vector of [8 x float].
4347	///
4348	/// \headerfile <x86intrin.h>
4349	///
4350	/// This intrinsic has no corresponding instruction.
4351	///
4352	/// \param __a
4353	/// A 256-bit floating-point vector of [4 x double].
4354	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4355	/// bitwise pattern as the parameter.
4356	static __inline __m256 __DEFAULT_FN_ATTRS
4357	_mm256_castpd_ps(__m256d __a)
4358	{
4359	return (__m256)__a; /* reinterprets the bits; no double-to-float conversion */
4360	}
4361
4362	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4363	/// integer vector.
4364	///
4365	/// \headerfile <x86intrin.h>
4366	///
4367	/// This intrinsic has no corresponding instruction.
4368	///
4369	/// \param __a
4370	/// A 256-bit floating-point vector of [4 x double].
4371	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4372	/// parameter.
4373	static __inline __m256i __DEFAULT_FN_ATTRS
4374	_mm256_castpd_si256(__m256d __a)
4375	{
4376	return (__m256i)__a; /* reinterprets the bits; no double-to-integer conversion */
4377	}
4378
4379	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4380	/// floating-point vector of [4 x double].
4381	///
4382	/// \headerfile <x86intrin.h>
4383	///
4384	/// This intrinsic has no corresponding instruction.
4385	///
4386	/// \param __a
4387	/// A 256-bit floating-point vector of [8 x float].
4388	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4389	/// bitwise pattern as the parameter.
4390	static __inline __m256d __DEFAULT_FN_ATTRS
4391	_mm256_castps_pd(__m256 __a)
4392	{
4393	return (__m256d)__a; /* reinterprets the bits; no float-to-double conversion */
4394	}
4395
4396	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4397	/// integer vector.
4398	///
4399	/// \headerfile <x86intrin.h>
4400	///
4401	/// This intrinsic has no corresponding instruction.
4402	///
4403	/// \param __a
4404	/// A 256-bit floating-point vector of [8 x float].
4405	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4406	/// parameter.
4407	static __inline __m256i __DEFAULT_FN_ATTRS
4408	_mm256_castps_si256(__m256 __a)
4409	{
4410	return (__m256i)__a; /* reinterprets the bits; no float-to-integer conversion */
4411	}
4412
4413	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4414	/// of [8 x float].
4415	///
4416	/// \headerfile <x86intrin.h>
4417	///
4418	/// This intrinsic has no corresponding instruction.
4419	///
4420	/// \param __a
4421	/// A 256-bit integer vector.
4422	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4423	/// bitwise pattern as the parameter.
4424	static __inline __m256 __DEFAULT_FN_ATTRS
4425	_mm256_castsi256_ps(__m256i __a)
4426	{
4427	return (__m256)__a; /* reinterprets the bits; no integer-to-float conversion */
4428	}
4429
4430	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4431	/// of [4 x double].
4432	///
4433	/// \headerfile <x86intrin.h>
4434	///
4435	/// This intrinsic has no corresponding instruction.
4436	///
4437	/// \param __a
4438	/// A 256-bit integer vector.
4439	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4440	/// bitwise pattern as the parameter.
4441	static __inline __m256d __DEFAULT_FN_ATTRS
4442	_mm256_castsi256_pd(__m256i __a)
4443	{
4444	return (__m256d)__a; /* reinterprets the bits; no integer-to-double conversion */
4445	}
4446
4447	/// Returns the lower 128 bits of a 256-bit floating-point vector of
4448	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4449	///
4450	/// \headerfile <x86intrin.h>
4451	///
4452	/// This intrinsic has no corresponding instruction.
4453	///
4454	/// \param __a
4455	/// A 256-bit floating-point vector of [4 x double].
4456	/// \returns A 128-bit floating-point vector of [2 x double] containing the
4457	/// lower 128 bits of the parameter.
4458	static __inline __m128d __DEFAULT_FN_ATTRS
4459	_mm256_castpd256_pd128(__m256d __a)
4460	{
4461	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); /* keep elements 0 and 1 */
4462	}
4463
4464	/// Returns the lower 128 bits of a 256-bit floating-point vector of
4465	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4466	///
4467	/// \headerfile <x86intrin.h>
4468	///
4469	/// This intrinsic has no corresponding instruction.
4470	///
4471	/// \param __a
4472	/// A 256-bit floating-point vector of [8 x float].
4473	/// \returns A 128-bit floating-point vector of [4 x float] containing the
4474	/// lower 128 bits of the parameter.
4475	static __inline __m128 __DEFAULT_FN_ATTRS
4476	_mm256_castps256_ps128(__m256 __a)
4477	{
4478	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); /* keep elements 0-3 */
4479	}
4480
4481	/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4482	///
4483	/// \headerfile <x86intrin.h>
4484	///
4485	/// This intrinsic has no corresponding instruction.
4486	///
4487	/// \param __a
4488	/// A 256-bit integer vector.
4489	/// \returns A 128-bit integer vector containing the lower 128 bits of the
4490	/// parameter.
4491	static __inline __m128i __DEFAULT_FN_ATTRS
4492	_mm256_castsi256_si128(__m256i __a)
4493	{
4494	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); /* keep 64-bit elements 0 and 1 */
4495	}
4496
4497	/// Constructs a 256-bit floating-point vector of [4 x double] from a
4498	/// 128-bit floating-point vector of [2 x double].
4499	///
4500	/// The lower 128 bits contain the value of the source vector. The contents
4501	/// of the upper 128 bits are undefined.
4502	///
4503	/// \headerfile <x86intrin.h>
4504	///
4505	/// This intrinsic has no corresponding instruction.
4506	///
4507	/// \param __a
4508	/// A 128-bit vector of [2 x double].
4509	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4510	/// contain the value of the parameter. The contents of the upper 128 bits
4511	/// are undefined.
4512	static __inline __m256d __DEFAULT_FN_ATTRS
4513	_mm256_castpd128_pd256(__m128d __a)
4514	{ /* Indices 0-1 select __a; indices 2-3 select the unspecified second operand. */
4515	return __builtin_shufflevector(
4516	(__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4517	}
4518
4519	/// Constructs a 256-bit floating-point vector of [8 x float] from a
4520	/// 128-bit floating-point vector of [4 x float].
4521	///
4522	/// The lower 128 bits contain the value of the source vector. The contents
4523	/// of the upper 128 bits are undefined.
4524	///
4525	/// \headerfile <x86intrin.h>
4526	///
4527	/// This intrinsic has no corresponding instruction.
4528	///
4529	/// \param __a
4530	/// A 128-bit vector of [4 x float].
4531	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4532	/// contain the value of the parameter. The contents of the upper 128 bits
4533	/// are undefined.
4534	static __inline __m256 __DEFAULT_FN_ATTRS
4535	_mm256_castps128_ps256(__m128 __a)
4536	{ /* Indices 0-3 select __a; indices 4-7 select the unspecified second operand. */
4537	return __builtin_shufflevector((__v4sf)__a,
4538	(__v4sf)__builtin_nondeterministic_value(__a),
4539	0, 1, 2, 3, 4, 5, 6, 7);
4540	}
4541
4542	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4543	///
4544	/// The lower 128 bits contain the value of the source vector. The contents
4545	/// of the upper 128 bits are undefined.
4546	///
4547	/// \headerfile <x86intrin.h>
4548	///
4549	/// This intrinsic has no corresponding instruction.
4550	///
4551	/// \param __a
4552	/// A 128-bit integer vector.
4553	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4554	/// the parameter. The contents of the upper 128 bits are undefined.
4555	static __inline __m256i __DEFAULT_FN_ATTRS
4556	_mm256_castsi128_si256(__m128i __a)
4557	{ /* Indices 0-1 select __a; indices 2-3 select the unspecified second operand. */
4558	return __builtin_shufflevector(
4559	(__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4560	}
4561
4562	/// Constructs a 256-bit floating-point vector of [4 x double] from a
4563	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4564	/// contain the value of the source vector. The upper 128 bits are set
4565	/// to zero.
4566	///
4567	/// \headerfile <x86intrin.h>
4568	///
4569	/// This intrinsic has no corresponding instruction.
4570	///
4571	/// \param __a
4572	/// A 128-bit vector of [2 x double].
4573	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4574	/// contain the value of the parameter. The upper 128 bits are set to zero.
4575	static __inline __m256d __DEFAULT_FN_ATTRS
4576	_mm256_zextpd128_pd256(__m128d __a)
4577	{ /* Indices 0-1 select __a; indices 2-3 select the zero vector. */
4578	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4579	}
4580
4581	/// Constructs a 256-bit floating-point vector of [8 x float] from a
4582	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4583	/// the value of the source vector. The upper 128 bits are set to zero.
4584	///
4585	/// \headerfile <x86intrin.h>
4586	///
4587	/// This intrinsic has no corresponding instruction.
4588	///
4589	/// \param __a
4590	/// A 128-bit vector of [4 x float].
4591	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4592	/// contain the value of the parameter. The upper 128 bits are set to zero.
4593	static __inline __m256 __DEFAULT_FN_ATTRS
4594	_mm256_zextps128_ps256(__m128 __a)
4595	{
4596	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4597	}
4598
4599	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4600	/// The lower 128 bits contain the value of the source vector. The upper
4601	/// 128 bits are set to zero.
4602	///
4603	/// \headerfile <x86intrin.h>
4604	///
4605	/// This intrinsic has no corresponding instruction.
4606	///
4607	/// \param __a
4608	/// A 128-bit integer vector.
4609	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4610	/// the parameter. The upper 128 bits are set to zero.
4611	static __inline __m256i __DEFAULT_FN_ATTRS
4612	_mm256_zextsi128_si256(__m128i __a)
4613	{
4614	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4615	}
4616
4617	/*
4618	Vector insert.
4619	We use macros rather than inlines because we only want to accept
4620	invocations where the immediate M is a constant expression.
4621	*/
/// Produces a 256-bit vector of [8 x float] that is a copy of the first
///    operand with one of its 128-bit halves overwritten by a 128-bit vector
///    of [4 x float] taken from the second operand.
///
///    The immediate integer operand chooses whether the lower or the upper
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. It supplies the 128-bit half of the
///    result that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [4 x float]. It is written to the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    the result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256( \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4659
/// Produces a 256-bit vector of [4 x double] that is a copy of the first
///    operand with one of its 128-bit halves overwritten by a 128-bit vector
///    of [2 x double] taken from the second operand.
///
///    The immediate integer operand chooses whether the lower or the upper
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. It supplies the 128-bit half of the
///    result that is not overwritten by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    the result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256( \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4697
/// Produces a 256-bit integer vector that is a copy of the first operand
///    with one of its 128-bit halves overwritten by a 128-bit integer vector
///    taken from the second operand.
///
///    The immediate integer operand chooses whether the lower or the upper
///    128 bits are overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. It supplies the 128-bit half of the result
///    that is not overwritten by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to the lower or the upper 128
///    bits of the result, as selected by \a M.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    the result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256( \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4735
4736	/*
4737	Vector extract.
4738	We use macros rather than inlines because we only want to accept
4739	invocations where the immediate M is a constant expression.
4740	*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
///    vector of [4 x float]. The immediate integer operand chooses between
///    the lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float] to extract from.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), (int)(M)))
4764
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a 128-bit
///    vector of [2 x double]. The immediate integer operand chooses between
///    the lower and the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] to extract from.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), (int)(M)))
4788
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit integer
///    vector. The immediate integer operand chooses between the lower and
///    the upper half.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector to extract from.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    \a V that is returned: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256( \
      (__v8si)(__m256i)(V), (int)(M)))
4812
4813	/// Constructs a 256-bit floating-point vector of [8 x float] by
4814	/// concatenating two 128-bit floating-point vectors of [4 x float].
4815	///
4816	/// \headerfile <x86intrin.h>
4817	///
4818	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4819	///
4820	/// \param __hi
4821	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4822	/// 128 bits of the result.
4823	/// \param __lo
4824	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4825	/// 128 bits of the result.
4826	/// \returns A 256-bit floating-point vector of [8 x float] containing the
4827	/// concatenated result.
4828	static __inline __m256 __DEFAULT_FN_ATTRS
4829	_mm256_set_m128 (__m128 __hi, __m128 __lo)
4830	{
4831	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4832	}
4833
4834	/// Constructs a 256-bit floating-point vector of [4 x double] by
4835	/// concatenating two 128-bit floating-point vectors of [2 x double].
4836	///
4837	/// \headerfile <x86intrin.h>
4838	///
4839	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4840	///
4841	/// \param __hi
4842	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4843	/// 128 bits of the result.
4844	/// \param __lo
4845	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4846	/// 128 bits of the result.
4847	/// \returns A 256-bit floating-point vector of [4 x double] containing the
4848	/// concatenated result.
4849	static __inline __m256d __DEFAULT_FN_ATTRS
4850	_mm256_set_m128d (__m128d __hi, __m128d __lo)
4851	{
4852	return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4853	}
4854
4855	/// Constructs a 256-bit integer vector by concatenating two 128-bit
4856	/// integer vectors.
4857	///
4858	/// \headerfile <x86intrin.h>
4859	///
4860	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4861	///
4862	/// \param __hi
4863	/// A 128-bit integer vector to be copied to the upper 128 bits of the
4864	/// result.
4865	/// \param __lo
4866	/// A 128-bit integer vector to be copied to the lower 128 bits of the
4867	/// result.
4868	/// \returns A 256-bit integer vector containing the concatenated result.
4869	static __inline __m256i __DEFAULT_FN_ATTRS
4870	_mm256_set_m128i (__m128i __hi, __m128i __lo)
4871	{
4872	return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4873	}
4874
4875	/// Constructs a 256-bit floating-point vector of [8 x float] by
4876	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4877	/// similar to _mm256_set_m128, but the order of the input parameters is
4878	/// swapped.
4879	///
4880	/// \headerfile <x86intrin.h>
4881	///
4882	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4883	///
4884	/// \param __lo
4885	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4886	/// 128 bits of the result.
4887	/// \param __hi
4888	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4889	/// 128 bits of the result.
4890	/// \returns A 256-bit floating-point vector of [8 x float] containing the
4891	/// concatenated result.
4892	static __inline __m256 __DEFAULT_FN_ATTRS
4893	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4894	{
4895	return _mm256_set_m128(__hi, __lo);
4896	}
4897
4898	/// Constructs a 256-bit floating-point vector of [4 x double] by
4899	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4900	/// similar to _mm256_set_m128d, but the order of the input parameters is
4901	/// swapped.
4902	///
4903	/// \headerfile <x86intrin.h>
4904	///
4905	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4906	///
4907	/// \param __lo
4908	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4909	/// 128 bits of the result.
4910	/// \param __hi
4911	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4912	/// 128 bits of the result.
4913	/// \returns A 256-bit floating-point vector of [4 x double] containing the
4914	/// concatenated result.
4915	static __inline __m256d __DEFAULT_FN_ATTRS
4916	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4917	{
4918	return (__m256d)_mm256_set_m128d(__hi, __lo);
4919	}
4920
4921	/// Constructs a 256-bit integer vector by concatenating two 128-bit
4922	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4923	/// the input parameters is swapped.
4924	///
4925	/// \headerfile <x86intrin.h>
4926	///
4927	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4928	///
4929	/// \param __lo
4930	/// A 128-bit integer vector to be copied to the lower 128 bits of the
4931	/// result.
4932	/// \param __hi
4933	/// A 128-bit integer vector to be copied to the upper 128 bits of the
4934	/// result.
4935	/// \returns A 256-bit integer vector containing the concatenated result.
4936	static __inline __m256i __DEFAULT_FN_ATTRS
4937	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4938	{
4939	return (__m256i)_mm256_set_m128i(__hi, __lo);
4940	}
4941
4942	/* SIMD load ops (unaligned) */
4943	/// Loads two 128-bit floating-point vectors of [4 x float] from
4944	/// unaligned memory locations and constructs a 256-bit floating-point vector
4945	/// of [8 x float] by concatenating the two 128-bit vectors.
4946	///
4947	/// \headerfile <x86intrin.h>
4948	///
4949	/// This intrinsic corresponds to load instructions followed by the
4950	/// <c> VINSERTF128 </c> instruction.
4951	///
4952	/// \param __addr_hi
4953	/// A pointer to a 128-bit memory location containing 4 consecutive
4954	/// single-precision floating-point values. These values are to be copied to
4955	/// bits[255:128] of the result. The address of the memory location does not
4956	/// have to be aligned.
4957	/// \param __addr_lo
4958	/// A pointer to a 128-bit memory location containing 4 consecutive
4959	/// single-precision floating-point values. These values are to be copied to
4960	/// bits[127:0] of the result. The address of the memory location does not
4961	/// have to be aligned.
4962	/// \returns A 256-bit floating-point vector of [8 x float] containing the
4963	/// concatenated result.
4964	static __inline __m256 __DEFAULT_FN_ATTRS
4965	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
4966	{
4967	return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4968	}
4969
4970	/// Loads two 128-bit floating-point vectors of [2 x double] from
4971	/// unaligned memory locations and constructs a 256-bit floating-point vector
4972	/// of [4 x double] by concatenating the two 128-bit vectors.
4973	///
4974	/// \headerfile <x86intrin.h>
4975	///
4976	/// This intrinsic corresponds to load instructions followed by the
4977	/// <c> VINSERTF128 </c> instruction.
4978	///
4979	/// \param __addr_hi
4980	/// A pointer to a 128-bit memory location containing two consecutive
4981	/// double-precision floating-point values. These values are to be copied to
4982	/// bits[255:128] of the result. The address of the memory location does not
4983	/// have to be aligned.
4984	/// \param __addr_lo
4985	/// A pointer to a 128-bit memory location containing two consecutive
4986	/// double-precision floating-point values. These values are to be copied to
4987	/// bits[127:0] of the result. The address of the memory location does not
4988	/// have to be aligned.
4989	/// \returns A 256-bit floating-point vector of [4 x double] containing the
4990	/// concatenated result.
4991	static __inline __m256d __DEFAULT_FN_ATTRS
4992	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
4993	{
4994	return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4995	}
4996
4997	/// Loads two 128-bit integer vectors from unaligned memory locations and
4998	/// constructs a 256-bit integer vector by concatenating the two 128-bit
4999	/// vectors.
5000	///
5001	/// \headerfile <x86intrin.h>
5002	///
5003	/// This intrinsic corresponds to load instructions followed by the
5004	/// <c> VINSERTF128 </c> instruction.
5005	///
5006	/// \param __addr_hi
5007	/// A pointer to a 128-bit memory location containing a 128-bit integer
5008	/// vector. This vector is to be copied to bits[255:128] of the result. The
5009	/// address of the memory location does not have to be aligned.
5010	/// \param __addr_lo
5011	/// A pointer to a 128-bit memory location containing a 128-bit integer
5012	/// vector. This vector is to be copied to bits[127:0] of the result. The
5013	/// address of the memory location does not have to be aligned.
5014	/// \returns A 256-bit integer vector containing the concatenated result.
5015	static __inline __m256i __DEFAULT_FN_ATTRS
5016	_mm256_loadu2_m128i(__m128i_u const __addr_hi, __m128i_u const __addr_lo)
5017	{
5018	return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5019	}
5020
5021	/* SIMD store ops (unaligned) */
5022	/// Stores the upper and lower 128 bits of a 256-bit floating-point
5023	/// vector of [8 x float] into two different unaligned memory locations.
5024	///
5025	/// \headerfile <x86intrin.h>
5026	///
5027	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5028	/// store instructions.
5029	///
5030	/// \param __addr_hi
5031	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5032	/// copied to this memory location. The address of this memory location does
5033	/// not have to be aligned.
5034	/// \param __addr_lo
5035	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5036	/// copied to this memory location. The address of this memory location does
5037	/// not have to be aligned.
5038	/// \param __a
5039	/// A 256-bit floating-point vector of [8 x float].
5040	static __inline void __DEFAULT_FN_ATTRS
5041	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
5042	{
5043	__m128 __v128;
5044
5045	__v128 = _mm256_castps256_ps128(__a);
5046	_mm_storeu_ps(__addr_lo, __v128);
5047	__v128 = _mm256_extractf128_ps(__a, 1);
5048	_mm_storeu_ps(__addr_hi, __v128);
5049	}
5050
5051	/// Stores the upper and lower 128 bits of a 256-bit floating-point
5052	/// vector of [4 x double] into two different unaligned memory locations.
5053	///
5054	/// \headerfile <x86intrin.h>
5055	///
5056	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5057	/// store instructions.
5058	///
5059	/// \param __addr_hi
5060	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5061	/// copied to this memory location. The address of this memory location does
5062	/// not have to be aligned.
5063	/// \param __addr_lo
5064	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5065	/// copied to this memory location. The address of this memory location does
5066	/// not have to be aligned.
5067	/// \param __a
5068	/// A 256-bit floating-point vector of [4 x double].
5069	static __inline void __DEFAULT_FN_ATTRS
5070	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
5071	{
5072	__m128d __v128;
5073
5074	__v128 = _mm256_castpd256_pd128(__a);
5075	_mm_storeu_pd(__addr_lo, __v128);
5076	__v128 = _mm256_extractf128_pd(__a, 1);
5077	_mm_storeu_pd(__addr_hi, __v128);
5078	}
5079
5080	/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5081	/// two different unaligned memory locations.
5082	///
5083	/// \headerfile <x86intrin.h>
5084	///
5085	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5086	/// store instructions.
5087	///
5088	/// \param __addr_hi
5089	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5090	/// copied to this memory location. The address of this memory location does
5091	/// not have to be aligned.
5092	/// \param __addr_lo
5093	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5094	/// copied to this memory location. The address of this memory location does
5095	/// not have to be aligned.
5096	/// \param __a
5097	/// A 256-bit integer vector.
5098	static __inline void __DEFAULT_FN_ATTRS
5099	_mm256_storeu2_m128i(__m128i_u __addr_hi, __m128i_u __addr_lo, __m256i __a)
5100	{
5101	__m128i __v128;
5102
5103	__v128 = _mm256_castsi256_si128(__a);
5104	_mm_storeu_si128(__addr_lo, __v128);
5105	__v128 = _mm256_extractf128_si256(__a, 1);
5106	_mm_storeu_si128(__addr_hi, __v128);
5107	}
5108
5109	#undef __DEFAULT_FN_ATTRS
5110	#undef __DEFAULT_FN_ATTRS128
5111
5112	#endif /* __AVXINTRIN_H */
5113

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avxintrin.h