avx2intrin.h source code [clang/lib/Headers/avx2intrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9
10	#ifndef __IMMINTRIN_H
11	#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12	#endif
13
14	#ifndef __AVX2INTRIN_H
15	#define __AVX2INTRIN_H
16
/* Default attribute sets for the functions defined in this file. The two
   variants differ only in the minimum vector width they request
   (256 bits vs. 128 bits). */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx2,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, \
                 __nodebug__, \
                 __target__("avx2,no-evex512"), \
                 __min_vector_width__(128)))
24
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and
/// \a Y.
///
/// Eight SAD results are computed using the lower half of the input
/// vectors, and another eight using the upper half. These 16-bit values
/// are returned in the lower and upper halves of the 256-bit result,
/// respectively.
///
/// A single SAD operation selects four bytes from \a X and four bytes from
/// \a Y as input. It computes the differences between each \a X byte and
/// the corresponding \a Y byte, takes the absolute value of each
/// difference, and sums these four values to form one 16-bit result. The
/// intrinsic computes 16 of these results with different sets of input
/// bytes.
///
/// For each set of eight results, the SAD operations use the same four
/// bytes from \a Y; the starting bit position for these four bytes is
/// specified by a two-bit field of \a M times 32 (\a M[1:0] for the lower
/// half, \a M[4:3] for the upper half). The eight operations use successive
/// sets of four bytes from \a X; the starting bit position for the first
/// set of four bytes is specified by a single bit of \a M times 32
/// (\a M[2] for the lower half, \a M[5] for the upper half). These bit
/// positions are all relative to the 128-bit lane for each set of eight
/// operations.
///
/// \code{.operation}
/// r := 0
/// FOR i := 0 TO 1
///   j := i*3
///   Ybase := M[j+1:j]*32 + i*128
///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
///     Xbase := Xbase + 8
///     r := r + 16
///   ENDFOR
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VMPSADBW instruction.
///
/// \param X
///    A 256-bit integer vector containing one of the inputs.
/// \param Y
///    A 256-bit integer vector containing one of the inputs.
/// \param M
///    An unsigned immediate value specifying the starting positions of the
///    bytes to operate on.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
                                      (__v32qi)(__m256i)(Y), \
                                      (int)(M)))
86
87	/// Computes the absolute value of each signed byte in the 256-bit integer
88	/// vector \a __a and returns each value in the corresponding byte of
89	/// the result.
90	///
91	/// \headerfile <immintrin.h>
92	///
93	/// This intrinsic corresponds to the \c VPABSB instruction.
94	///
95	/// \param __a
96	/// A 256-bit integer vector.
97	/// \returns A 256-bit integer vector containing the result.
98	static __inline__ __m256i __DEFAULT_FN_ATTRS256
99	_mm256_abs_epi8(__m256i __a)
100	{
101	return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
102	}
103
104	/// Computes the absolute value of each signed 16-bit element in the 256-bit
105	/// vector of [16 x i16] in \a __a and returns each value in the
106	/// corresponding element of the result.
107	///
108	/// \headerfile <immintrin.h>
109	///
110	/// This intrinsic corresponds to the \c VPABSW instruction.
111	///
112	/// \param __a
113	/// A 256-bit vector of [16 x i16].
114	/// \returns A 256-bit vector of [16 x i16] containing the result.
115	static __inline__ __m256i __DEFAULT_FN_ATTRS256
116	_mm256_abs_epi16(__m256i __a)
117	{
118	return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
119	}
120
121	/// Computes the absolute value of each signed 32-bit element in the 256-bit
122	/// vector of [8 x i32] in \a __a and returns each value in the
123	/// corresponding element of the result.
124	///
125	/// \headerfile <immintrin.h>
126	///
127	/// This intrinsic corresponds to the \c VPABSD instruction.
128	///
129	/// \param __a
130	/// A 256-bit vector of [8 x i32].
131	/// \returns A 256-bit vector of [8 x i32] containing the result.
132	static __inline__ __m256i __DEFAULT_FN_ATTRS256
133	_mm256_abs_epi32(__m256i __a)
134	{
135	return (__m256i)__builtin_elementwise_abs((__v8si)__a);
136	}
137
138	/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
139	/// integers using signed saturation, and returns the 256-bit result.
140	///
141	/// \code{.operation}
142	/// FOR i := 0 TO 7
143	/// j := i*16
144	/// k := i*8
145	/// result[7+k:k] := SATURATE8(__a[15+j:j])
146	/// result[71+k:64+k] := SATURATE8(__b[15+j:j])
147	/// result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
148	/// result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
149	/// ENDFOR
150	/// \endcode
151	///
152	/// \headerfile <immintrin.h>
153	///
154	/// This intrinsic corresponds to the \c VPACKSSWB instruction.
155	///
156	/// \param __a
157	/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
158	/// result[191:128].
159	/// \param __b
160	/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
161	/// result[255:192].
162	/// \returns A 256-bit integer vector containing the result.
163	static __inline__ __m256i __DEFAULT_FN_ATTRS256
164	_mm256_packs_epi16(__m256i __a, __m256i __b)
165	{
166	return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
167	}
168
169	/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
170	/// integers using signed saturation, and returns the resulting 256-bit
171	/// vector of [16 x i16].
172	///
173	/// \code{.operation}
174	/// FOR i := 0 TO 3
175	/// j := i*32
176	/// k := i*16
177	/// result[15+k:k] := SATURATE16(__a[31+j:j])
178	/// result[79+k:64+k] := SATURATE16(__b[31+j:j])
179	/// result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
180	/// result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
181	/// ENDFOR
182	/// \endcode
183	///
184	/// \headerfile <immintrin.h>
185	///
186	/// This intrinsic corresponds to the \c VPACKSSDW instruction.
187	///
188	/// \param __a
189	/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
190	/// result[191:128].
191	/// \param __b
192	/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
193	/// result[255:192].
194	/// \returns A 256-bit vector of [16 x i16] containing the result.
195	static __inline__ __m256i __DEFAULT_FN_ATTRS256
196	_mm256_packs_epi32(__m256i __a, __m256i __b)
197	{
198	return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
199	}
200
201	/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
202	/// using unsigned saturation, and returns the 256-bit result.
203	///
204	/// \code{.operation}
205	/// FOR i := 0 TO 7
206	/// j := i*16
207	/// k := i*8
208	/// result[7+k:k] := SATURATE8U(__a[15+j:j])
209	/// result[71+k:64+k] := SATURATE8U(__b[15+j:j])
210	/// result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
211	/// result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
212	/// ENDFOR
213	/// \endcode
214	///
215	/// \headerfile <immintrin.h>
216	///
217	/// This intrinsic corresponds to the \c VPACKUSWB instruction.
218	///
219	/// \param __a
220	/// A 256-bit vector of [16 x i16] used to generate result[63:0] and
221	/// result[191:128].
222	/// \param __b
223	/// A 256-bit vector of [16 x i16] used to generate result[127:64] and
224	/// result[255:192].
225	/// \returns A 256-bit integer vector containing the result.
226	static __inline__ __m256i __DEFAULT_FN_ATTRS256
227	_mm256_packus_epi16(__m256i __a, __m256i __b)
228	{
229	return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
230	}
231
232	/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
233	/// using unsigned saturation, and returns the resulting 256-bit vector of
234	/// [16 x i16].
235	///
236	/// \code{.operation}
237	/// FOR i := 0 TO 3
238	/// j := i*32
239	/// k := i*16
240	/// result[15+k:k] := SATURATE16U(__V1[31+j:j])
241	/// result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
242	/// result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
243	/// result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
244	/// ENDFOR
245	/// \endcode
246	///
247	/// \headerfile <immintrin.h>
248	///
249	/// This intrinsic corresponds to the \c VPACKUSDW instruction.
250	///
251	/// \param __V1
252	/// A 256-bit vector of [8 x i32] used to generate result[63:0] and
253	/// result[191:128].
254	/// \param __V2
255	/// A 256-bit vector of [8 x i32] used to generate result[127:64] and
256	/// result[255:192].
257	/// \returns A 256-bit vector of [16 x i16] containing the result.
258	static __inline__ __m256i __DEFAULT_FN_ATTRS256
259	_mm256_packus_epi32(__m256i __V1, __m256i __V2)
260	{
261	return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
262	}
263
264	/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
265	/// vectors and returns the lower 8 bits of each sum in the corresponding
266	/// byte of the 256-bit integer vector result (overflow is ignored).
267	///
268	/// \headerfile <immintrin.h>
269	///
270	/// This intrinsic corresponds to the \c VPADDB instruction.
271	///
272	/// \param __a
273	/// A 256-bit integer vector containing one of the source operands.
274	/// \param __b
275	/// A 256-bit integer vector containing one of the source operands.
276	/// \returns A 256-bit integer vector containing the sums.
277	static __inline__ __m256i __DEFAULT_FN_ATTRS256
278	_mm256_add_epi8(__m256i __a, __m256i __b)
279	{
280	return (__m256i)((__v32qu)__a + (__v32qu)__b);
281	}
282
283	/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
284	/// [16 x i16] and returns the lower 16 bits of each sum in the
285	/// corresponding element of the [16 x i16] result (overflow is ignored).
286	///
287	/// \headerfile <immintrin.h>
288	///
289	/// This intrinsic corresponds to the \c VPADDW instruction.
290	///
291	/// \param __a
292	/// A 256-bit vector of [16 x i16] containing one of the source operands.
293	/// \param __b
294	/// A 256-bit vector of [16 x i16] containing one of the source operands.
295	/// \returns A 256-bit vector of [16 x i16] containing the sums.
296	static __inline__ __m256i __DEFAULT_FN_ATTRS256
297	_mm256_add_epi16(__m256i __a, __m256i __b)
298	{
299	return (__m256i)((__v16hu)__a + (__v16hu)__b);
300	}
301
302	/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
303	/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding
304	/// element of the [8 x i32] result (overflow is ignored).
305	///
306	/// \headerfile <immintrin.h>
307	///
308	/// This intrinsic corresponds to the \c VPADDD instruction.
309	///
310	/// \param __a
311	/// A 256-bit vector of [8 x i32] containing one of the source operands.
312	/// \param __b
313	/// A 256-bit vector of [8 x i32] containing one of the source operands.
314	/// \returns A 256-bit vector of [8 x i32] containing the sums.
315	static __inline__ __m256i __DEFAULT_FN_ATTRS256
316	_mm256_add_epi32(__m256i __a, __m256i __b)
317	{
318	return (__m256i)((__v8su)__a + (__v8su)__b);
319	}
320
321	/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
322	/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding
323	/// element of the [4 x i64] result (overflow is ignored).
324	///
325	/// \headerfile <immintrin.h>
326	///
327	/// This intrinsic corresponds to the \c VPADDQ instruction.
328	///
329	/// \param __a
330	/// A 256-bit vector of [4 x i64] containing one of the source operands.
331	/// \param __b
332	/// A 256-bit vector of [4 x i64] containing one of the source operands.
333	/// \returns A 256-bit vector of [4 x i64] containing the sums.
334	static __inline__ __m256i __DEFAULT_FN_ATTRS256
335	_mm256_add_epi64(__m256i __a, __m256i __b)
336	{
337	return (__m256i)((__v4du)__a + (__v4du)__b);
338	}
339
340	/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
341	/// vectors using signed saturation, and returns each sum in the
342	/// corresponding byte of the 256-bit integer vector result.
343	///
344	/// \headerfile <immintrin.h>
345	///
346	/// This intrinsic corresponds to the \c VPADDSB instruction.
347	///
348	/// \param __a
349	/// A 256-bit integer vector containing one of the source operands.
350	/// \param __b
351	/// A 256-bit integer vector containing one of the source operands.
352	/// \returns A 256-bit integer vector containing the sums.
353	static __inline__ __m256i __DEFAULT_FN_ATTRS256
354	_mm256_adds_epi8(__m256i __a, __m256i __b)
355	{
356	return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
357	}
358
359	/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
360	/// [16 x i16] using signed saturation, and returns the [16 x i16] result.
361	///
362	/// \headerfile <immintrin.h>
363	///
364	/// This intrinsic corresponds to the \c VPADDSW instruction.
365	///
366	/// \param __a
367	/// A 256-bit vector of [16 x i16] containing one of the source operands.
368	/// \param __b
369	/// A 256-bit vector of [16 x i16] containing one of the source operands.
370	/// \returns A 256-bit vector of [16 x i16] containing the sums.
371	static __inline__ __m256i __DEFAULT_FN_ATTRS256
372	_mm256_adds_epi16(__m256i __a, __m256i __b)
373	{
374	return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
375	}
376
377	/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
378	/// vectors using unsigned saturation, and returns each sum in the
379	/// corresponding byte of the 256-bit integer vector result.
380	///
381	/// \headerfile <immintrin.h>
382	///
383	/// This intrinsic corresponds to the \c VPADDUSB instruction.
384	///
385	/// \param __a
386	/// A 256-bit integer vector containing one of the source operands.
387	/// \param __b
388	/// A 256-bit integer vector containing one of the source operands.
389	/// \returns A 256-bit integer vector containing the sums.
390	static __inline__ __m256i __DEFAULT_FN_ATTRS256
391	_mm256_adds_epu8(__m256i __a, __m256i __b)
392	{
393	return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
394	}
395
396	/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
397	/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
398	///
399	/// \headerfile <immintrin.h>
400	///
401	/// This intrinsic corresponds to the \c VPADDUSW instruction.
402	///
403	/// \param __a
404	/// A 256-bit vector of [16 x i16] containing one of the source operands.
405	/// \param __b
406	/// A 256-bit vector of [16 x i16] containing one of the source operands.
407	/// \returns A 256-bit vector of [16 x i16] containing the sums.
408	static __inline__ __m256i __DEFAULT_FN_ATTRS256
409	_mm256_adds_epu16(__m256i __a, __m256i __b)
410	{
411	return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
412	}
413
/// Builds a temporary 256-bit value whose upper half is the lower half of
/// the 256-bit vector \a a and whose lower half is the lower half of the
/// 256-bit vector \a b. Right-shifts that temporary value by \a n bytes and
/// uses the lower 16 bytes of the shifted value as the lower 16 bytes of
/// the result. Then builds a second temporary value the same way from the
/// upper halves of \a a and \a b, right-shifts it by \a n, and uses the
/// lower 16 bytes of the shifted value as the upper 16 bytes of the result.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c VPALIGNR instruction.
///
/// \param a
///    A 256-bit integer vector containing source values.
/// \param b
///    A 256-bit integer vector containing source values.
/// \param n
///    An immediate value specifying the number of bytes to shift.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n) \
  ((__m256i)__builtin_ia32_palignr256( \
      (__v32qi)(__m256i)(a), (__v32qi)(__m256i)(b), (n)))
441
442	/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
443	/// \a __b.
444	///
445	/// \headerfile <immintrin.h>
446	///
447	/// This intrinsic corresponds to the \c VPAND instruction.
448	///
449	/// \param __a
450	/// A 256-bit integer vector.
451	/// \param __b
452	/// A 256-bit integer vector.
453	/// \returns A 256-bit integer vector containing the result.
454	static __inline__ __m256i __DEFAULT_FN_ATTRS256
455	_mm256_and_si256(__m256i __a, __m256i __b)
456	{
457	return (__m256i)((__v4du)__a & (__v4du)__b);
458	}
459
460	/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
461	/// the bitwise NOT of the 256-bit integer vector in \a __a.
462	///
463	/// \headerfile <immintrin.h>
464	///
465	/// This intrinsic corresponds to the \c VPANDN instruction.
466	///
467	/// \param __a
468	/// A 256-bit integer vector.
469	/// \param __b
470	/// A 256-bit integer vector.
471	/// \returns A 256-bit integer vector containing the result.
472	static __inline__ __m256i __DEFAULT_FN_ATTRS256
473	_mm256_andnot_si256(__m256i __a, __m256i __b)
474	{
475	return (__m256i)(~(__v4du)__a & (__v4du)__b);
476	}
477
478	/// Computes the averages of the corresponding unsigned bytes in the two
479	/// 256-bit integer vectors in \a __a and \a __b and returns each
480	/// average in the corresponding byte of the 256-bit result.
481	///
482	/// \code{.operation}
483	/// FOR i := 0 TO 31
484	/// j := i*8
485	/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
486	/// ENDFOR
487	/// \endcode
488	///
489	/// \headerfile <immintrin.h>
490	///
491	/// This intrinsic corresponds to the \c VPAVGB instruction.
492	///
493	/// \param __a
494	/// A 256-bit integer vector.
495	/// \param __b
496	/// A 256-bit integer vector.
497	/// \returns A 256-bit integer vector containing the result.
498	static __inline__ __m256i __DEFAULT_FN_ATTRS256
499	_mm256_avg_epu8(__m256i __a, __m256i __b)
500	{
501	return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
502	}
503
504	/// Computes the averages of the corresponding unsigned 16-bit integers in
505	/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
506	/// each average in the corresponding element of the 256-bit result.
507	///
508	/// \code{.operation}
509	/// FOR i := 0 TO 15
510	/// j := i*16
511	/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
512	/// ENDFOR
513	/// \endcode
514	///
515	/// \headerfile <immintrin.h>
516	///
517	/// This intrinsic corresponds to the \c VPAVGW instruction.
518	///
519	/// \param __a
520	/// A 256-bit vector of [16 x i16].
521	/// \param __b
522	/// A 256-bit vector of [16 x i16].
523	/// \returns A 256-bit vector of [16 x i16] containing the result.
524	static __inline__ __m256i __DEFAULT_FN_ATTRS256
525	_mm256_avg_epu16(__m256i __a, __m256i __b)
526	{
527	return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
528	}
529
530	/// Merges 8-bit integer values from either of the two 256-bit vectors
531	/// \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
532	/// the resulting 256-bit integer vector.
533	///
534	/// \code{.operation}
535	/// FOR i := 0 TO 31
536	/// j := i*8
537	/// IF __M[7+i] == 0
538	/// result[7+j:j] := __V1[7+j:j]
539	/// ELSE
540	/// result[7+j:j] := __V2[7+j:j]
541	/// FI
542	/// ENDFOR
543	/// \endcode
544	///
545	/// \headerfile <immintrin.h>
546	///
547	/// This intrinsic corresponds to the \c VPBLENDVB instruction.
548	///
549	/// \param __V1
550	/// A 256-bit integer vector containing source values.
551	/// \param __V2
552	/// A 256-bit integer vector containing source values.
553	/// \param __M
554	/// A 256-bit integer vector, with bit [7] of each byte specifying the
555	/// source for each corresponding byte of the result. When the mask bit
556	/// is 0, the byte is copied from \a __V1; otherwise, it is copied from
557	/// \a __V2.
558	/// \returns A 256-bit integer vector containing the result.
559	static __inline__ __m256i __DEFAULT_FN_ATTRS256
560	_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
561	{
562	return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
563	(__v32qi)__M);
564	}
565
/// Merges 16-bit integer values from either of the two 256-bit vectors
/// \a V1 or \a V2, as specified by the immediate integer operand \a M,
/// and returns the resulting 256-bit vector of [16 x i16].
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*16
///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDW instruction.
///
/// \param V1
///    A 256-bit vector of [16 x i16] containing source values.
/// \param V2
///    A 256-bit vector of [16 x i16] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
///    elements 1 and 9, and so forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                      (__v16hi)(__m256i)(V2), \
                                      (int)(M)))
606
607	/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
608	/// \a __b for equality and returns the outcomes in the corresponding
609	/// bytes of the 256-bit result.
610	///
611	/// \code{.operation}
612	/// FOR i := 0 TO 31
613	/// j := i*8
614	/// result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
615	/// ENDFOR
616	/// \endcode
617	///
618	/// \headerfile <immintrin.h>
619	///
620	/// This intrinsic corresponds to the \c VPCMPEQB instruction.
621	///
622	/// \param __a
623	/// A 256-bit integer vector containing one of the inputs.
624	/// \param __b
625	/// A 256-bit integer vector containing one of the inputs.
626	/// \returns A 256-bit integer vector containing the result.
627	static __inline__ __m256i __DEFAULT_FN_ATTRS256
628	_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
629	{
630	return (__m256i)((__v32qi)__a == (__v32qi)__b);
631	}
632
633	/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
634	/// \a __a and \a __b for equality and returns the outcomes in the
635	/// corresponding elements of the 256-bit result.
636	///
637	/// \code{.operation}
638	/// FOR i := 0 TO 15
639	/// j := i*16
640	/// result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
641	/// ENDFOR
642	/// \endcode
643	///
644	/// \headerfile <immintrin.h>
645	///
646	/// This intrinsic corresponds to the \c VPCMPEQW instruction.
647	///
648	/// \param __a
649	/// A 256-bit vector of [16 x i16] containing one of the inputs.
650	/// \param __b
651	/// A 256-bit vector of [16 x i16] containing one of the inputs.
652	/// \returns A 256-bit vector of [16 x i16] containing the result.
653	static __inline__ __m256i __DEFAULT_FN_ATTRS256
654	_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
655	{
656	return (__m256i)((__v16hi)__a == (__v16hi)__b);
657	}
658
659	/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
660	/// \a __a and \a __b for equality and returns the outcomes in the
661	/// corresponding elements of the 256-bit result.
662	///
663	/// \code{.operation}
664	/// FOR i := 0 TO 7
665	/// j := i*32
666	/// result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
667	/// ENDFOR
668	/// \endcode
669	///
670	/// \headerfile <immintrin.h>
671	///
672	/// This intrinsic corresponds to the \c VPCMPEQD instruction.
673	///
674	/// \param __a
675	/// A 256-bit vector of [8 x i32] containing one of the inputs.
676	/// \param __b
677	/// A 256-bit vector of [8 x i32] containing one of the inputs.
678	/// \returns A 256-bit vector of [8 x i32] containing the result.
679	static __inline__ __m256i __DEFAULT_FN_ATTRS256
680	_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
681	{
682	return (__m256i)((__v8si)__a == (__v8si)__b);
683	}
684
685	/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
686	/// \a __a and \a __b for equality and returns the outcomes in the
687	/// corresponding elements of the 256-bit result.
688	///
689	/// \code{.operation}
690	/// FOR i := 0 TO 3
691	/// j := i*64
692	/// result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
693	/// ENDFOR
694	/// \endcode
695	///
696	/// \headerfile <immintrin.h>
697	///
698	/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
699	///
700	/// \param __a
701	/// A 256-bit vector of [4 x i64] containing one of the inputs.
702	/// \param __b
703	/// A 256-bit vector of [4 x i64] containing one of the inputs.
704	/// \returns A 256-bit vector of [4 x i64] containing the result.
705	static __inline__ __m256i __DEFAULT_FN_ATTRS256
706	_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
707	{
708	return (__m256i)((__v4di)__a == (__v4di)__b);
709	}
710
711	/// Compares corresponding signed bytes in the 256-bit integer vectors in
712	/// \a __a and \a __b for greater-than and returns the outcomes in the
713	/// corresponding bytes of the 256-bit result.
714	///
715	/// \code{.operation}
716	/// FOR i := 0 TO 31
717	/// j := i*8
718	/// result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
719	/// ENDFOR
720	/// \endcode
721	///
722	/// \headerfile <immintrin.h>
723	///
724	/// This intrinsic corresponds to the \c VPCMPGTB instruction.
725	///
726	/// \param __a
727	/// A 256-bit integer vector containing one of the inputs.
728	/// \param __b
729	/// A 256-bit integer vector containing one of the inputs.
730	/// \returns A 256-bit integer vector containing the result.
731	static __inline__ __m256i __DEFAULT_FN_ATTRS256
732	_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
733	{
734	/* This function always performs a signed comparison, but __v32qi is a char
735	which may be signed or unsigned, so use __v32qs. */
736	return (__m256i)((__v32qs)__a > (__v32qs)__b);
737	}
738
739	/// Compares corresponding signed elements in the 256-bit vectors of
740	/// [16 x i16] in \a __a and \a __b for greater-than and returns the
741	/// outcomes in the corresponding elements of the 256-bit result.
742	///
743	/// \code{.operation}
744	/// FOR i := 0 TO 15
745	/// j := i*16
746	/// result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
747	/// ENDFOR
748	/// \endcode
749	///
750	/// \headerfile <immintrin.h>
751	///
752	/// This intrinsic corresponds to the \c VPCMPGTW instruction.
753	///
754	/// \param __a
755	/// A 256-bit vector of [16 x i16] containing one of the inputs.
756	/// \param __b
757	/// A 256-bit vector of [16 x i16] containing one of the inputs.
758	/// \returns A 256-bit vector of [16 x i16] containing the result.
759	static __inline__ __m256i __DEFAULT_FN_ATTRS256
760	_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
761	{
762	return (__m256i)((__v16hi)__a > (__v16hi)__b);
763	}
764
765	/// Compares corresponding signed elements in the 256-bit vectors of
766	/// [8 x i32] in \a __a and \a __b for greater-than and returns the
767	/// outcomes in the corresponding elements of the 256-bit result.
768	///
769	/// \code{.operation}
770	/// FOR i := 0 TO 7
771	/// j := i*32
772	/// result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
773	/// ENDFOR
774	/// \endcode
775	///
776	/// \headerfile <immintrin.h>
777	///
778	/// This intrinsic corresponds to the \c VPCMPGTD instruction.
779	///
780	/// \param __a
781	/// A 256-bit vector of [8 x i32] containing one of the inputs.
782	/// \param __b
783	/// A 256-bit vector of [8 x i32] containing one of the inputs.
784	/// \returns A 256-bit vector of [8 x i32] containing the result.
785	static __inline__ __m256i __DEFAULT_FN_ATTRS256
786	_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
787	{
788	return (__m256i)((__v8si)__a > (__v8si)__b);
789	}
790
791	/// Compares corresponding signed elements in the 256-bit vectors of
792	/// [4 x i64] in \a __a and \a __b for greater-than and returns the
793	/// outcomes in the corresponding elements of the 256-bit result.
794	///
795	/// \code{.operation}
796	/// FOR i := 0 TO 3
797	/// j := i*64
798	/// result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
799	/// ENDFOR
800	/// \endcode
801	///
802	/// \headerfile <immintrin.h>
803	///
804	/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
805	///
806	/// \param __a
807	/// A 256-bit vector of [4 x i64] containing one of the inputs.
808	/// \param __b
809	/// A 256-bit vector of [4 x i64] containing one of the inputs.
810	/// \returns A 256-bit vector of [4 x i64] containing the result.
811	static __inline__ __m256i __DEFAULT_FN_ATTRS256
812	_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
813	{
814	return (__m256i)((__v4di)__a > (__v4di)__b);
815	}
816
817	/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
818	/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an
819	/// element of the [16 x i16] result (overflow is ignored). Sums from
820	/// \a __a are returned in the lower 64 bits of each 128-bit half of the
821	/// result; sums from \a __b are returned in the upper 64 bits of each
822	/// 128-bit half of the result.
823	///
824	/// \code{.operation}
825	/// FOR i := 0 TO 1
826	/// j := i*128
827	/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
828	/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
829	/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
830	/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
831	/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
832	/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
833	/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
834	/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
835	/// ENDFOR
836	/// \endcode
837	///
838	/// \headerfile <immintrin.h>
839	///
840	/// This intrinsic corresponds to the \c VPHADDW instruction.
841	///
842	/// \param __a
843	/// A 256-bit vector of [16 x i16] containing one of the source operands.
844	/// \param __b
845	/// A 256-bit vector of [16 x i16] containing one of the source operands.
846	/// \returns A 256-bit vector of [16 x i16] containing the sums.
847	static __inline__ __m256i __DEFAULT_FN_ATTRS256
848	_mm256_hadd_epi16(__m256i __a, __m256i __b)
849	{
850	return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
851	}
852
853	/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
854	/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an
855	/// element of the [8 x i32] result (overflow is ignored). Sums from \a __a
856	/// are returned in the lower 64 bits of each 128-bit half of the result;
857	/// sums from \a __b are returned in the upper 64 bits of each 128-bit half
858	/// of the result.
859	///
860	/// \code{.operation}
861	/// FOR i := 0 TO 1
862	/// j := i*128
863	/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
864	/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
865	/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
866	/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
867	/// ENDFOR
868	/// \endcode
869	///
870	/// \headerfile <immintrin.h>
871	///
872	/// This intrinsic corresponds to the \c VPHADDD instruction.
873	///
874	/// \param __a
875	/// A 256-bit vector of [8 x i32] containing one of the source operands.
876	/// \param __b
877	/// A 256-bit vector of [8 x i32] containing one of the source operands.
878	/// \returns A 256-bit vector of [8 x i32] containing the sums.
879	static __inline__ __m256i __DEFAULT_FN_ATTRS256
880	_mm256_hadd_epi32(__m256i __a, __m256i __b)
881	{
882	return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
883	}
884
885	/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
886	/// vectors of [16 x i16] using signed saturation and returns each sum in
887	/// an element of the [16 x i16] result. Sums from \a __a are returned in
888	/// the lower 64 bits of each 128-bit half of the result; sums from \a __b
889	/// are returned in the upper 64 bits of each 128-bit half of the result.
890	///
891	/// \code{.operation}
892	/// FOR i := 0 TO 1
893	/// j := i*128
894	/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
895	/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
896	/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
897	/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
898	/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
899	/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
900	/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
901	/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
902	/// ENDFOR
903	/// \endcode
904	///
905	/// \headerfile <immintrin.h>
906	///
907	/// This intrinsic corresponds to the \c VPHADDSW instruction.
908	///
909	/// \param __a
910	/// A 256-bit vector of [16 x i16] containing one of the source operands.
911	/// \param __b
912	/// A 256-bit vector of [16 x i16] containing one of the source operands.
913	/// \returns A 256-bit vector of [16 x i16] containing the sums.
914	static __inline__ __m256i __DEFAULT_FN_ATTRS256
915	_mm256_hadds_epi16(__m256i __a, __m256i __b)
916	{
917	return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
918	}
919
920	/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
921	/// vectors of [16 x i16] and returns the lower 16 bits of each difference
922	/// in an element of the [16 x i16] result (overflow is ignored).
923	/// Differences from \a __a are returned in the lower 64 bits of each
924	/// 128-bit half of the result; differences from \a __b are returned in the
925	/// upper 64 bits of each 128-bit half of the result.
926	///
927	/// \code{.operation}
928	/// FOR i := 0 TO 1
929	/// j := i*128
930	/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
931	/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
932	/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
933	/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
934	/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
935	/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
936	/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
937	/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
938	/// ENDFOR
939	/// \endcode
940	///
941	/// \headerfile <immintrin.h>
942	///
943	/// This intrinsic corresponds to the \c VPHSUBW instruction.
944	///
945	/// \param __a
946	/// A 256-bit vector of [16 x i16] containing one of the source operands.
947	/// \param __b
948	/// A 256-bit vector of [16 x i16] containing one of the source operands.
949	/// \returns A 256-bit vector of [16 x i16] containing the differences.
950	static __inline__ __m256i __DEFAULT_FN_ATTRS256
951	_mm256_hsub_epi16(__m256i __a, __m256i __b)
952	{
953	return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
954	}
955
956	/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
957	/// vectors of [8 x i32] and returns the lower 32 bits of each difference in
958	/// an element of the [8 x i32] result (overflow is ignored). Differences
959	/// from \a __a are returned in the lower 64 bits of each 128-bit half of
960	/// the result; differences from \a __b are returned in the upper 64 bits
961	/// of each 128-bit half of the result.
962	///
963	/// \code{.operation}
964	/// FOR i := 0 TO 1
965	/// j := i*128
966	/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
967	/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
968	/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
969	/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
970	/// ENDFOR
971	/// \endcode
972	///
973	/// \headerfile <immintrin.h>
974	///
975	/// This intrinsic corresponds to the \c VPHSUBD instruction.
976	///
977	/// \param __a
978	/// A 256-bit vector of [8 x i32] containing one of the source operands.
979	/// \param __b
980	/// A 256-bit vector of [8 x i32] containing one of the source operands.
981	/// \returns A 256-bit vector of [8 x i32] containing the differences.
982	static __inline__ __m256i __DEFAULT_FN_ATTRS256
983	_mm256_hsub_epi32(__m256i __a, __m256i __b)
984	{
985	return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
986	}
987
988	/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
989	/// vectors of [16 x i16] using signed saturation and returns each difference
990	/// in an element of the [16 x i16] result. Differences from \a __a are
991	/// returned in the lower 64 bits of each 128-bit half of the result;
992	/// differences from \a __b are returned in the upper 64 bits of each
993	/// 128-bit half of the result.
994	///
995	/// \code{.operation}
996	/// FOR i := 0 TO 1
997	/// j := i*128
998	/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
999	/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1000	/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1001	/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1002	/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1003	/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1004	/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1005	/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1006	/// ENDFOR
1007	/// \endcode
1008	///
1009	/// \headerfile <immintrin.h>
1010	///
1011	/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1012	///
1013	/// \param __a
1014	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1015	/// \param __b
1016	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1017	/// \returns A 256-bit vector of [16 x i16] containing the differences.
1018	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1019	_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1020	{
1021	return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1022	}
1023
1024	/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1025	/// with the corresponding signed byte from the 256-bit integer vector in
1026	/// \a __b, forming signed 16-bit intermediate products. Adds adjacent
1027	/// pairs of those products using signed saturation to form 16-bit sums
1028	/// returned as elements of the [16 x i16] result.
1029	///
1030	/// \code{.operation}
1031	/// FOR i := 0 TO 15
1032	/// j := i*16
1033	/// temp1 := __a[j+7:j] * __b[j+7:j]
1034	/// temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1035	/// result[j+15:j] := SATURATE16(temp1 + temp2)
1036	/// ENDFOR
1037	/// \endcode
1038	///
1039	/// \headerfile <immintrin.h>
1040	///
1041	/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1042	///
1043	/// \param __a
1044	/// A 256-bit integer vector containing the unsigned byte source operands.
1045	/// \param __b
1046	/// A 256-bit integer vector containing the signed byte source operands.
1047	/// \returns A 256-bit vector of [16 x i16] containing the result.
1048	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1049	_mm256_maddubs_epi16(__m256i __a, __m256i __b)
1050	{
1051	return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1052	}
1053
1054	/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1055	/// [16 x i16], forming 32-bit intermediate products, and adds pairs of
1056	/// those products to form 32-bit sums returned as elements of the
1057	/// [8 x i32] result.
1058	///
1059	/// There is only one wraparound case: when all four of the 16-bit sources
1060	/// are \c 0x8000, the result will be \c 0x80000000.
1061	///
1062	/// \code{.operation}
1063	/// FOR i := 0 TO 7
1064	/// j := i*32
1065	/// temp1 := __a[j+15:j] * __b[j+15:j]
1066	/// temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1067	/// result[j+31:j] := temp1 + temp2
1068	/// ENDFOR
1069	/// \endcode
1070	///
1071	/// \headerfile <immintrin.h>
1072	///
1073	/// This intrinsic corresponds to the \c VPMADDWD instruction.
1074	///
1075	/// \param __a
1076	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1077	/// \param __b
1078	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1079	/// \returns A 256-bit vector of [8 x i32] containing the result.
1080	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1081	_mm256_madd_epi16(__m256i __a, __m256i __b)
1082	{
1083	return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1084	}
1085
1086	/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1087	/// in \a __a and \a __b and returns the larger of each pair in the
1088	/// corresponding byte of the 256-bit result.
1089	///
1090	/// \headerfile <immintrin.h>
1091	///
1092	/// This intrinsic corresponds to the \c VPMAXSB instruction.
1093	///
1094	/// \param __a
1095	/// A 256-bit integer vector.
1096	/// \param __b
1097	/// A 256-bit integer vector.
1098	/// \returns A 256-bit integer vector containing the result.
1099	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1100	_mm256_max_epi8(__m256i __a, __m256i __b)
1101	{
1102	return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1103	}
1104
1105	/// Compares the corresponding signed 16-bit integers in the two 256-bit
1106	/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1107	/// each pair in the corresponding element of the 256-bit result.
1108	///
1109	/// \headerfile <immintrin.h>
1110	///
1111	/// This intrinsic corresponds to the \c VPMAXSW instruction.
1112	///
1113	/// \param __a
1114	/// A 256-bit vector of [16 x i16].
1115	/// \param __b
1116	/// A 256-bit vector of [16 x i16].
1117	/// \returns A 256-bit vector of [16 x i16] containing the result.
1118	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1119	_mm256_max_epi16(__m256i __a, __m256i __b)
1120	{
1121	return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1122	}
1123
1124	/// Compares the corresponding signed 32-bit integers in the two 256-bit
1125	/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1126	/// each pair in the corresponding element of the 256-bit result.
1127	///
1128	/// \headerfile <immintrin.h>
1129	///
1130	/// This intrinsic corresponds to the \c VPMAXSD instruction.
1131	///
1132	/// \param __a
1133	/// A 256-bit vector of [8 x i32].
1134	/// \param __b
1135	/// A 256-bit vector of [8 x i32].
1136	/// \returns A 256-bit vector of [8 x i32] containing the result.
1137	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1138	_mm256_max_epi32(__m256i __a, __m256i __b)
1139	{
1140	return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1141	}
1142
1143	/// Compares the corresponding unsigned bytes in the two 256-bit integer
1144	/// vectors in \a __a and \a __b and returns the larger of each pair in
1145	/// the corresponding byte of the 256-bit result.
1146	///
1147	/// \headerfile <immintrin.h>
1148	///
1149	/// This intrinsic corresponds to the \c VPMAXUB instruction.
1150	///
1151	/// \param __a
1152	/// A 256-bit integer vector.
1153	/// \param __b
1154	/// A 256-bit integer vector.
1155	/// \returns A 256-bit integer vector containing the result.
1156	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1157	_mm256_max_epu8(__m256i __a, __m256i __b)
1158	{
1159	return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1160	}
1161
1162	/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1163	/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1164	/// each pair in the corresponding element of the 256-bit result.
1165	///
1166	/// \headerfile <immintrin.h>
1167	///
1168	/// This intrinsic corresponds to the \c VPMAXUW instruction.
1169	///
1170	/// \param __a
1171	/// A 256-bit vector of [16 x i16].
1172	/// \param __b
1173	/// A 256-bit vector of [16 x i16].
1174	/// \returns A 256-bit vector of [16 x i16] containing the result.
1175	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1176	_mm256_max_epu16(__m256i __a, __m256i __b)
1177	{
1178	return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1179	}
1180
1181	/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1182	/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1183	/// each pair in the corresponding element of the 256-bit result.
1184	///
1185	/// \headerfile <immintrin.h>
1186	///
1187	/// This intrinsic corresponds to the \c VPMAXUD instruction.
1188	///
1189	/// \param __a
1190	/// A 256-bit vector of [8 x i32].
1191	/// \param __b
1192	/// A 256-bit vector of [8 x i32].
1193	/// \returns A 256-bit vector of [8 x i32] containing the result.
1194	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1195	_mm256_max_epu32(__m256i __a, __m256i __b)
1196	{
1197	return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1198	}
1199
1200	/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1201	/// in \a __a and \a __b and returns the smaller of each pair in the
1202	/// corresponding byte of the 256-bit result.
1203	///
1204	/// \headerfile <immintrin.h>
1205	///
1206	/// This intrinsic corresponds to the \c VPMINSB instruction.
1207	///
1208	/// \param __a
1209	/// A 256-bit integer vector.
1210	/// \param __b
1211	/// A 256-bit integer vector.
1212	/// \returns A 256-bit integer vector containing the result.
1213	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1214	_mm256_min_epi8(__m256i __a, __m256i __b)
1215	{
1216	return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1217	}
1218
1219	/// Compares the corresponding signed 16-bit integers in the two 256-bit
1220	/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1221	/// each pair in the corresponding element of the 256-bit result.
1222	///
1223	/// \headerfile <immintrin.h>
1224	///
1225	/// This intrinsic corresponds to the \c VPMINSW instruction.
1226	///
1227	/// \param __a
1228	/// A 256-bit vector of [16 x i16].
1229	/// \param __b
1230	/// A 256-bit vector of [16 x i16].
1231	/// \returns A 256-bit vector of [16 x i16] containing the result.
1232	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1233	_mm256_min_epi16(__m256i __a, __m256i __b)
1234	{
1235	return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1236	}
1237
1238	/// Compares the corresponding signed 32-bit integers in the two 256-bit
1239	/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1240	/// each pair in the corresponding element of the 256-bit result.
1241	///
1242	/// \headerfile <immintrin.h>
1243	///
1244	/// This intrinsic corresponds to the \c VPMINSD instruction.
1245	///
1246	/// \param __a
1247	/// A 256-bit vector of [8 x i32].
1248	/// \param __b
1249	/// A 256-bit vector of [8 x i32].
1250	/// \returns A 256-bit vector of [8 x i32] containing the result.
1251	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1252	_mm256_min_epi32(__m256i __a, __m256i __b)
1253	{
1254	return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1255	}
1256
1257	/// Compares the corresponding unsigned bytes in the two 256-bit integer
1258	/// vectors in \a __a and \a __b and returns the smaller of each pair in
1259	/// the corresponding byte of the 256-bit result.
1260	///
1261	/// \headerfile <immintrin.h>
1262	///
1263	/// This intrinsic corresponds to the \c VPMINUB instruction.
1264	///
1265	/// \param __a
1266	/// A 256-bit integer vector.
1267	/// \param __b
1268	/// A 256-bit integer vector.
1269	/// \returns A 256-bit integer vector containing the result.
1270	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1271	_mm256_min_epu8(__m256i __a, __m256i __b)
1272	{
1273	return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1274	}
1275
1276	/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1277	/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1278	/// each pair in the corresponding element of the 256-bit result.
1279	///
1280	/// \headerfile <immintrin.h>
1281	///
1282	/// This intrinsic corresponds to the \c VPMINUW instruction.
1283	///
1284	/// \param __a
1285	/// A 256-bit vector of [16 x i16].
1286	/// \param __b
1287	/// A 256-bit vector of [16 x i16].
1288	/// \returns A 256-bit vector of [16 x i16] containing the result.
1289	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1290	_mm256_min_epu16(__m256i __a, __m256i __b)
1291	{
1292	return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1293	}
1294
1295	/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1296	/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1297	/// each pair in the corresponding element of the 256-bit result.
1298	///
1299	/// \headerfile <immintrin.h>
1300	///
1301	/// This intrinsic corresponds to the \c VPMINUD instruction.
1302	///
1303	/// \param __a
1304	/// A 256-bit vector of [8 x i32].
1305	/// \param __b
1306	/// A 256-bit vector of [8 x i32].
1307	/// \returns A 256-bit vector of [8 x i32] containing the result.
1308	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309	_mm256_min_epu32(__m256i __a, __m256i __b)
1310	{
1311	return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1312	}
1313
1314	/// Creates a 32-bit integer mask from the most significant bit of each byte
1315	/// in the 256-bit integer vector in \a __a and returns the result.
1316	///
1317	/// \code{.operation}
1318	/// FOR i := 0 TO 31
1319	/// j := i*8
1320	/// result[i] := __a[j+7]
1321	/// ENDFOR
1322	/// \endcode
1323	///
1324	/// \headerfile <immintrin.h>
1325	///
1326	/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1327	///
1328	/// \param __a
1329	/// A 256-bit integer vector containing the source bytes.
1330	/// \returns The 32-bit integer mask.
1331	static __inline__ int __DEFAULT_FN_ATTRS256
1332	_mm256_movemask_epi8(__m256i __a)
1333	{
1334	return __builtin_ia32_pmovmskb256((__v32qi)__a);
1335	}
1336
1337	/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1338	/// the 16-bit values in the corresponding elements of a 256-bit vector
1339	/// of [16 x i16].
1340	///
1341	/// \code{.operation}
1342	/// FOR i := 0 TO 15
1343	/// j := i*8
1344	/// k := i*16
1345	/// result[k+15:k] := SignExtend(__V[j+7:j])
1346	/// ENDFOR
1347	/// \endcode
1348	///
1349	/// \headerfile <immintrin.h>
1350	///
1351	/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1352	///
1353	/// \param __V
1354	/// A 128-bit integer vector containing the source bytes.
1355	/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1356	/// values.
1357	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1358	_mm256_cvtepi8_epi16(__m128i __V)
1359	{
1360	/* This function always performs a signed extension, but __v16qi is a char
1361	which may be signed or unsigned, so use __v16qs. */
1362	return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1363	}
1364
1365	/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1366	/// \a __V and returns the 32-bit values in the corresponding elements of a
1367	/// 256-bit vector of [8 x i32].
1368	///
1369	/// \code{.operation}
1370	/// FOR i := 0 TO 7
1371	/// j := i*8
1372	/// k := i*32
1373	/// result[k+31:k] := SignExtend(__V[j+7:j])
1374	/// ENDFOR
1375	/// \endcode
1376	///
1377	/// \headerfile <immintrin.h>
1378	///
1379	/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1380	///
1381	/// \param __V
1382	/// A 128-bit integer vector containing the source bytes.
1383	/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1384	/// values.
1385	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1386	_mm256_cvtepi8_epi32(__m128i __V)
1387	{
1388	/* This function always performs a signed extension, but __v16qi is a char
1389	which may be signed or unsigned, so use __v16qs. */
1390	return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1391	}
1392
1393	/// Sign-extends the first four bytes from the 128-bit integer vector in
1394	/// \a __V and returns the 64-bit values in the corresponding elements of a
1395	/// 256-bit vector of [4 x i64].
1396	///
1397	/// \code{.operation}
1398	/// result[63:0] := SignExtend(__V[7:0])
1399	/// result[127:64] := SignExtend(__V[15:8])
1400	/// result[191:128] := SignExtend(__V[23:16])
1401	/// result[255:192] := SignExtend(__V[31:24])
1402	/// \endcode
1403	///
1404	/// \headerfile <immintrin.h>
1405	///
1406	/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1407	///
1408	/// \param __V
1409	/// A 128-bit integer vector containing the source bytes.
1410	/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1411	/// values.
1412	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1413	_mm256_cvtepi8_epi64(__m128i __V)
1414	{
1415	/* This function always performs a signed extension, but __v16qi is a char
1416	which may be signed or unsigned, so use __v16qs. */
1417	return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1418	}
1419
1420	/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1421	/// \a __V and returns the 32-bit values in the corresponding elements of a
1422	/// 256-bit vector of [8 x i32].
1423	///
1424	/// \code{.operation}
1425	/// FOR i := 0 TO 7
1426	/// j := i*16
1427	/// k := i*32
1428	/// result[k+31:k] := SignExtend(__V[j+15:j])
1429	/// ENDFOR
1430	/// \endcode
1431	///
1432	/// \headerfile <immintrin.h>
1433	///
1434	/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1435	///
1436	/// \param __V
1437	/// A 128-bit vector of [8 x i16] containing the source values.
1438	/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1439	/// values.
1440	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1441	_mm256_cvtepi16_epi32(__m128i __V)
1442	{
1443	return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1444	}
1445
1446	/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1447	/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1448	/// elements of a 256-bit vector of [4 x i64].
1449	///
1450	/// \code{.operation}
1451	/// result[63:0] := SignExtend(__V[15:0])
1452	/// result[127:64] := SignExtend(__V[31:16])
1453	/// result[191:128] := SignExtend(__V[47:32])
1454	/// result[255:192] := SignExtend(__V[63:48])
1455	/// \endcode
1456	///
1457	/// \headerfile <immintrin.h>
1458	///
1459	/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1460	///
1461	/// \param __V
1462	/// A 128-bit vector of [8 x i16] containing the source values.
1463	/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1464	/// values.
1465	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1466	_mm256_cvtepi16_epi64(__m128i __V)
1467	{
1468	return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1469	}
1470
1471	/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1472	/// \a __V and returns the 64-bit values in the corresponding elements of a
1473	/// 256-bit vector of [4 x i64].
1474	///
1475	/// \code{.operation}
1476	/// result[63:0] := SignExtend(__V[31:0])
1477	/// result[127:64] := SignExtend(__V[63:32])
1478	/// result[191:128] := SignExtend(__V[95:64])
1479	/// result[255:192] := SignExtend(__V[127:96])
1480	/// \endcode
1481	///
1482	/// \headerfile <immintrin.h>
1483	///
1484	/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1485	///
1486	/// \param __V
1487	/// A 128-bit vector of [4 x i32] containing the source values.
1488	/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1489	/// values.
1490	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1491	_mm256_cvtepi32_epi64(__m128i __V)
1492	{
1493	return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1494	}
1495
1496	/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1497	/// the 16-bit values in the corresponding elements of a 256-bit vector
1498	/// of [16 x i16].
1499	///
1500	/// \code{.operation}
1501	/// FOR i := 0 TO 15
1502	/// j := i*8
1503	/// k := i*16
1504	/// result[k+15:k] := ZeroExtend(__V[j+7:j])
1505	/// ENDFOR
1506	/// \endcode
1507	///
1508	/// \headerfile <immintrin.h>
1509	///
1510	/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1511	///
1512	/// \param __V
1513	/// A 128-bit integer vector containing the source bytes.
1514	/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1515	/// values.
1516	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1517	_mm256_cvtepu8_epi16(__m128i __V)
1518	{
1519	return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1520	}
1521
1522	/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1523	/// \a __V and returns the 32-bit values in the corresponding elements of a
1524	/// 256-bit vector of [8 x i32].
1525	///
1526	/// \code{.operation}
1527	/// FOR i := 0 TO 7
1528	/// j := i*8
1529	/// k := i*32
1530	/// result[k+31:k] := ZeroExtend(__V[j+7:j])
1531	/// ENDFOR
1532	/// \endcode
1533	///
1534	/// \headerfile <immintrin.h>
1535	///
1536	/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1537	///
1538	/// \param __V
1539	/// A 128-bit integer vector containing the source bytes.
1540	/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1541	/// values.
1542	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1543	_mm256_cvtepu8_epi32(__m128i __V)
1544	{
1545	return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1546	}
1547
1548	/// Zero-extends the first four bytes from the 128-bit integer vector in
1549	/// \a __V and returns the 64-bit values in the corresponding elements of a
1550	/// 256-bit vector of [4 x i64].
1551	///
1552	/// \code{.operation}
1553	/// result[63:0] := ZeroExtend(__V[7:0])
1554	/// result[127:64] := ZeroExtend(__V[15:8])
1555	/// result[191:128] := ZeroExtend(__V[23:16])
1556	/// result[255:192] := ZeroExtend(__V[31:24])
1557	/// \endcode
1558	///
1559	/// \headerfile <immintrin.h>
1560	///
1561	/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1562	///
1563	/// \param __V
1564	/// A 128-bit integer vector containing the source bytes.
1565	/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1566	/// values.
1567	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1568	_mm256_cvtepu8_epi64(__m128i __V)
1569	{
1570	return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1571	}
1572
1573	/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1574	/// \a __V and returns the 32-bit values in the corresponding elements of a
1575	/// 256-bit vector of [8 x i32].
1576	///
1577	/// \code{.operation}
1578	/// FOR i := 0 TO 7
1579	/// j := i*16
1580	/// k := i*32
1581	/// result[k+31:k] := ZeroExtend(__V[j+15:j])
1582	/// ENDFOR
1583	/// \endcode
1584	///
1585	/// \headerfile <immintrin.h>
1586	///
1587	/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1588	///
1589	/// \param __V
1590	/// A 128-bit vector of [8 x i16] containing the source values.
1591	/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1592	/// values.
1593	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1594	_mm256_cvtepu16_epi32(__m128i __V)
1595	{
1596	return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1597	}
1598
1599	/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1600	/// [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1601	/// elements of a 256-bit vector of [4 x i64].
1602	///
1603	/// \code{.operation}
1604	/// result[63:0] := ZeroExtend(__V[15:0])
1605	/// result[127:64] := ZeroExtend(__V[31:16])
1606	/// result[191:128] := ZeroExtend(__V[47:32])
1607	/// result[255:192] := ZeroExtend(__V[63:48])
1608	/// \endcode
1609	///
1610	/// \headerfile <immintrin.h>
1611	///
1612	/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1613	///
1614	/// \param __V
1615	/// A 128-bit vector of [8 x i16] containing the source values.
1616	/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1617	/// values.
1618	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1619	_mm256_cvtepu16_epi64(__m128i __V)
1620	{
1621	return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1622	}
1623
1624	/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1625	/// \a __V and returns the 64-bit values in the corresponding elements of a
1626	/// 256-bit vector of [4 x i64].
1627	///
1628	/// \code{.operation}
1629	/// result[63:0] := ZeroExtend(__V[31:0])
1630	/// result[127:64] := ZeroExtend(__V[63:32])
1631	/// result[191:128] := ZeroExtend(__V[95:64])
1632	/// result[255:192] := ZeroExtend(__V[127:96])
1633	/// \endcode
1634	///
1635	/// \headerfile <immintrin.h>
1636	///
1637	/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1638	///
1639	/// \param __V
1640	/// A 128-bit vector of [4 x i32] containing the source values.
1641	/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1642	/// values.
1643	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1644	_mm256_cvtepu32_epi64(__m128i __V)
1645	{
1646	return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1647	}
1648
1649	/// Multiplies signed 32-bit integers from even-numbered elements of two
1650	/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1651	/// [4 x i64] result.
1652	///
1653	/// \code{.operation}
1654	/// result[63:0] := __a[31:0] * __b[31:0]
1655	/// result[127:64] := __a[95:64] * __b[95:64]
1656	/// result[191:128] := __a[159:128] * __b[159:128]
1657	/// result[255:192] := __a[223:192] * __b[223:192]
1658	/// \endcode
1659	///
1660	/// \headerfile <immintrin.h>
1661	///
1662	/// This intrinsic corresponds to the \c VPMULDQ instruction.
1663	///
1664	/// \param __a
1665	/// A 256-bit vector of [8 x i32] containing one of the source operands.
1666	/// \param __b
1667	/// A 256-bit vector of [8 x i32] containing one of the source operands.
1668	/// \returns A 256-bit vector of [4 x i64] containing the products.
1669	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1670	_mm256_mul_epi32(__m256i __a, __m256i __b)
1671	{
1672	return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1673	}
1674
1675	/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1676	/// [16 x i16], truncates the 32-bit results to the most significant 18
1677	/// bits, rounds by adding 1, and returns bits [16:1] of each rounded
1678	/// product in the [16 x i16] result.
1679	///
1680	/// \code{.operation}
1681	/// FOR i := 0 TO 15
1682	/// j := i*16
1683	/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1684	/// result[j+15:j] := temp[16:1]
1685	/// \endcode
1686	///
1687	/// \headerfile <immintrin.h>
1688	///
1689	/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1690	///
1691	/// \param __a
1692	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1693	/// \param __b
1694	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1695	/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1696	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1697	_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1698	{
1699	return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1700	}
1701
1702	/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1703	/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1704	/// [16 x i16] result.
1705	///
1706	/// \headerfile <immintrin.h>
1707	///
1708	/// This intrinsic corresponds to the \c VPMULHUW instruction.
1709	///
1710	/// \param __a
1711	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1712	/// \param __b
1713	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1714	/// \returns A 256-bit vector of [16 x i16] containing the products.
1715	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1716	_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1717	{
1718	return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1719	}
1720
1721	/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1722	/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1723	/// [16 x i16] result.
1724	///
1725	/// \headerfile <immintrin.h>
1726	///
1727	/// This intrinsic corresponds to the \c VPMULHW instruction.
1728	///
1729	/// \param __a
1730	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1731	/// \param __b
1732	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1733	/// \returns A 256-bit vector of [16 x i16] containing the products.
1734	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1735	_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1736	{
1737	return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1738	}
1739
1740	/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1741	/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1742	/// [16 x i16] result.
1743	///
1744	/// \headerfile <immintrin.h>
1745	///
1746	/// This intrinsic corresponds to the \c VPMULLW instruction.
1747	///
1748	/// \param __a
1749	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1750	/// \param __b
1751	/// A 256-bit vector of [16 x i16] containing one of the source operands.
1752	/// \returns A 256-bit vector of [16 x i16] containing the products.
1753	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1754	_mm256_mullo_epi16(__m256i __a, __m256i __b)
1755	{
1756	return (__m256i)((__v16hu)__a * (__v16hu)__b);
1757	}
1758
1759	/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1760	/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1761	/// [8 x i32] result.
1762	///
1763	/// \headerfile <immintrin.h>
1764	///
1765	/// This intrinsic corresponds to the \c VPMULLD instruction.
1766	///
1767	/// \param __a
1768	/// A 256-bit vector of [8 x i32] containing one of the source operands.
1769	/// \param __b
1770	/// A 256-bit vector of [8 x i32] containing one of the source operands.
1771	/// \returns A 256-bit vector of [8 x i32] containing the products.
1772	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1773	_mm256_mullo_epi32 (__m256i __a, __m256i __b)
1774	{
1775	return (__m256i)((__v8su)__a * (__v8su)__b);
1776	}
1777
1778	/// Multiplies unsigned 32-bit integers from even-numbered elements of two
1779	/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the
1780	/// [4 x i64] result.
1781	///
1782	/// \code{.operation}
1783	/// result[63:0] := __a[31:0] * __b[31:0]
1784	/// result[127:64] := __a[95:64] * __b[95:64]
1785	/// result[191:128] := __a[159:128] * __b[159:128]
1786	/// result[255:192] := __a[223:192] * __b[223:192]
1787	/// \endcode
1788	///
1789	/// \headerfile <immintrin.h>
1790	///
1791	/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1792	///
1793	/// \param __a
1794	/// A 256-bit vector of [8 x i32] containing one of the source operands.
1795	/// \param __b
1796	/// A 256-bit vector of [8 x i32] containing one of the source operands.
1797	/// \returns A 256-bit vector of [4 x i64] containing the products.
1798	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1799	_mm256_mul_epu32(__m256i __a, __m256i __b)
1800	{
1801	return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1802	}
1803
1804	/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1805	/// \a __b.
1806	///
1807	/// \headerfile <immintrin.h>
1808	///
1809	/// This intrinsic corresponds to the \c VPOR instruction.
1810	///
1811	/// \param __a
1812	/// A 256-bit integer vector.
1813	/// \param __b
1814	/// A 256-bit integer vector.
1815	/// \returns A 256-bit integer vector containing the result.
1816	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1817	_mm256_or_si256(__m256i __a, __m256i __b)
1818	{
1819	return (__m256i)((__v4du)__a \| (__v4du)__b);
1820	}
1821
1822	/// Computes four sum of absolute difference (SAD) operations on sets of eight
1823	/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1824	/// \a __b.
1825	///
1826	/// One SAD result is computed for each set of eight bytes from \a __a and
1827	/// eight bytes from \a __b. The zero-extended SAD value is returned in the
1828	/// corresponding 64-bit element of the result.
1829	///
1830	/// A single SAD operation takes the differences between the corresponding
1831	/// bytes of \a __a and \a __b, takes the absolute value of each difference,
1832	/// and sums these eight values to form one 16-bit result. This operation
1833	/// is repeated four times with successive sets of eight bytes.
1834	///
1835	/// \code{.operation}
1836	/// FOR i := 0 TO 3
1837	/// j := i*64
1838	/// temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1839	/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1840	/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1841	/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1842	/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1843	/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1844	/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1845	/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1846	/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1847	/// temp4 + temp5 + temp6 + temp7
1848	/// result[j+63:j+16] := 0
1849	/// ENDFOR
1850	/// \endcode
1851	///
1852	/// \headerfile <immintrin.h>
1853	///
1854	/// This intrinsic corresponds to the \c VPSADBW instruction.
1855	///
1856	/// \param __a
1857	/// A 256-bit integer vector.
1858	/// \param __b
1859	/// A 256-bit integer vector.
1860	/// \returns A 256-bit integer vector containing the result.
1861	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1862	_mm256_sad_epu8(__m256i __a, __m256i __b)
1863	{
1864	return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1865	}
1866
1867	/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1868	/// to control information in the 256-bit integer vector \a __b, and
1869	/// returns the 256-bit result. In effect there are two separate 128-bit
1870	/// shuffles in the lower and upper halves.
1871	///
1872	/// \code{.operation}
1873	/// FOR i := 0 TO 31
1874	/// j := i*8
1875	/// IF __b[j+7] == 1
1876	/// result[j+7:j] := 0
1877	/// ELSE
1878	/// k := __b[j+3:j] * 8
1879	/// IF i > 15
1880	/// k := k + 128
1881	/// FI
1882	/// result[j+7:j] := __a[k+7:k]
1883	/// FI
1884	/// ENDFOR
1885	/// \endcode
1886	///
1887	/// \headerfile <immintrin.h>
1888	///
1889	/// This intrinsic corresponds to the \c VPSHUFB instruction.
1890	///
1891	/// \param __a
1892	/// A 256-bit integer vector containing source values.
1893	/// \param __b
1894	/// A 256-bit integer vector containing control information to determine
1895	/// what goes into the corresponding byte of the result. If bit 7 of the
1896	/// control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1897	/// control byte specify the index (within the same 128-bit half) of \a __a
1898	/// to copy to the result byte.
1899	/// \returns A 256-bit integer vector containing the result.
1900	static __inline__ __m256i __DEFAULT_FN_ATTRS256
1901	_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1902	{
1903	return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1904	}
1905
1906	/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1907	/// according to control information in the integer literal \a imm, and
1908	/// returns the 256-bit result. In effect there are two parallel 128-bit
1909	/// shuffles in the lower and upper halves.
1910	///
1911	/// \code{.operation}
1912	/// FOR i := 0 to 3
1913	/// j := i*32
1914	/// k := (imm >> i*2)[1:0] * 32
1915	/// result[j+31:j] := a[k+31:k]
1916	/// result[128+j+31:128+j] := a[128+k+31:128+k]
1917	/// ENDFOR
1918	/// \endcode
1919	///
1920	/// \headerfile <immintrin.h>
1921	///
1922	/// \code
1923	/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1924	/// \endcode
1925	///
1926	/// This intrinsic corresponds to the \c VPSHUFD instruction.
1927	///
1928	/// \param a
1929	/// A 256-bit vector of [8 x i32] containing source values.
1930	/// \param imm
1931	/// An immediate 8-bit value specifying which elements to copy from \a a.
1932	/// \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1933	/// result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1934	/// forth.
1935	/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm)                                          \
  ((__m256i)__builtin_ia32_pshufd256(                                         \
      (__v8si)(__m256i)(a), /* source viewed as [8 x i32] */                  \
      (int)(imm)            /* selector; documented as an integer literal */  \
      ))
1938
1939	/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1940	/// according to control information in the integer literal \a imm, and
1941	/// returns the 256-bit result. The upper 64 bits of each 128-bit half
1942	/// are shuffled in parallel; the lower 64 bits of each 128-bit half are
1943	/// copied from \a a unchanged.
1944	///
1945	/// \code{.operation}
1946	/// result[63:0] := a[63:0]
1947	/// result[191:128] := a[191:128]
1948	/// FOR i := 0 TO 3
1949	/// j := i * 16 + 64
1950	/// k := (imm >> i*2)[1:0] * 16 + 64
1951	/// result[j+15:j] := a[k+15:k]
1952	/// result[128+j+15:128+j] := a[128+k+15:128+k]
1953	/// ENDFOR
1954	/// \endcode
1955	///
1956	/// \headerfile <immintrin.h>
1957	///
1958	/// \code
1959	/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1960	/// \endcode
1961	///
1962	/// This intrinsic corresponds to the \c VPSHUFHW instruction.
1963	///
1964	/// \param a
1965	/// A 256-bit vector of [16 x i16] containing source values.
1966	/// \param imm
1967	/// An immediate 8-bit value specifying which elements to copy from \a a.
1968	/// \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1969	/// result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1970	/// forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
1971	/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm)                                        \
  ((__m256i)__builtin_ia32_pshufhw256(                                        \
      (__v16hi)(__m256i)(a), /* source viewed as [16 x i16] */                \
      (int)(imm)             /* selector; documented as an integer literal */ \
      ))
1974
1975	/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
1976	/// according to control information in the integer literal \a imm, and
1977	/// returns the 256-bit [16 x i16] result. The lower 64 bits of each
1978	/// 128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1979	/// copied from \a a unchanged.
1980	///
1981	/// \code{.operation}
1982	/// result[127:64] := a[127:64]
1983	/// result[255:192] := a[255:192]
1984	/// FOR i := 0 TO 3
1985	/// j := i * 16
1986	/// k := (imm >> i*2)[1:0] * 16
1987	/// result[j+15:j] := a[k+15:k]
1988	/// result[128+j+15:128+j] := a[128+k+15:128+k]
1989	/// ENDFOR
1990	/// \endcode
1991	///
1992	/// \headerfile <immintrin.h>
1993	///
1994	/// \code
1995	/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1996	/// \endcode
1997	///
1998	/// This intrinsic corresponds to the \c VPSHUFLW instruction.
1999	///
2000	/// \param a
2001	/// A 256-bit vector of [16 x i16] to use as a source of data for the
2002	/// result.
2003	/// \param imm
2004	/// An immediate 8-bit value specifying which elements to copy from \a a.
2005	/// \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
2006	/// result, \a imm[3:2] specifies the index for elements 1 and 9, and so
2007	/// forth.
2008	/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                        \
  ((__m256i)__builtin_ia32_pshuflw256(                                        \
      (__v16hi)(__m256i)(a), /* source viewed as [16 x i16] */                \
      (int)(imm)             /* selector; documented as an integer literal */ \
      ))
2011
2012	/// Sets each byte of the result to the corresponding byte of the 256-bit
2013	/// integer vector in \a __a, the negative of that byte, or zero, depending
2014	/// on whether the corresponding byte of the 256-bit integer vector in
2015	/// \a __b is greater than zero, less than zero, or equal to zero,
2016	/// respectively.
2017	///
2018	/// \headerfile <immintrin.h>
2019	///
2020	/// This intrinsic corresponds to the \c VPSIGNB instruction.
2021	///
2022	/// \param __a
2023	/// A 256-bit integer vector.
2024	/// \param __b
2025	/// A 256-bit integer vector.
2026	/// \returns A 256-bit integer vector containing the result.
2027	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028	_mm256_sign_epi8(__m256i __a, __m256i __b)
2029	{
2030	return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2031	}
2032
2033	/// Sets each element of the result to the corresponding element of the
2034	/// 256-bit vector of [16 x i16] in \a __a, the negative of that element,
2035	/// or zero, depending on whether the corresponding element of the 256-bit
2036	/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2037	/// equal to zero, respectively.
2038	///
2039	/// \headerfile <immintrin.h>
2040	///
2041	/// This intrinsic corresponds to the \c VPSIGNW instruction.
2042	///
2043	/// \param __a
2044	/// A 256-bit vector of [16 x i16].
2045	/// \param __b
2046	/// A 256-bit vector of [16 x i16].
2047	/// \returns A 256-bit vector of [16 x i16] containing the result.
2048	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049	_mm256_sign_epi16(__m256i __a, __m256i __b)
2050	{
2051	return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2052	}
2053
2054	/// Sets each element of the result to the corresponding element of the
2055	/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2056	/// zero, depending on whether the corresponding element of the 256-bit
2057	/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2058	/// equal to zero, respectively.
2059	///
2060	/// \headerfile <immintrin.h>
2061	///
2062	/// This intrinsic corresponds to the \c VPSIGND instruction.
2063	///
2064	/// \param __a
2065	/// A 256-bit vector of [8 x i32].
2066	/// \param __b
2067	/// A 256-bit vector of [8 x i32].
2068	/// \returns A 256-bit vector of [8 x i32] containing the result.
2069	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2070	_mm256_sign_epi32(__m256i __a, __m256i __b)
2071	{
2072	return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2073	}
2074
2075	/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2076	/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2077	/// is greater than 15, the returned result is all zeroes.
2078	///
2079	/// \headerfile <immintrin.h>
2080	///
2081	/// \code
2082	/// __m256i _mm256_slli_si256(__m256i a, const int imm);
2083	/// \endcode
2084	///
2085	/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2086	///
2087	/// \param a
2088	/// A 256-bit integer vector to be shifted.
2089	/// \param imm
2090	/// An unsigned immediate value specifying the shift count (in bytes).
2091	/// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm)                                             \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                              \
      (__v4di)(__m256i)(a), /* vector to shift */                             \
      (int)(imm)            /* byte count; documented as an immediate */      \
      ))
2094
2095	/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2096	/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2097	/// is greater than 15, the returned result is all zeroes.
2098	///
2099	/// \headerfile <immintrin.h>
2100	///
2101	/// \code
2102	/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2103	/// \endcode
2104	///
2105	/// This intrinsic corresponds to the \c VPSLLDQ instruction.
2106	///
2107	/// \param a
2108	/// A 256-bit integer vector to be shifted.
2109	/// \param imm
2110	/// An unsigned immediate value specifying the shift count (in bytes).
2111	/// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm)                                           \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift(                              \
      (__v4di)(__m256i)(a), /* vector to shift */                             \
      (int)(imm)            /* byte count; documented as an immediate */      \
      ))
2114
2115	/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2116	/// left by \a __count bits, shifting in zero bits, and returns the result.
2117	/// If \a __count is greater than 15, the returned result is all zeroes.
2118	///
2119	/// \headerfile <immintrin.h>
2120	///
2121	/// This intrinsic corresponds to the \c VPSLLW instruction.
2122	///
2123	/// \param __a
2124	/// A 256-bit vector of [16 x i16] to be shifted.
2125	/// \param __count
2126	/// An unsigned integer value specifying the shift count (in bits).
2127	/// \returns A 256-bit vector of [16 x i16] containing the result.
2128	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129	_mm256_slli_epi16(__m256i __a, int __count)
2130	{
2131	return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2132	}
2133
2134	/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2135	/// left by the number of bits specified by the lower 64 bits of \a __count,
2136	/// shifting in zero bits, and returns the result. If \a __count is greater
2137	/// than 15, the returned result is all zeroes.
2138	///
2139	/// \headerfile <immintrin.h>
2140	///
2141	/// This intrinsic corresponds to the \c VPSLLW instruction.
2142	///
2143	/// \param __a
2144	/// A 256-bit vector of [16 x i16] to be shifted.
2145	/// \param __count
2146	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2147	/// shift count (in bits). The upper element is ignored.
2148	/// \returns A 256-bit vector of [16 x i16] containing the result.
2149	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2150	_mm256_sll_epi16(__m256i __a, __m128i __count)
2151	{
2152	return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2153	}
2154
2155	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2156	/// left by \a __count bits, shifting in zero bits, and returns the result.
2157	/// If \a __count is greater than 31, the returned result is all zeroes.
2158	///
2159	/// \headerfile <immintrin.h>
2160	///
2161	/// This intrinsic corresponds to the \c VPSLLD instruction.
2162	///
2163	/// \param __a
2164	/// A 256-bit vector of [8 x i32] to be shifted.
2165	/// \param __count
2166	/// An unsigned integer value specifying the shift count (in bits).
2167	/// \returns A 256-bit vector of [8 x i32] containing the result.
2168	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169	_mm256_slli_epi32(__m256i __a, int __count)
2170	{
2171	return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2172	}
2173
2174	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2175	/// left by the number of bits given in the lower 64 bits of \a __count,
2176	/// shifting in zero bits, and returns the result. If \a __count is greater
2177	/// than 31, the returned result is all zeroes.
2178	///
2179	/// \headerfile <immintrin.h>
2180	///
2181	/// This intrinsic corresponds to the \c VPSLLD instruction.
2182	///
2183	/// \param __a
2184	/// A 256-bit vector of [8 x i32] to be shifted.
2185	/// \param __count
2186	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2187	/// shift count (in bits). The upper element is ignored.
2188	/// \returns A 256-bit vector of [8 x i32] containing the result.
2189	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2190	_mm256_sll_epi32(__m256i __a, __m128i __count)
2191	{
2192	return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2193	}
2194
2195	/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2196	/// left by \a __count bits, shifting in zero bits, and returns the result.
2197	/// If \a __count is greater than 63, the returned result is all zeroes.
2198	///
2199	/// \headerfile <immintrin.h>
2200	///
2201	/// This intrinsic corresponds to the \c VPSLLQ instruction.
2202	///
2203	/// \param __a
2204	/// A 256-bit vector of [4 x i64] to be shifted.
2205	/// \param __count
2206	/// An unsigned integer value specifying the shift count (in bits).
2207	/// \returns A 256-bit vector of [4 x i64] containing the result.
2208	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209	_mm256_slli_epi64(__m256i __a, int __count)
2210	{
2211	return __builtin_ia32_psllqi256((__v4di)__a, __count);
2212	}
2213
2214	/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2215	/// left by the number of bits given in the lower 64 bits of \a __count,
2216	/// shifting in zero bits, and returns the result. If \a __count is greater
2217	/// than 63, the returned result is all zeroes.
2218	///
2219	/// \headerfile <immintrin.h>
2220	///
2221	/// This intrinsic corresponds to the \c VPSLLQ instruction.
2222	///
2223	/// \param __a
2224	/// A 256-bit vector of [4 x i64] to be shifted.
2225	/// \param __count
2226	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2227	/// shift count (in bits). The upper element is ignored.
2228	/// \returns A 256-bit vector of [4 x i64] containing the result.
2229	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2230	_mm256_sll_epi64(__m256i __a, __m128i __count)
2231	{
2232	return __builtin_ia32_psllq256((__v4di)__a, __count);
2233	}
2234
2235	/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2236	/// right by \a __count bits, shifting in sign bits, and returns the result.
2237	/// If \a __count is greater than 15, each element of the result is either
2238	/// 0 or -1 according to the corresponding input sign bit.
2239	///
2240	/// \headerfile <immintrin.h>
2241	///
2242	/// This intrinsic corresponds to the \c VPSRAW instruction.
2243	///
2244	/// \param __a
2245	/// A 256-bit vector of [16 x i16] to be shifted.
2246	/// \param __count
2247	/// An unsigned integer value specifying the shift count (in bits).
2248	/// \returns A 256-bit vector of [16 x i16] containing the result.
2249	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2250	_mm256_srai_epi16(__m256i __a, int __count)
2251	{
2252	return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2253	}
2254
2255	/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2256	/// right by the number of bits given in the lower 64 bits of \a __count,
2257	/// shifting in sign bits, and returns the result. If \a __count is greater
2258	/// than 15, each element of the result is either 0 or -1 according to the
2259	/// corresponding input sign bit.
2260	///
2261	/// \headerfile <immintrin.h>
2262	///
2263	/// This intrinsic corresponds to the \c VPSRAW instruction.
2264	///
2265	/// \param __a
2266	/// A 256-bit vector of [16 x i16] to be shifted.
2267	/// \param __count
2268	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2269	/// shift count (in bits). The upper element is ignored.
2270	/// \returns A 256-bit vector of [16 x i16] containing the result.
2271	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2272	_mm256_sra_epi16(__m256i __a, __m128i __count)
2273	{
2274	return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2275	}
2276
2277	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2278	/// right by \a __count bits, shifting in sign bits, and returns the result.
2279	/// If \a __count is greater than 31, each element of the result is either
2280	/// 0 or -1 according to the corresponding input sign bit.
2281	///
2282	/// \headerfile <immintrin.h>
2283	///
2284	/// This intrinsic corresponds to the \c VPSRAD instruction.
2285	///
2286	/// \param __a
2287	/// A 256-bit vector of [8 x i32] to be shifted.
2288	/// \param __count
2289	/// An unsigned integer value specifying the shift count (in bits).
2290	/// \returns A 256-bit vector of [8 x i32] containing the result.
2291	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2292	_mm256_srai_epi32(__m256i __a, int __count)
2293	{
2294	return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2295	}
2296
2297	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2298	/// right by the number of bits given in the lower 64 bits of \a __count,
2299	/// shifting in sign bits, and returns the result. If \a __count is greater
2300	/// than 31, each element of the result is either 0 or -1 according to the
2301	/// corresponding input sign bit.
2302	///
2303	/// \headerfile <immintrin.h>
2304	///
2305	/// This intrinsic corresponds to the \c VPSRAD instruction.
2306	///
2307	/// \param __a
2308	/// A 256-bit vector of [8 x i32] to be shifted.
2309	/// \param __count
2310	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2311	/// shift count (in bits). The upper element is ignored.
2312	/// \returns A 256-bit vector of [8 x i32] containing the result.
2313	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2314	_mm256_sra_epi32(__m256i __a, __m128i __count)
2315	{
2316	return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2317	}
2318
2319	/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2320	/// \a imm bytes, shifting in zero bytes, and returns the result. If
2321	/// \a imm is greater than 15, the returned result is all zeroes.
2322	///
2323	/// \headerfile <immintrin.h>
2324	///
2325	/// \code
2326	/// __m256i _mm256_srli_si256(__m256i a, const int imm);
2327	/// \endcode
2328	///
2329	/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2330	///
2331	/// \param a
2332	/// A 256-bit integer vector to be shifted.
2333	/// \param imm
2334	/// An unsigned immediate value specifying the shift count (in bytes).
2335	/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                             \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                              \
      (__m256i)(a), /* vector to shift */                                     \
      (int)(imm)    /* byte count; documented as an immediate */              \
      ))
2338
2339	/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2340	/// \a imm bytes, shifting in zero bytes, and returns the result. If
2341	/// \a imm is greater than 15, the returned result is all zeroes.
2342	///
2343	/// \headerfile <immintrin.h>
2344	///
2345	/// \code
2346	/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2347	/// \endcode
2348	///
2349	/// This intrinsic corresponds to the \c VPSRLDQ instruction.
2350	///
2351	/// \param a
2352	/// A 256-bit integer vector to be shifted.
2353	/// \param imm
2354	/// An unsigned immediate value specifying the shift count (in bytes).
2355	/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm)                                           \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift(                              \
      (__m256i)(a), /* vector to shift */                                     \
      (int)(imm)    /* byte count; documented as an immediate */              \
      ))
2358
2359	/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2360	/// right by \a __count bits, shifting in zero bits, and returns the result.
2361	/// If \a __count is greater than 15, the returned result is all zeroes.
2362	///
2363	/// \headerfile <immintrin.h>
2364	///
2365	/// This intrinsic corresponds to the \c VPSRLW instruction.
2366	///
2367	/// \param __a
2368	/// A 256-bit vector of [16 x i16] to be shifted.
2369	/// \param __count
2370	/// An unsigned integer value specifying the shift count (in bits).
2371	/// \returns A 256-bit vector of [16 x i16] containing the result.
2372	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373	_mm256_srli_epi16(__m256i __a, int __count)
2374	{
2375	return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2376	}
2377
2378	/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2379	/// right by the number of bits given in the lower 64 bits of \a __count,
2380	/// shifting in zero bits, and returns the result. If \a __count is greater
2381	/// than 15, the returned result is all zeroes.
2382	///
2383	/// \headerfile <immintrin.h>
2384	///
2385	/// This intrinsic corresponds to the \c VPSRLW instruction.
2386	///
2387	/// \param __a
2388	/// A 256-bit vector of [16 x i16] to be shifted.
2389	/// \param __count
2390	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2391	/// shift count (in bits). The upper element is ignored.
2392	/// \returns A 256-bit vector of [16 x i16] containing the result.
2393	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2394	_mm256_srl_epi16(__m256i __a, __m128i __count)
2395	{
2396	return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2397	}
2398
2399	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2400	/// right by \a __count bits, shifting in zero bits, and returns the result.
2401	/// If \a __count is greater than 31, the returned result is all zeroes.
2402	///
2403	/// \headerfile <immintrin.h>
2404	///
2405	/// This intrinsic corresponds to the \c VPSRLD instruction.
2406	///
2407	/// \param __a
2408	/// A 256-bit vector of [8 x i32] to be shifted.
2409	/// \param __count
2410	/// An unsigned integer value specifying the shift count (in bits).
2411	/// \returns A 256-bit vector of [8 x i32] containing the result.
2412	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413	_mm256_srli_epi32(__m256i __a, int __count)
2414	{
2415	return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2416	}
2417
2418	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2419	/// right by the number of bits given in the lower 64 bits of \a __count,
2420	/// shifting in zero bits, and returns the result. If \a __count is greater
2421	/// than 31, the returned result is all zeroes.
2422	///
2423	/// \headerfile <immintrin.h>
2424	///
2425	/// This intrinsic corresponds to the \c VPSRLD instruction.
2426	///
2427	/// \param __a
2428	/// A 256-bit vector of [8 x i32] to be shifted.
2429	/// \param __count
2430	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2431	/// shift count (in bits). The upper element is ignored.
2432	/// \returns A 256-bit vector of [8 x i32] containing the result.
2433	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2434	_mm256_srl_epi32(__m256i __a, __m128i __count)
2435	{
2436	return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2437	}
2438
2439	/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2440	/// right by \a __count bits, shifting in zero bits, and returns the result.
2441	/// If \a __count is greater than 63, the returned result is all zeroes.
2442	///
2443	/// \headerfile <immintrin.h>
2444	///
2445	/// This intrinsic corresponds to the \c VPSRLQ instruction.
2446	///
2447	/// \param __a
2448	/// A 256-bit vector of [4 x i64] to be shifted.
2449	/// \param __count
2450	/// An unsigned integer value specifying the shift count (in bits).
2451	/// \returns A 256-bit vector of [4 x i64] containing the result.
2452	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453	_mm256_srli_epi64(__m256i __a, int __count)
2454	{
2455	return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2456	}
2457
2458	/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2459	/// right by the number of bits given in the lower 64 bits of \a __count,
2460	/// shifting in zero bits, and returns the result. If \a __count is greater
2461	/// than 63, the returned result is all zeroes.
2462	///
2463	/// \headerfile <immintrin.h>
2464	///
2465	/// This intrinsic corresponds to the \c VPSRLQ instruction.
2466	///
2467	/// \param __a
2468	/// A 256-bit vector of [4 x i64] to be shifted.
2469	/// \param __count
2470	/// A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2471	/// shift count (in bits). The upper element is ignored.
2472	/// \returns A 256-bit vector of [4 x i64] containing the result.
2473	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2474	_mm256_srl_epi64(__m256i __a, __m128i __count)
2475	{
2476	return __builtin_ia32_psrlq256((__v4di)__a, __count);
2477	}
2478
2479	/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2480	/// vectors. Returns the lower 8 bits of each difference in the
2481	/// corresponding byte of the 256-bit integer vector result (overflow is
2482	/// ignored).
2483	///
2484	/// \code{.operation}
2485	/// FOR i := 0 TO 31
2486	/// j := i*8
2487	/// result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2488	/// ENDFOR
2489	/// \endcode
2490	///
2491	/// \headerfile <immintrin.h>
2492	///
2493	/// This intrinsic corresponds to the \c VPSUBB instruction.
2494	///
2495	/// \param __a
2496	/// A 256-bit integer vector containing the minuends.
2497	/// \param __b
2498	/// A 256-bit integer vector containing the subtrahends.
2499	/// \returns A 256-bit integer vector containing the differences.
2500	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2501	_mm256_sub_epi8(__m256i __a, __m256i __b)
2502	{
2503	return (__m256i)((__v32qu)__a - (__v32qu)__b);
2504	}
2505
2506	/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2507	/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2508	/// the corresponding element of the [16 x i16] result (overflow is
2509	/// ignored).
2510	///
2511	/// \code{.operation}
2512	/// FOR i := 0 TO 15
2513	/// j := i*16
2514	/// result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2515	/// ENDFOR
2516	/// \endcode
2517	///
2518	/// \headerfile <immintrin.h>
2519	///
2520	/// This intrinsic corresponds to the \c VPSUBW instruction.
2521	///
2522	/// \param __a
2523	/// A 256-bit vector of [16 x i16] containing the minuends.
2524	/// \param __b
2525	/// A 256-bit vector of [16 x i16] containing the subtrahends.
2526	/// \returns A 256-bit vector of [16 x i16] containing the differences.
2527	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2528	_mm256_sub_epi16(__m256i __a, __m256i __b)
2529	{
2530	return (__m256i)((__v16hu)__a - (__v16hu)__b);
2531	}
2532
2533	/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2534	/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2535	/// the corresponding element of the [8 x i32] result (overflow is ignored).
2536	///
2537	/// \code{.operation}
2538	/// FOR i := 0 TO 7
2539	/// j := i*32
2540	/// result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2541	/// ENDFOR
2542	/// \endcode
2543	///
2544	/// \headerfile <immintrin.h>
2545	///
2546	/// This intrinsic corresponds to the \c VPSUBD instruction.
2547	///
2548	/// \param __a
2549	/// A 256-bit vector of [8 x i32] containing the minuends.
2550	/// \param __b
2551	/// A 256-bit vector of [8 x i32] containing the subtrahends.
2552	/// \returns A 256-bit vector of [8 x i32] containing the differences.
2553	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2554	_mm256_sub_epi32(__m256i __a, __m256i __b)
2555	{
2556	return (__m256i)((__v8su)__a - (__v8su)__b);
2557	}
2558
2559	/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2560	/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2561	/// the corresponding element of the [4 x i64] result (overflow is ignored).
2562	///
2563	/// \code{.operation}
2564	/// FOR i := 0 TO 3
2565	/// j := i*64
2566	/// result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2567	/// ENDFOR
2568	/// \endcode
2569	///
2570	/// \headerfile <immintrin.h>
2571	///
2572	/// This intrinsic corresponds to the \c VPSUBQ instruction.
2573	///
2574	/// \param __a
2575	/// A 256-bit vector of [4 x i64] containing the minuends.
2576	/// \param __b
2577	/// A 256-bit vector of [4 x i64] containing the subtrahends.
2578	/// \returns A 256-bit vector of [4 x i64] containing the differences.
2579	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2580	_mm256_sub_epi64(__m256i __a, __m256i __b)
2581	{
2582	return (__m256i)((__v4du)__a - (__v4du)__b);
2583	}
2584
2585	/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2586	/// vectors using signed saturation, and returns each differences in the
2587	/// corresponding byte of the 256-bit integer vector result.
2588	///
2589	/// \code{.operation}
2590	/// FOR i := 0 TO 31
2591	/// j := i*8
2592	/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2593	/// ENDFOR
2594	/// \endcode
2595	///
2596	/// \headerfile <immintrin.h>
2597	///
2598	/// This intrinsic corresponds to the \c VPSUBSB instruction.
2599	///
2600	/// \param __a
2601	/// A 256-bit integer vector containing the minuends.
2602	/// \param __b
2603	/// A 256-bit integer vector containing the subtrahends.
2604	/// \returns A 256-bit integer vector containing the differences.
2605	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2606	_mm256_subs_epi8(__m256i __a, __m256i __b)
2607	{
2608	return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2609	}
2610
2611	/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2612	/// vectors of [16 x i16] using signed saturation, and returns each
2613	/// difference in the corresponding element of the [16 x i16] result.
2614	///
2615	/// \code{.operation}
2616	/// FOR i := 0 TO 15
2617	/// j := i*16
2618	/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2619	/// ENDFOR
2620	/// \endcode
2621	///
2622	/// \headerfile <immintrin.h>
2623	///
2624	/// This intrinsic corresponds to the \c VPSUBSW instruction.
2625	///
2626	/// \param __a
2627	/// A 256-bit vector of [16 x i16] containing the minuends.
2628	/// \param __b
2629	/// A 256-bit vector of [16 x i16] containing the subtrahends.
2630	/// \returns A 256-bit vector of [16 x i16] containing the differences.
2631	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2632	_mm256_subs_epi16(__m256i __a, __m256i __b)
2633	{
2634	return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2635	}
2636
2637	/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2638	/// vectors using unsigned saturation, and returns each difference in the
2639	/// corresponding byte of the 256-bit integer vector result. For each byte,
2640	/// computes <c> result = __a - __b </c>.
2641	///
2642	/// \code{.operation}
2643	/// FOR i := 0 TO 31
2644	/// j := i*8
2645	/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2646	/// ENDFOR
2647	/// \endcode
2648	///
2649	/// \headerfile <immintrin.h>
2650	///
2651	/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2652	///
2653	/// \param __a
2654	/// A 256-bit integer vector containing the minuends.
2655	/// \param __b
2656	/// A 256-bit integer vector containing the subtrahends.
2657	/// \returns A 256-bit integer vector containing the differences.
2658	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2659	_mm256_subs_epu8(__m256i __a, __m256i __b)
2660	{
2661	return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2662	}
2663
2664	/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2665	/// vectors of [16 x i16] using unsigned saturation, and returns each
2666	/// difference in the corresponding element of the [16 x i16] result.
2667	///
2668	/// \code{.operation}
2669	/// FOR i := 0 TO 15
2670	/// j := i*16
2671	/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2672	/// ENDFOR
2673	/// \endcode
2674	///
2675	/// \headerfile <immintrin.h>
2676	///
2677	/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2678	///
2679	/// \param __a
2680	/// A 256-bit vector of [16 x i16] containing the minuends.
2681	/// \param __b
2682	/// A 256-bit vector of [16 x i16] containing the subtrahends.
2683	/// \returns A 256-bit vector of [16 x i16] containing the differences.
2684	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2685	_mm256_subs_epu16(__m256i __a, __m256i __b)
2686	{
2687	return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2688	}
2689
2690	/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2691	/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2692	/// uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2693	/// input; other bits in these parameters are ignored.
2694	///
2695	/// \code{.operation}
2696	/// result[7:0] := __a[71:64]
2697	/// result[15:8] := __b[71:64]
2698	/// result[23:16] := __a[79:72]
2699	/// result[31:24] := __b[79:72]
2700	/// . . .
2701	/// result[127:120] := __b[127:120]
2702	/// result[135:128] := __a[199:192]
2703	/// . . .
2704	/// result[255:248] := __b[255:248]
2705	/// \endcode
2706	///
2707	/// \headerfile <immintrin.h>
2708	///
2709	/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2710	///
2711	/// \param __a
2712	/// A 256-bit integer vector used as the source for the even-numbered bytes
2713	/// of the result.
2714	/// \param __b
2715	/// A 256-bit integer vector used as the source for the odd-numbered bytes
2716	/// of the result.
2717	/// \returns A 256-bit integer vector containing the result.
2718	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2719	_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2720	{
2721	return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2722	}
2723
2724	/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2725	/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2726	/// vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2727	/// 128-bit half of \a __a and \a __b as input; other bits in these
2728	/// parameters are ignored.
2729	///
2730	/// \code{.operation}
2731	/// result[15:0] := __a[79:64]
2732	/// result[31:16] := __b[79:64]
2733	/// result[47:32] := __a[95:80]
2734	/// result[63:48] := __b[95:80]
2735	/// . . .
2736	/// result[127:112] := __b[127:112]
2737	/// result[143:128] := __a[211:196]
2738	/// . . .
2739	/// result[255:240] := __b[255:240]
2740	/// \endcode
2741	///
2742	/// \headerfile <immintrin.h>
2743	///
2744	/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2745	///
2746	/// \param __a
2747	/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2748	/// elements of the result.
2749	/// \param __b
2750	/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2751	/// elements of the result.
2752	/// \returns A 256-bit vector of [16 x i16] containing the result.
2753	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2754	_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2755	{
2756	return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2757	}
2758
2759	/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2760	/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2761	/// of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2762	/// of \a __a and \a __b as input; other bits in these parameters are
2763	/// ignored.
2764	///
2765	/// \code{.operation}
2766	/// result[31:0] := __a[95:64]
2767	/// result[63:32] := __b[95:64]
2768	/// result[95:64] := __a[127:96]
2769	/// result[127:96] := __b[127:96]
2770	/// result[159:128] := __a[223:192]
2771	/// result[191:160] := __b[223:192]
2772	/// result[223:192] := __a[255:224]
2773	/// result[255:224] := __b[255:224]
2774	/// \endcode
2775	///
2776	/// \headerfile <immintrin.h>
2777	///
2778	/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2779	///
2780	/// \param __a
2781	/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2782	/// elements of the result.
2783	/// \param __b
2784	/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2785	/// elements of the result.
2786	/// \returns A 256-bit vector of [8 x i32] containing the result.
2787	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2788	_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2789	{
2790	return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2791	}
2792
2793	/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2794	/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2795	/// of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2796	/// of \a __a and \a __b as input; other bits in these parameters are
2797	/// ignored.
2798	///
2799	/// \code{.operation}
2800	/// result[63:0] := __a[127:64]
2801	/// result[127:64] := __b[127:64]
2802	/// result[191:128] := __a[255:192]
2803	/// result[255:192] := __b[255:192]
2804	/// \endcode
2805	///
2806	/// \headerfile <immintrin.h>
2807	///
2808	/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2809	///
2810	/// \param __a
2811	/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2812	/// elements of the result.
2813	/// \param __b
2814	/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2815	/// elements of the result.
2816	/// \returns A 256-bit vector of [4 x i64] containing the result.
2817	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2818	_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2819	{
2820	return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2821	}
2822
2823	/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2824	/// vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2825	/// uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2826	/// input; other bits in these parameters are ignored.
2827	///
2828	/// \code{.operation}
2829	/// result[7:0] := __a[7:0]
2830	/// result[15:8] := __b[7:0]
2831	/// result[23:16] := __a[15:8]
2832	/// result[31:24] := __b[15:8]
2833	/// . . .
2834	/// result[127:120] := __b[63:56]
2835	/// result[135:128] := __a[135:128]
2836	/// . . .
2837	/// result[255:248] := __b[191:184]
2838	/// \endcode
2839	///
2840	/// \headerfile <immintrin.h>
2841	///
2842	/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2843	///
2844	/// \param __a
2845	/// A 256-bit integer vector used as the source for the even-numbered bytes
2846	/// of the result.
2847	/// \param __b
2848	/// A 256-bit integer vector used as the source for the odd-numbered bytes
2849	/// of the result.
2850	/// \returns A 256-bit integer vector containing the result.
2851	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2852	_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2853	{
2854	return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2855	}
2856
2857	/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2858	/// of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2859	/// vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2860	/// 128-bit half of \a __a and \a __b as input; other bits in these
2861	/// parameters are ignored.
2862	///
2863	/// \code{.operation}
2864	/// result[15:0] := __a[15:0]
2865	/// result[31:16] := __b[15:0]
2866	/// result[47:32] := __a[31:16]
2867	/// result[63:48] := __b[31:16]
2868	/// . . .
2869	/// result[127:112] := __b[63:48]
2870	/// result[143:128] := __a[143:128]
2871	/// . . .
2872	/// result[255:239] := __b[191:176]
2873	/// \endcode
2874	///
2875	/// \headerfile <immintrin.h>
2876	///
2877	/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2878	///
2879	/// \param __a
2880	/// A 256-bit vector of [16 x i16] used as the source for the even-numbered
2881	/// elements of the result.
2882	/// \param __b
2883	/// A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2884	/// elements of the result.
2885	/// \returns A 256-bit vector of [16 x i16] containing the result.
2886	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2887	_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2888	{
2889	return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2890	}
2891
2892	/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2893	/// of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2894	/// of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2895	/// of \a __a and \a __b as input; other bits in these parameters are
2896	/// ignored.
2897	///
2898	/// \code{.operation}
2899	/// result[31:0] := __a[31:0]
2900	/// result[63:32] := __b[31:0]
2901	/// result[95:64] := __a[63:32]
2902	/// result[127:96] := __b[63:32]
2903	/// result[159:128] := __a[159:128]
2904	/// result[191:160] := __b[159:128]
2905	/// result[223:192] := __a[191:160]
2906	/// result[255:224] := __b[191:190]
2907	/// \endcode
2908	///
2909	/// \headerfile <immintrin.h>
2910	///
2911	/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2912	///
2913	/// \param __a
2914	/// A 256-bit vector of [8 x i32] used as the source for the even-numbered
2915	/// elements of the result.
2916	/// \param __b
2917	/// A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2918	/// elements of the result.
2919	/// \returns A 256-bit vector of [8 x i32] containing the result.
2920	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2921	_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2922	{
2923	return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2924	}
2925
2926	/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2927	/// of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2928	/// of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2929	/// of \a __a and \a __b as input; other bits in these parameters are
2930	/// ignored.
2931	///
2932	/// \code{.operation}
2933	/// result[63:0] := __a[63:0]
2934	/// result[127:64] := __b[63:0]
2935	/// result[191:128] := __a[191:128]
2936	/// result[255:192] := __b[191:128]
2937	/// \endcode
2938	///
2939	/// \headerfile <immintrin.h>
2940	///
2941	/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2942	///
2943	/// \param __a
2944	/// A 256-bit vector of [4 x i64] used as the source for the even-numbered
2945	/// elements of the result.
2946	/// \param __b
2947	/// A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2948	/// elements of the result.
2949	/// \returns A 256-bit vector of [4 x i64] containing the result.
2950	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2951	_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2952	{
2953	return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2954	}
2955
2956	/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2957	/// \a __b.
2958	///
2959	/// \headerfile <immintrin.h>
2960	///
2961	/// This intrinsic corresponds to the \c VPXOR instruction.
2962	///
2963	/// \param __a
2964	/// A 256-bit integer vector.
2965	/// \param __b
2966	/// A 256-bit integer vector.
2967	/// \returns A 256-bit integer vector containing the result.
2968	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2969	_mm256_xor_si256(__m256i __a, __m256i __b)
2970	{
2971	return (__m256i)((__v4du)__a ^ (__v4du)__b);
2972	}
2973
2974	/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2975	/// memory hint and returns the vector. \a __V must be aligned on a 32-byte
2976	/// boundary.
2977	///
2978	/// \headerfile <immintrin.h>
2979	///
2980	/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2981	///
2982	/// \param __V
2983	/// A pointer to the 32-byte aligned memory containing the vector to load.
2984	/// \returns A 256-bit integer vector loaded from memory.
2985	static __inline__ __m256i __DEFAULT_FN_ATTRS256
2986	_mm256_stream_load_si256(const void *__V)
2987	{
2988	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2989	return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2990	}
2991
2992	/// Broadcasts the 32-bit floating-point value from the low element of the
2993	/// 128-bit vector of [4 x float] in \a __X to all elements of the result's
2994	/// 128-bit vector of [4 x float].
2995	///
2996	/// \headerfile <immintrin.h>
2997	///
2998	/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2999	///
3000	/// \param __X
3001	/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3002	/// \returns A 128-bit vector of [4 x float] containing the result.
3003	static __inline__ __m128 __DEFAULT_FN_ATTRS128
3004	_mm_broadcastss_ps(__m128 __X)
3005	{
3006	return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3007	}
3008
3009	/// Broadcasts the 64-bit floating-point value from the low element of the
3010	/// 128-bit vector of [2 x double] in \a __a to both elements of the
3011	/// result's 128-bit vector of [2 x double].
3012	///
3013	/// \headerfile <immintrin.h>
3014	///
3015	/// This intrinsic corresponds to the \c MOVDDUP instruction.
3016	///
3017	/// \param __a
3018	/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3019	/// \returns A 128-bit vector of [2 x double] containing the result.
3020	static __inline__ __m128d __DEFAULT_FN_ATTRS128
3021	_mm_broadcastsd_pd(__m128d __a)
3022	{
3023	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3024	}
3025
3026	/// Broadcasts the 32-bit floating-point value from the low element of the
3027	/// 128-bit vector of [4 x float] in \a __X to all elements of the
3028	/// result's 256-bit vector of [8 x float].
3029	///
3030	/// \headerfile <immintrin.h>
3031	///
3032	/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3033	///
3034	/// \param __X
3035	/// A 128-bit vector of [4 x float] whose low element will be broadcast.
3036	/// \returns A 256-bit vector of [8 x float] containing the result.
3037	static __inline__ __m256 __DEFAULT_FN_ATTRS256
3038	_mm256_broadcastss_ps(__m128 __X)
3039	{
3040	return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3041	}
3042
3043	/// Broadcasts the 64-bit floating-point value from the low element of the
3044	/// 128-bit vector of [2 x double] in \a __X to all elements of the
3045	/// result's 256-bit vector of [4 x double].
3046	///
3047	/// \headerfile <immintrin.h>
3048	///
3049	/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3050	///
3051	/// \param __X
3052	/// A 128-bit vector of [2 x double] whose low element will be broadcast.
3053	/// \returns A 256-bit vector of [4 x double] containing the result.
3054	static __inline__ __m256d __DEFAULT_FN_ATTRS256
3055	_mm256_broadcastsd_pd(__m128d __X)
3056	{
3057	return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3058	}
3059
3060	/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3061	/// upper halves of the 256-bit result.
3062	///
3063	/// \headerfile <immintrin.h>
3064	///
3065	/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3066	///
3067	/// \param __X
3068	/// A 128-bit integer vector to be broadcast.
3069	/// \returns A 256-bit integer vector containing the result.
3070	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3071	_mm256_broadcastsi128_si256(__m128i __X)
3072	{
3073	return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3074	}
3075
3076	#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3077
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3114
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))
3151
3152	/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3153	/// bytes of the 256-bit result.
3154	///
3155	/// \headerfile <immintrin.h>
3156	///
3157	/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3158	///
3159	/// \param __X
3160	/// A 128-bit integer vector whose low byte will be broadcast.
3161	/// \returns A 256-bit integer vector containing the result.
3162	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3163	_mm256_broadcastb_epi8(__m128i __X)
3164	{
3165	return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3166	}
3167
3168	/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3169	/// to all elements of the result's 256-bit vector of [16 x i16].
3170	///
3171	/// \headerfile <immintrin.h>
3172	///
3173	/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3174	///
3175	/// \param __X
3176	/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3177	/// \returns A 256-bit vector of [16 x i16] containing the result.
3178	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3179	_mm256_broadcastw_epi16(__m128i __X)
3180	{
3181	return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3182	}
3183
3184	/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3185	/// to all elements of the result's 256-bit vector of [8 x i32].
3186	///
3187	/// \headerfile <immintrin.h>
3188	///
3189	/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3190	///
3191	/// \param __X
3192	/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3193	/// \returns A 256-bit vector of [8 x i32] containing the result.
3194	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3195	_mm256_broadcastd_epi32(__m128i __X)
3196	{
3197	return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3198	}
3199
3200	/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3201	/// to all elements of the result's 256-bit vector of [4 x i64].
3202	///
3203	/// \headerfile <immintrin.h>
3204	///
3205	/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3206	///
3207	/// \param __X
3208	/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3209	/// \returns A 256-bit vector of [4 x i64] containing the result.
3210	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3211	_mm256_broadcastq_epi64(__m128i __X)
3212	{
3213	return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3214	}
3215
3216	/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3217	/// bytes of the 128-bit result.
3218	///
3219	/// \headerfile <immintrin.h>
3220	///
3221	/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3222	///
3223	/// \param __X
3224	/// A 128-bit integer vector whose low byte will be broadcast.
3225	/// \returns A 128-bit integer vector containing the result.
3226	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3227	_mm_broadcastb_epi8(__m128i __X)
3228	{
3229	return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3230	}
3231
3232	/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3233	/// \a __X to all elements of the result's 128-bit vector of [8 x i16].
3234	///
3235	/// \headerfile <immintrin.h>
3236	///
3237	/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3238	///
3239	/// \param __X
3240	/// A 128-bit vector of [8 x i16] whose low element will be broadcast.
3241	/// \returns A 128-bit vector of [8 x i16] containing the result.
3242	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3243	_mm_broadcastw_epi16(__m128i __X)
3244	{
3245	return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3246	}
3247
3248	/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3249	/// to all elements of the result's vector of [4 x i32].
3250	///
3251	/// \headerfile <immintrin.h>
3252	///
3253	/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3254	///
3255	/// \param __X
3256	/// A 128-bit vector of [4 x i32] whose low element will be broadcast.
3257	/// \returns A 128-bit vector of [4 x i32] containing the result.
3258	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3259	_mm_broadcastd_epi32(__m128i __X)
3260	{
3261	return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3262	}
3263
3264	/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3265	/// to both elements of the result's 128-bit vector of [2 x i64].
3266	///
3267	/// \headerfile <immintrin.h>
3268	///
3269	/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3270	///
3271	/// \param __X
3272	/// A 128-bit vector of [2 x i64] whose low element will be broadcast.
3273	/// \returns A 128-bit vector of [2 x i64] containing the result.
3274	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3275	_mm_broadcastq_epi64(__m128i __X)
3276	{
3277	return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3278	}
3279
3280	/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3281	/// 256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3282	/// elements of the 256-bit vector of [8 x i32] in \a __b.
3283	///
3284	/// \code{.operation}
3285	/// FOR i := 0 TO 7
3286	/// j := i*32
3287	/// k := __b[j+2:j] * 32
3288	/// result[j+31:j] := __a[k+31:k]
3289	/// ENDFOR
3290	/// \endcode
3291	///
3292	/// \headerfile <immintrin.h>
3293	///
3294	/// This intrinsic corresponds to the \c VPERMD instruction.
3295	///
3296	/// \param __a
3297	/// A 256-bit vector of [8 x i32] containing the source values.
3298	/// \param __b
3299	/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3300	/// \a __a.
3301	/// \returns A 256-bit vector of [8 x i32] containing the result.
3302	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3303	_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3304	{
3305	return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3306	}
3307
/// Builds a 256-bit vector of [4 x double] by copying elements of the 256-bit
///    vector of [4 x double] in \a V, choosing the source element for each
///    result element from the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] supplying the source values.
/// \param M
///    An immediate 8-bit value selecting which elements to copy from \a V.
///    \a M[1:0] is the index in \a V for element 0 of the result,
///    \a M[3:2] is the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the permuted elements.
#define _mm256_permute4x64_pd(V, M) \
  ((__m256d)__builtin_ia32_permdf256( \
      (__v4df)(__m256d)(V), \
      (int)(M)))
3337
3338	/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3339	/// the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3340	/// the elements of the 256-bit vector of [8 x i32] in \a __b.
3341	///
3342	/// \code{.operation}
3343	/// FOR i := 0 TO 7
3344	/// j := i*32
3345	/// k := __b[j+2:j] * 32
3346	/// result[j+31:j] := __a[k+31:k]
3347	/// ENDFOR
3348	/// \endcode
3349	///
3350	/// \headerfile <immintrin.h>
3351	///
3352	/// This intrinsic corresponds to the \c VPERMPS instruction.
3353	///
3354	/// \param __a
3355	/// A 256-bit vector of [8 x float] containing the source values.
3356	/// \param __b
3357	/// A 256-bit vector of [8 x i32] containing indexes of values to use from
3358	/// \a __a.
3359	/// \returns A 256-bit vector of [8 x float] containing the result.
3360	static __inline__ __m256 __DEFAULT_FN_ATTRS256
3361	_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3362	{
3363	return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3364	}
3365
/// Builds a 256-bit vector of [4 x i64] by copying elements of the 256-bit
///    vector of [4 x i64] in \a V, choosing the source element for each
///    result element from the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] supplying the source values.
/// \param M
///    An immediate 8-bit value selecting which elements to copy from \a V.
///    \a M[1:0] is the index in \a V for element 0 of the result,
///    \a M[3:2] is the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the permuted elements.
#define _mm256_permute4x64_epi64(V, M) \
  ((__m256i)__builtin_ia32_permdi256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
3395
/// Fills each 128-bit half of the 256-bit result either with zero or with
///    one of the four 128-bit halves available in the 256-bit vectors \a V1
///    and \a V2, under control of the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector supplying source values.
/// \param V2
///    A 256-bit integer vector supplying source values.
/// \param M
///    An immediate value describing how to form the result. Bits [3:0]
///    control the lower half of the result and bits [7:4] control the upper
///    half. Within each 4-bit control value, bit 3 set to 1 yields zero;
///    otherwise bits [1:0] pick the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_permti256( \
      (__m256i)(V1), \
      (__m256i)(V2), \
      (int)(M)))
3441
/// Extracts one 128-bit half of the 256-bit vector \a V. If bit 0 of the
///    immediate \a M is zero, the lower half of \a V is extracted; otherwise
///    the upper half is extracted.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector supplying the source values.
/// \param M
///    An immediate value selecting which half of \a V to extract.
/// \returns A 128-bit integer vector containing the extracted half.
#define _mm256_extracti128_si256(V, M) \
  ((__m128i)__builtin_ia32_extract128i256( \
      (__v4di)(__m256i)(V), \
      (int)(M)))
3461
/// Produces a copy of the 256-bit vector \a V1 in which one 128-bit half has
///    been replaced by the 128-bit vector \a V2. If bit 0 of the immediate
///    \a M is zero, the lower half is replaced; otherwise the upper half is
///    replaced.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector supplying the base value.
/// \param V2
///    A 128-bit integer vector supplying the half to insert.
/// \param M
///    An immediate value selecting where \a V2 is placed in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_insert128i256( \
      (__v4di)(__m256i)(V1), \
      (__v2di)(__m128i)(V2), \
      (int)(M)))
3485
3486	/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3487	/// the most significant bit of the corresponding element in the mask
3488	/// \a __M is set; otherwise, sets that element of the result to zero.
3489	/// Returns the 256-bit [8 x i32] result.
3490	///
3491	/// \code{.operation}
3492	/// FOR i := 0 TO 7
3493	/// j := i*32
3494	/// IF __M[j+31] == 1
3495	/// result[j+31:j] := Load32(__X+(i*4))
3496	/// ELSE
3497	/// result[j+31:j] := 0
3498	/// FI
3499	/// ENDFOR
3500	/// \endcode
3501	///
3502	/// \headerfile <immintrin.h>
3503	///
3504	/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3505	///
3506	/// \param __X
3507	/// A pointer to the memory used for loading values.
3508	/// \param __M
3509	/// A 256-bit vector of [8 x i32] containing the mask bits.
3510	/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3511	/// elements.
3512	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3513	_mm256_maskload_epi32(int const *__X, __m256i __M)
3514	{
3515	return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3516	}
3517
3518	/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3519	/// the most significant bit of the corresponding element in the mask
3520	/// \a __M is set; otherwise, sets that element of the result to zero.
3521	/// Returns the 256-bit [4 x i64] result.
3522	///
3523	/// \code{.operation}
3524	/// FOR i := 0 TO 3
3525	/// j := i*64
3526	/// IF __M[j+63] == 1
3527	/// result[j+63:j] := Load64(__X+(i*8))
3528	/// ELSE
3529	/// result[j+63:j] := 0
3530	/// FI
3531	/// ENDFOR
3532	/// \endcode
3533	///
3534	/// \headerfile <immintrin.h>
3535	///
3536	/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3537	///
3538	/// \param __X
3539	/// A pointer to the memory used for loading values.
3540	/// \param __M
3541	/// A 256-bit vector of [4 x i64] containing the mask bits.
3542	/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3543	/// elements.
3544	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3545	_mm256_maskload_epi64(long long const *__X, __m256i __M)
3546	{
3547	return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3548	}
3549
3550	/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3551	/// the most significant bit of the corresponding element in the mask
3552	/// \a __M is set; otherwise, sets that element of the result to zero.
3553	/// Returns the 128-bit [4 x i32] result.
3554	///
3555	/// \code{.operation}
3556	/// FOR i := 0 TO 3
3557	/// j := i*32
3558	/// IF __M[j+31] == 1
3559	/// result[j+31:j] := Load32(__X+(i*4))
3560	/// ELSE
3561	/// result[j+31:j] := 0
3562	/// FI
3563	/// ENDFOR
3564	/// \endcode
3565	///
3566	/// \headerfile <immintrin.h>
3567	///
3568	/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3569	///
3570	/// \param __X
3571	/// A pointer to the memory used for loading values.
3572	/// \param __M
3573	/// A 128-bit vector of [4 x i32] containing the mask bits.
3574	/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3575	/// elements.
3576	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3577	_mm_maskload_epi32(int const *__X, __m128i __M)
3578	{
3579	return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3580	}
3581
3582	/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3583	/// the most significant bit of the corresponding element in the mask
3584	/// \a __M is set; otherwise, sets that element of the result to zero.
3585	/// Returns the 128-bit [2 x i64] result.
3586	///
3587	/// \code{.operation}
3588	/// FOR i := 0 TO 1
3589	/// j := i*64
3590	/// IF __M[j+63] == 1
3591	/// result[j+63:j] := Load64(__X+(i*8))
3592	/// ELSE
3593	/// result[j+63:j] := 0
3594	/// FI
3595	/// ENDFOR
3596	/// \endcode
3597	///
3598	/// \headerfile <immintrin.h>
3599	///
3600	/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3601	///
3602	/// \param __X
3603	/// A pointer to the memory used for loading values.
3604	/// \param __M
3605	/// A 128-bit vector of [2 x i64] containing the mask bits.
3606	/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3607	/// elements.
3608	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3609	_mm_maskload_epi64(long long const *__X, __m128i __M)
3610	{
3611	return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3612	}
3613
3614	/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3615	/// of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3616	/// the corresponding element in the mask \a __M is set; otherwise, the
3617	/// memory element is unchanged.
3618	///
3619	/// \code{.operation}
3620	/// FOR i := 0 TO 7
3621	/// j := i*32
3622	/// IF __M[j+31] == 1
3623	/// Store32(__X+(i*4), __Y[j+31:j])
3624	/// FI
3625	/// ENDFOR
3626	/// \endcode
3627	///
3628	/// \headerfile <immintrin.h>
3629	///
3630	/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3631	///
3632	/// \param __X
3633	/// A pointer to the memory used for storing values.
3634	/// \param __M
3635	/// A 256-bit vector of [8 x i32] containing the mask bits.
3636	/// \param __Y
3637	/// A 256-bit vector of [8 x i32] containing the values to store.
3638	static __inline__ void __DEFAULT_FN_ATTRS256
3639	_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3640	{
3641	__builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3642	}
3643
3644	/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3645	/// of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3646	/// the corresponding element in the mask \a __M is set; otherwise, the
3647	/// memory element is unchanged.
3648	///
3649	/// \code{.operation}
3650	/// FOR i := 0 TO 3
3651	/// j := i*64
3652	/// IF __M[j+63] == 1
3653	/// Store64(__X+(i*8), __Y[j+63:j])
3654	/// FI
3655	/// ENDFOR
3656	/// \endcode
3657	///
3658	/// \headerfile <immintrin.h>
3659	///
3660	/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3661	///
3662	/// \param __X
3663	/// A pointer to the memory used for storing values.
3664	/// \param __M
3665	/// A 256-bit vector of [4 x i64] containing the mask bits.
3666	/// \param __Y
3667	/// A 256-bit vector of [4 x i64] containing the values to store.
3668	static __inline__ void __DEFAULT_FN_ATTRS256
3669	_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3670	{
3671	__builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3672	}
3673
3674	/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3675	/// of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3676	/// the corresponding element in the mask \a __M is set; otherwise, the
3677	/// memory element is unchanged.
3678	///
3679	/// \code{.operation}
3680	/// FOR i := 0 TO 3
3681	/// j := i*32
3682	/// IF __M[j+31] == 1
3683	/// Store32(__X+(i*4), __Y[j+31:j])
3684	/// FI
3685	/// ENDFOR
3686	/// \endcode
3687	///
3688	/// \headerfile <immintrin.h>
3689	///
3690	/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3691	///
3692	/// \param __X
3693	/// A pointer to the memory used for storing values.
3694	/// \param __M
3695	/// A 128-bit vector of [4 x i32] containing the mask bits.
3696	/// \param __Y
3697	/// A 128-bit vector of [4 x i32] containing the values to store.
3698	static __inline__ void __DEFAULT_FN_ATTRS128
3699	_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3700	{
3701	__builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3702	}
3703
3704	/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3705	/// of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3706	/// the corresponding element in the mask \a __M is set; otherwise, the
3707	/// memory element is unchanged.
3708	///
3709	/// \code{.operation}
3710	/// FOR i := 0 TO 1
3711	/// j := i*64
3712	/// IF __M[j+63] == 1
3713	/// Store64(__X+(i*8), __Y[j+63:j])
3714	/// FI
3715	/// ENDFOR
3716	/// \endcode
3717	///
3718	/// \headerfile <immintrin.h>
3719	///
3720	/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3721	///
3722	/// \param __X
3723	/// A pointer to the memory used for storing values.
3724	/// \param __M
3725	/// A 128-bit vector of [2 x i64] containing the mask bits.
3726	/// \param __Y
3727	/// A 128-bit vector of [2 x i64] containing the values to store.
3728	static __inline__ void __DEFAULT_FN_ATTRS128
3729	_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3730	{
3731	__builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3732	}
3733
3734	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3735	/// left by the number of bits given in the corresponding element of the
3736	/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3737	/// returns the result. If the shift count for any element is greater than
3738	/// 31, the result for that element is zero.
3739	///
3740	/// \headerfile <immintrin.h>
3741	///
3742	/// This intrinsic corresponds to the \c VPSLLVD instruction.
3743	///
3744	/// \param __X
3745	/// A 256-bit vector of [8 x i32] to be shifted.
3746	/// \param __Y
3747	/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3748	/// bits).
3749	/// \returns A 256-bit vector of [8 x i32] containing the result.
3750	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3751	_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3752	{
3753	return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3754	}
3755
3756	/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3757	/// left by the number of bits given in the corresponding element of the
3758	/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3759	/// returns the result. If the shift count for any element is greater than
3760	/// 31, the result for that element is zero.
3761	///
3762	/// \headerfile <immintrin.h>
3763	///
3764	/// This intrinsic corresponds to the \c VPSLLVD instruction.
3765	///
3766	/// \param __X
3767	/// A 128-bit vector of [4 x i32] to be shifted.
3768	/// \param __Y
3769	/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3770	/// bits).
3771	/// \returns A 128-bit vector of [4 x i32] containing the result.
3772	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3773	_mm_sllv_epi32(__m128i __X, __m128i __Y)
3774	{
3775	return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3776	}
3777
3778	/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3779	/// left by the number of bits given in the corresponding element of the
3780	/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3781	/// returns the result. If the shift count for any element is greater than
3782	/// 63, the result for that element is zero.
3783	///
3784	/// \headerfile <immintrin.h>
3785	///
3786	/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3787	///
3788	/// \param __X
3789	/// A 256-bit vector of [4 x i64] to be shifted.
3790	/// \param __Y
3791	/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3792	/// bits).
3793	/// \returns A 256-bit vector of [4 x i64] containing the result.
3794	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3795	_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3796	{
3797	return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3798	}
3799
3800	/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3801	/// left by the number of bits given in the corresponding element of the
3802	/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3803	/// returns the result. If the shift count for any element is greater than
3804	/// 63, the result for that element is zero.
3805	///
3806	/// \headerfile <immintrin.h>
3807	///
3808	/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3809	///
3810	/// \param __X
3811	/// A 128-bit vector of [2 x i64] to be shifted.
3812	/// \param __Y
3813	/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3814	/// bits).
3815	/// \returns A 128-bit vector of [2 x i64] containing the result.
3816	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3817	_mm_sllv_epi64(__m128i __X, __m128i __Y)
3818	{
3819	return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3820	}
3821
3822	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3823	/// right by the number of bits given in the corresponding element of the
3824	/// 256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3825	/// returns the result. If the shift count for any element is greater than
3826	/// 31, the result for that element is 0 or -1 according to the sign bit
3827	/// for that element.
3828	///
3829	/// \headerfile <immintrin.h>
3830	///
3831	/// This intrinsic corresponds to the \c VPSRAVD instruction.
3832	///
3833	/// \param __X
3834	/// A 256-bit vector of [8 x i32] to be shifted.
3835	/// \param __Y
3836	/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3837	/// bits).
3838	/// \returns A 256-bit vector of [8 x i32] containing the result.
3839	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3840	_mm256_srav_epi32(__m256i __X, __m256i __Y)
3841	{
3842	return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3843	}
3844
3845	/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3846	/// right by the number of bits given in the corresponding element of the
3847	/// 128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3848	/// returns the result. If the shift count for any element is greater than
3849	/// 31, the result for that element is 0 or -1 according to the sign bit
3850	/// for that element.
3851	///
3852	/// \headerfile <immintrin.h>
3853	///
3854	/// This intrinsic corresponds to the \c VPSRAVD instruction.
3855	///
3856	/// \param __X
3857	/// A 128-bit vector of [4 x i32] to be shifted.
3858	/// \param __Y
3859	/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3860	/// bits).
3861	/// \returns A 128-bit vector of [4 x i32] containing the result.
3862	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3863	_mm_srav_epi32(__m128i __X, __m128i __Y)
3864	{
3865	return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3866	}
3867
3868	/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3869	/// right by the number of bits given in the corresponding element of the
3870	/// 256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3871	/// returns the result. If the shift count for any element is greater than
3872	/// 31, the result for that element is zero.
3873	///
3874	/// \headerfile <immintrin.h>
3875	///
3876	/// This intrinsic corresponds to the \c VPSRLVD instruction.
3877	///
3878	/// \param __X
3879	/// A 256-bit vector of [8 x i32] to be shifted.
3880	/// \param __Y
3881	/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3882	/// bits).
3883	/// \returns A 256-bit vector of [8 x i32] containing the result.
3884	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3885	_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3886	{
3887	return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3888	}
3889
3890	/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3891	/// right by the number of bits given in the corresponding element of the
3892	/// 128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3893	/// returns the result. If the shift count for any element is greater than
3894	/// 31, the result for that element is zero.
3895	///
3896	/// \headerfile <immintrin.h>
3897	///
3898	/// This intrinsic corresponds to the \c VPSRLVD instruction.
3899	///
3900	/// \param __X
3901	/// A 128-bit vector of [4 x i32] to be shifted.
3902	/// \param __Y
3903	/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3904	/// bits).
3905	/// \returns A 128-bit vector of [4 x i32] containing the result.
3906	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3907	_mm_srlv_epi32(__m128i __X, __m128i __Y)
3908	{
3909	return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3910	}
3911
3912	/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3913	/// right by the number of bits given in the corresponding element of the
3914	/// 128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3915	/// returns the result. If the shift count for any element is greater than
3916	/// 63, the result for that element is zero.
3917	///
3918	/// \headerfile <immintrin.h>
3919	///
3920	/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3921	///
3922	/// \param __X
3923	/// A 256-bit vector of [4 x i64] to be shifted.
3924	/// \param __Y
3925	/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3926	/// bits).
3927	/// \returns A 256-bit vector of [4 x i64] containing the result.
3928	static __inline__ __m256i __DEFAULT_FN_ATTRS256
3929	_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3930	{
3931	return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3932	}
3933
3934	/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3935	/// right by the number of bits given in the corresponding element of the
3936	/// 128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3937	/// returns the result. If the shift count for any element is greater than
3938	/// 63, the result for that element is zero.
3939	///
3940	/// \headerfile <immintrin.h>
3941	///
3942	/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3943	///
3944	/// \param __X
3945	/// A 128-bit vector of [2 x i64] to be shifted.
3946	/// \param __Y
3947	/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3948	/// bits).
3949	/// \returns A 128-bit vector of [2 x i64] containing the result.
3950	static __inline__ __m128i __DEFAULT_FN_ATTRS128
3951	_mm_srlv_epi64(__m128i __X, __m128i __Y)
3952	{
3953	return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3954	}
3955
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand \a a is a vector of doubles, so it is cast via
   __m128d (not __m128i) to agree with the documented prototype and with the
   other gather macros in this header. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4004
/// Conditionally gathers four 64-bit floating-point values. Each result
///    element comes either from the 256-bit vector of [4 x double] in \a a
///    or from memory \a m, addressed with a scaled index taken from the
///    128-bit vector of [4 x i32] in \a i. The 256-bit vector of
///    [4 x double] in \a mask selects the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the value for every element
///    whose mask bit is zero.
/// \param m
///    A pointer to the memory from which values are loaded.
/// \param i
///    A 128-bit vector of [4 x i32] holding signed indexes into \a m.
/// \param mask
///    A 256-bit vector of [4 x double] holding the mask. The most
///    significant bit of each element is that element's mask bit. When a
///    mask bit is zero the corresponding value from \a a is used; otherwise
///    the value is loaded from memory.
/// \param s
///    A literal constant scale factor applied to the indexes in \a i. Must
///    be 1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] containing the gathered values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), \
      (double const *)(m), \
      (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), \
      (s)))
4052
4053	/// Conditionally gathers two 64-bit floating-point values, either from the
4054	/// 128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4055	/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4056	/// of [2 x double] in \a mask determines the source for each element.
4057	///
4058	/// \code{.operation}
4059	/// FOR element := 0 to 1
4060	/// j := element*64
4061	/// k := element*64
4062	/// IF mask[j+63] == 0
4063	/// result[j+63:j] := a[j+63:j]
4064	/// ELSE
4065	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4066	/// FI
4067	/// ENDFOR
4068	/// \endcode
4069	///
4070	/// \headerfile <immintrin.h>
4071	///
4072	/// \code
4073	/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4074	/// __m128d mask, const int s);
4075	/// \endcode
4076	///
4077	/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4078	///
4079	/// \param a
4080	/// A 128-bit vector of [2 x double] used as the source when a mask bit is
4081	/// zero.
4082	/// \param m
4083	/// A pointer to the memory used for loading values.
4084	/// \param i
4085	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4086	/// \param mask
4087	/// A 128-bit vector of [2 x double] containing the mask. The most
4088	/// significant bit of each element in the mask vector is that element's
4089	/// mask bit. If a mask bit is zero, the corresponding value from vector
4090	/// \a a is copied to the result; otherwise the value is loaded from memory.
4091	/// \param s
4092	/// A literal constant scale factor for the indexes in \a i. Must be
4093	/// 1, 2, 4, or 8.
4094	/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4095	#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
4096	((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
4097	(double const *)(m), \
4098	(__v2di)(__m128i)(i), \
4099	(__v2df)(__m128d)(mask), (s)))
4100
4101	/// Conditionally gathers four 64-bit floating-point values, either from the
4102	/// 256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4103	/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4104	/// of [4 x double] in \a mask determines the source for each element.
4105	///
4106	/// \code{.operation}
4107	/// FOR element := 0 to 3
4108	/// j := element*64
4109	/// k := element*64
4110	/// IF mask[j+63] == 0
4111	/// result[j+63:j] := a[j+63:j]
4112	/// ELSE
4113	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4114	/// FI
4115	/// ENDFOR
4116	/// \endcode
4117	///
4118	/// \headerfile <immintrin.h>
4119	///
4120	/// \code
4121	/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4122	/// __m256d mask, const int s);
4123	/// \endcode
4124	///
4125	/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4126	///
4127	/// \param a
4128	/// A 256-bit vector of [4 x double] used as the source when a mask bit is
4129	/// zero.
4130	/// \param m
4131	/// A pointer to the memory used for loading values.
4132	/// \param i
4133	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4134	/// \param mask
4135	/// A 256-bit vector of [4 x double] containing the mask. The most
4136	/// significant bit of each element in the mask vector is that element's
4137	/// mask bit. If a mask bit is zero, the corresponding value from vector
4138	/// \a a is copied to the result; otherwise the value is loaded from memory.
4139	/// \param s
4140	/// A literal constant scale factor for the indexes in \a i. Must be
4141	/// 1, 2, 4, or 8.
4142	/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4143	#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
4144	((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
4145	(double const *)(m), \
4146	(__v4di)(__m256i)(i), \
4147	(__v4df)(__m256d)(mask), (s)))
4148
4149	/// Conditionally gathers four 32-bit floating-point values, either from the
4150	/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4151	/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4152	/// of [4 x float] in \a mask determines the source for each element.
4153	///
4154	/// \code{.operation}
4155	/// FOR element := 0 to 3
4156	/// j := element*32
4157	/// k := element*32
4158	/// IF mask[j+31] == 0
4159	/// result[j+31:j] := a[j+31:j]
4160	/// ELSE
4161	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4162	/// FI
4163	/// ENDFOR
4164	/// \endcode
4165	///
4166	/// \headerfile <immintrin.h>
4167	///
4168	/// \code
4169	/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4170	/// __m128 mask, const int s);
4171	/// \endcode
4172	///
4173	/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4174	///
4175	/// \param a
4176	/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4177	/// zero.
4178	/// \param m
4179	/// A pointer to the memory used for loading values.
4180	/// \param i
4181	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4182	/// \param mask
4183	/// A 128-bit vector of [4 x float] containing the mask. The most
4184	/// significant bit of each element in the mask vector is that element's
4185	/// mask bit. If a mask bit is zero, the corresponding value from vector
4186	/// \a a is copied to the result; otherwise the value is loaded from memory.
4187	/// \param s
4188	/// A literal constant scale factor for the indexes in \a i. Must be
4189	/// 1, 2, 4, or 8.
4190	/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4191	#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
4192	((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
4193	(float const *)(m), \
4194	(__v4si)(__m128i)(i), \
4195	(__v4sf)(__m128)(mask), (s)))
4196
4197	/// Conditionally gathers eight 32-bit floating-point values, either from the
4198	/// 256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4199	/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4200	/// of [8 x float] in \a mask determines the source for each element.
4201	///
4202	/// \code{.operation}
4203	/// FOR element := 0 to 7
4204	/// j := element*32
4205	/// k := element*32
4206	/// IF mask[j+31] == 0
4207	/// result[j+31:j] := a[j+31:j]
4208	/// ELSE
4209	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4210	/// FI
4211	/// ENDFOR
4212	/// \endcode
4213	///
4214	/// \headerfile <immintrin.h>
4215	///
4216	/// \code
4217	/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4218	/// __m256 mask, const int s);
4219	/// \endcode
4220	///
4221	/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4222	///
4223	/// \param a
4224	/// A 256-bit vector of [8 x float] used as the source when a mask bit is
4225	/// zero.
4226	/// \param m
4227	/// A pointer to the memory used for loading values.
4228	/// \param i
4229	/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4230	/// \param mask
4231	/// A 256-bit vector of [8 x float] containing the mask. The most
4232	/// significant bit of each element in the mask vector is that element's
4233	/// mask bit. If a mask bit is zero, the corresponding value from vector
4234	/// \a a is copied to the result; otherwise the value is loaded from memory.
4235	/// \param s
4236	/// A literal constant scale factor for the indexes in \a i. Must be
4237	/// 1, 2, 4, or 8.
4238	/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4239	#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
4240	((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
4241	(float const *)(m), \
4242	(__v8si)(__m256i)(i), \
4243	(__v8sf)(__m256)(mask), (s)))
4244
4245	/// Conditionally gathers two 32-bit floating-point values, either from the
4246	/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4247	/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4248	/// of [4 x float] in \a mask determines the source for the lower two
4249	/// elements. The upper two elements of the result are zeroed.
4250	///
4251	/// \code{.operation}
4252	/// FOR element := 0 to 1
4253	/// j := element*32
4254	/// k := element*64
4255	/// IF mask[j+31] == 0
4256	/// result[j+31:j] := a[j+31:j]
4257	/// ELSE
4258	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4259	/// FI
4260	/// ENDFOR
4261	/// result[127:64] := 0
4262	/// \endcode
4263	///
4264	/// \headerfile <immintrin.h>
4265	///
4266	/// \code
4267	/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4268	/// __m128 mask, const int s);
4269	/// \endcode
4270	///
4271	/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4272	///
4273	/// \param a
4274	/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4275	/// zero. Only the first two elements are used.
4276	/// \param m
4277	/// A pointer to the memory used for loading values.
4278	/// \param i
4279	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4280	/// \param mask
4281	/// A 128-bit vector of [4 x float] containing the mask. The most
4282	/// significant bit of each element in the mask vector is that element's
4283	/// mask bit. If a mask bit is zero, the corresponding value from vector
4284	/// \a a is copied to the result; otherwise the value is loaded from memory.
4285	/// Only the first two elements are used.
4286	/// \param s
4287	/// A literal constant scale factor for the indexes in \a i. Must be
4288	/// 1, 2, 4, or 8.
4289	/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4290	#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
4291	((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
4292	(float const *)(m), \
4293	(__v2di)(__m128i)(i), \
4294	(__v4sf)(__m128)(mask), (s)))
4295
4296	/// Conditionally gathers four 32-bit floating-point values, either from the
4297	/// 128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4298	/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4299	/// of [4 x float] in \a mask determines the source for each element.
4300	///
4301	/// \code{.operation}
4302	/// FOR element := 0 to 3
4303	/// j := element*32
4304	/// k := element*64
4305	/// IF mask[j+31] == 0
4306	/// result[j+31:j] := a[j+31:j]
4307	/// ELSE
4308	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4309	/// FI
4310	/// ENDFOR
4311	/// \endcode
4312	///
4313	/// \headerfile <immintrin.h>
4314	///
4315	/// \code
4316	/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4317	/// __m128 mask, const int s);
4318	/// \endcode
4319	///
4320	/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4321	///
4322	/// \param a
4323	/// A 128-bit vector of [4 x float] used as the source when a mask bit is
4324	/// zero.
4325	/// \param m
4326	/// A pointer to the memory used for loading values.
4327	/// \param i
4328	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4329	/// \param mask
4330	/// A 128-bit vector of [4 x float] containing the mask. The most
4331	/// significant bit of each element in the mask vector is that element's
4332	/// mask bit. If a mask bit is zero, the corresponding value from vector
4333	/// \a a is copied to the result; otherwise the value is loaded from memory.
4334	/// \param s
4335	/// A literal constant scale factor for the indexes in \a i. Must be
4336	/// 1, 2, 4, or 8.
4337	/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4338	#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
4339	((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
4340	(float const *)(m), \
4341	(__v4di)(__m256i)(i), \
4342	(__v4sf)(__m128)(mask), (s)))
4343
4344	/// Conditionally gathers four 32-bit integer values, either from the
4345	/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4346	/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4347	/// of [4 x i32] in \a mask determines the source for each element.
4348	///
4349	/// \code{.operation}
4350	/// FOR element := 0 to 3
4351	/// j := element*32
4352	/// k := element*32
4353	/// IF mask[j+31] == 0
4354	/// result[j+31:j] := a[j+31:j]
4355	/// ELSE
4356	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4357	/// FI
4358	/// ENDFOR
4359	/// \endcode
4360	///
4361	/// \headerfile <immintrin.h>
4362	///
4363	/// \code
4364	/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4365	/// __m128i mask, const int s);
4366	/// \endcode
4367	///
4368	/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4369	///
4370	/// \param a
4371	/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4372	/// zero.
4373	/// \param m
4374	/// A pointer to the memory used for loading values.
4375	/// \param i
4376	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4377	/// \param mask
4378	/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4379	/// bit of each element in the mask vector is that element's mask bit. If a
4380	/// mask bit is zero, the corresponding value from vector \a a is copied to
4381	/// the result; otherwise the value is loaded from memory.
4382	/// \param s
4383	/// A literal constant scale factor for the indexes in \a i. Must be
4384	/// 1, 2, 4, or 8.
4385	/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4386	#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
4387	((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
4388	(int const *)(m), \
4389	(__v4si)(__m128i)(i), \
4390	(__v4si)(__m128i)(mask), (s)))
4391
4392	/// Conditionally gathers eight 32-bit integer values, either from the
4393	/// 256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4394	/// indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4395	/// of [8 x i32] in \a mask determines the source for each element.
4396	///
4397	/// \code{.operation}
4398	/// FOR element := 0 to 7
4399	/// j := element*32
4400	/// k := element*32
4401	/// IF mask[j+31] == 0
4402	/// result[j+31:j] := a[j+31:j]
4403	/// ELSE
4404	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4405	/// FI
4406	/// ENDFOR
4407	/// \endcode
4408	///
4409	/// \headerfile <immintrin.h>
4410	///
4411	/// \code
4412	/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4413	/// __m256i mask, const int s);
4414	/// \endcode
4415	///
4416	/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4417	///
4418	/// \param a
4419	/// A 256-bit vector of [8 x i32] used as the source when a mask bit is
4420	/// zero.
4421	/// \param m
4422	/// A pointer to the memory used for loading values.
4423	/// \param i
4424	/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4425	/// \param mask
4426	/// A 256-bit vector of [8 x i32] containing the mask. The most significant
4427	/// bit of each element in the mask vector is that element's mask bit. If a
4428	/// mask bit is zero, the corresponding value from vector \a a is copied to
4429	/// the result; otherwise the value is loaded from memory.
4430	/// \param s
4431	/// A literal constant scale factor for the indexes in \a i. Must be
4432	/// 1, 2, 4, or 8.
4433	/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
4434	#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
4435	((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
4436	(int const *)(m), \
4437	(__v8si)(__m256i)(i), \
4438	(__v8si)(__m256i)(mask), (s)))
4439
4440	/// Conditionally gathers two 32-bit integer values, either from the
4441	/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4442	/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4443	/// of [4 x i32] in \a mask determines the source for the lower two
4444	/// elements. The upper two elements of the result are zeroed.
4445	///
4446	/// \code{.operation}
4447	/// FOR element := 0 to 1
4448	/// j := element*32
4449	/// k := element*64
4450	/// IF mask[j+31] == 0
4451	/// result[j+31:j] := a[j+31:j]
4452	/// ELSE
4453	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4454	/// FI
4455	/// ENDFOR
4456	/// result[127:64] := 0
4457	/// \endcode
4458	///
4459	/// \headerfile <immintrin.h>
4460	///
4461	/// \code
4462	/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4463	/// __m128i mask, const int s);
4464	/// \endcode
4465	///
4466	/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4467	///
4468	/// \param a
4469	/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4470	/// zero. Only the first two elements are used.
4471	/// \param m
4472	/// A pointer to the memory used for loading values.
4473	/// \param i
4474	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4475	/// \param mask
4476	/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4477	/// bit of each element in the mask vector is that element's mask bit. If a
4478	/// mask bit is zero, the corresponding value from vector \a a is copied to
4479	/// the result; otherwise the value is loaded from memory. Only the first
4480	/// two elements are used.
4481	/// \param s
4482	/// A literal constant scale factor for the indexes in \a i. Must be
4483	/// 1, 2, 4, or 8.
4484	/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4485	#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
4486	((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
4487	(int const *)(m), \
4488	(__v2di)(__m128i)(i), \
4489	(__v4si)(__m128i)(mask), (s)))
4490
4491	/// Conditionally gathers four 32-bit integer values, either from the
4492	/// 128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4493	/// indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4494	/// of [4 x i32] in \a mask determines the source for each element.
4495	///
4496	/// \code{.operation}
4497	/// FOR element := 0 to 3
4498	/// j := element*32
4499	/// k := element*64
4500	/// IF mask[j+31] == 0
4501	/// result[j+31:j] := a[j+31:j]
4502	/// ELSE
4503	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4504	/// FI
4505	/// ENDFOR
4506	/// \endcode
4507	///
4508	/// \headerfile <immintrin.h>
4509	///
4510	/// \code
4511	/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4512	/// __m128i mask, const int s);
4513	/// \endcode
4514	///
4515	/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4516	///
4517	/// \param a
4518	/// A 128-bit vector of [4 x i32] used as the source when a mask bit is
4519	/// zero.
4520	/// \param m
4521	/// A pointer to the memory used for loading values.
4522	/// \param i
4523	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4524	/// \param mask
4525	/// A 128-bit vector of [4 x i32] containing the mask. The most significant
4526	/// bit of each element in the mask vector is that element's mask bit. If a
4527	/// mask bit is zero, the corresponding value from vector \a a is copied to
4528	/// the result; otherwise the value is loaded from memory.
4529	/// \param s
4530	/// A literal constant scale factor for the indexes in \a i. Must be
4531	/// 1, 2, 4, or 8.
4532	/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
4533	#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
4534	((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
4535	(int const *)(m), \
4536	(__v4di)(__m256i)(i), \
4537	(__v4si)(__m128i)(mask), (s)))
4538
4539	/// Conditionally gathers two 64-bit integer values, either from the
4540	/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4541	/// indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4542	/// of [2 x i64] in \a mask determines the source for each element.
4543	///
4544	/// \code{.operation}
4545	/// FOR element := 0 to 1
4546	/// j := element*64
4547	/// k := element*32
4548	/// IF mask[j+63] == 0
4549	/// result[j+63:j] := a[j+63:j]
4550	/// ELSE
4551	/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4552	/// FI
4553	/// ENDFOR
4554	/// \endcode
4555	///
4556	/// \headerfile <immintrin.h>
4557	///
4558	/// \code
4559	/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4560	/// __m128i mask, const int s);
4561	/// \endcode
4562	///
4563	/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4564	///
4565	/// \param a
4566	/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4567	/// zero.
4568	/// \param m
4569	/// A pointer to the memory used for loading values.
4570	/// \param i
4571	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4572	/// the first two elements are used.
4573	/// \param mask
4574	/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4575	/// bit of each element in the mask vector is that element's mask bit. If a
4576	/// mask bit is zero, the corresponding value from vector \a a is copied to
4577	/// the result; otherwise the value is loaded from memory.
4578	/// \param s
4579	/// A literal constant scale factor for the indexes in \a i. Must be
4580	/// 1, 2, 4, or 8.
4581	/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4582	#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
4583	((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
4584	(long long const *)(m), \
4585	(__v4si)(__m128i)(i), \
4586	(__v2di)(__m128i)(mask), (s)))
4587
4588	/// Conditionally gathers four 64-bit integer values, either from the
4589	/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4590	/// indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4591	/// of [4 x i64] in \a mask determines the source for each element.
4592	///
4593	/// \code{.operation}
4594	/// FOR element := 0 to 3
4595	/// j := element*64
4596	/// k := element*32
4597	/// IF mask[j+63] == 0
4598	/// result[j+63:j] := a[j+63:j]
4599	/// ELSE
4600	/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4601	/// FI
4602	/// ENDFOR
4603	/// \endcode
4604	///
4605	/// \headerfile <immintrin.h>
4606	///
4607	/// \code
4608	/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4609	/// __m128i i, __m256i mask, const int s);
4610	/// \endcode
4611	///
4612	/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4613	///
4614	/// \param a
4615	/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4616	/// zero.
4617	/// \param m
4618	/// A pointer to the memory used for loading values.
4619	/// \param i
4620	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4621	/// \param mask
4622	/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4623	/// bit of each element in the mask vector is that element's mask bit. If a
4624	/// mask bit is zero, the corresponding value from vector \a a is copied to
4625	/// the result; otherwise the value is loaded from memory.
4626	/// \param s
4627	/// A literal constant scale factor for the indexes in \a i. Must be
4628	/// 1, 2, 4, or 8.
4629	/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4630	#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
4631	((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
4632	(long long const *)(m), \
4633	(__v4si)(__m128i)(i), \
4634	(__v4di)(__m256i)(mask), (s)))
4635
4636	/// Conditionally gathers two 64-bit integer values, either from the
4637	/// 128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4638	/// indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4639	/// of [2 x i64] in \a mask determines the source for each element.
4640	///
4641	/// \code{.operation}
4642	/// FOR element := 0 to 1
4643	/// j := element*64
4644	/// k := element*64
4645	/// IF mask[j+63] == 0
4646	/// result[j+63:j] := a[j+63:j]
4647	/// ELSE
4648	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4649	/// FI
4650	/// ENDFOR
4651	/// \endcode
4652	///
4653	/// \headerfile <immintrin.h>
4654	///
4655	/// \code
4656	/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4657	/// __m128i mask, const int s);
4658	/// \endcode
4659	///
4660	/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4661	///
4662	/// \param a
4663	/// A 128-bit vector of [2 x i64] used as the source when a mask bit is
4664	/// zero.
4665	/// \param m
4666	/// A pointer to the memory used for loading values.
4667	/// \param i
4668	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4669	/// \param mask
4670	/// A 128-bit vector of [2 x i64] containing the mask. The most significant
4671	/// bit of each element in the mask vector is that element's mask bit. If a
4672	/// mask bit is zero, the corresponding value from vector \a a is copied to
4673	/// the result; otherwise the value is loaded from memory.
4674	/// \param s
4675	/// A literal constant scale factor for the indexes in \a i. Must be
4676	/// 1, 2, 4, or 8.
4677	/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
4678	#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
4679	((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
4680	(long long const *)(m), \
4681	(__v2di)(__m128i)(i), \
4682	(__v2di)(__m128i)(mask), (s)))
4683
4684	/// Conditionally gathers four 64-bit integer values, either from the
4685	/// 256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4686	/// indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4687	/// of [4 x i64] in \a mask determines the source for each element.
4688	///
4689	/// \code{.operation}
4690	/// FOR element := 0 to 3
4691	/// j := element*64
4692	/// k := element*64
4693	/// IF mask[j+63] == 0
4694	/// result[j+63:j] := a[j+63:j]
4695	/// ELSE
4696	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4697	/// FI
4698	/// ENDFOR
4699	/// \endcode
4700	///
4701	/// \headerfile <immintrin.h>
4702	///
4703	/// \code
4704	/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4705	/// __m256i i, __m256i mask, const int s);
4706	/// \endcode
4707	///
4708	/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4709	///
4710	/// \param a
4711	/// A 256-bit vector of [4 x i64] used as the source when a mask bit is
4712	/// zero.
4713	/// \param m
4714	/// A pointer to the memory used for loading values.
4715	/// \param i
4716	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4717	/// \param mask
4718	/// A 256-bit vector of [4 x i64] containing the mask. The most significant
4719	/// bit of each element in the mask vector is that element's mask bit. If a
4720	/// mask bit is zero, the corresponding value from vector \a a is copied to
4721	/// the result; otherwise the value is loaded from memory.
4722	/// \param s
4723	/// A literal constant scale factor for the indexes in \a i. Must be
4724	/// 1, 2, 4, or 8.
4725	/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
4726	#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
4727	((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
4728	(long long const *)(m), \
4729	(__v4di)(__m256i)(i), \
4730	(__v4di)(__m256i)(mask), (s)))
4731
4732	/// Gathers two 64-bit floating-point values from memory \a m using scaled
4733	/// indexes from the 128-bit vector of [4 x i32] in \a i.
4734	///
4735	/// \code{.operation}
4736	/// FOR element := 0 to 1
4737	/// j := element*64
4738	/// k := element*32
4739	/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4740	/// ENDFOR
4741	/// \endcode
4742	///
4743	/// \headerfile <immintrin.h>
4744	///
4745	/// \code
4746	/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4747	/// \endcode
4748	///
4749	/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4750	///
4751	/// \param m
4752	/// A pointer to the memory used for loading values.
4753	/// \param i
4754	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4755	/// the first two elements are used.
4756	/// \param s
4757	/// A literal constant scale factor for the indexes in \a i. Must be
4758	/// 1, 2, 4, or 8.
4759	/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4760	#define _mm_i32gather_pd(m, i, s) \
4761	((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4762	(double const *)(m), \
4763	(__v4si)(__m128i)(i), \
4764	(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4765	_mm_setzero_pd()), /* all-ones mask: every element is loaded */ \
4766	(s)))
4767
4768	/// Gathers four 64-bit floating-point values from memory \a m using scaled
4769	/// indexes from the 128-bit vector of [4 x i32] in \a i.
4770	///
4771	/// \code{.operation}
4772	/// FOR element := 0 to 3
4773	/// j := element*64
4774	/// k := element*32
4775	/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4776	/// ENDFOR
4777	/// \endcode
4778	///
4779	/// \headerfile <immintrin.h>
4780	///
4781	/// \code
4782	/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4783	/// \endcode
4784	///
4785	/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4786	///
4787	/// \param m
4788	/// A pointer to the memory used for loading values.
4789	/// \param i
4790	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4791	/// \param s
4792	/// A literal constant scale factor for the indexes in \a i. Must be
4793	/// 1, 2, 4, or 8.
4794	/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4795	#define _mm256_i32gather_pd(m, i, s) \
4796	((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4797	(double const *)(m), \
4798	(__v4si)(__m128i)(i), \
4799	(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4800	_mm256_setzero_pd(), \
4801	_CMP_EQ_OQ), /* all-ones mask: every element is loaded */ \
4802	(s)))
4803
4804	/// Gathers two 64-bit floating-point values from memory \a m using scaled
4805	/// indexes from the 128-bit vector of [2 x i64] in \a i.
4806	///
4807	/// \code{.operation}
4808	/// FOR element := 0 to 1
4809	/// j := element*64
4810	/// k := element*64
4811	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4812	/// ENDFOR
4813	/// \endcode
4814	///
4815	/// \headerfile <immintrin.h>
4816	///
4817	/// \code
4818	/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4819	/// \endcode
4820	///
4821	/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4822	///
4823	/// \param m
4824	/// A pointer to the memory used for loading values.
4825	/// \param i
4826	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4827	/// \param s
4828	/// A literal constant scale factor for the indexes in \a i. Must be
4829	/// 1, 2, 4, or 8.
4830	/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4831	#define _mm_i64gather_pd(m, i, s) \
4832	((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4833	(double const *)(m), \
4834	(__v2di)(__m128i)(i), \
4835	(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4836	_mm_setzero_pd()), /* all-ones mask: every element is loaded */ \
4837	(s)))
4838
4839	/// Gathers four 64-bit floating-point values from memory \a m using scaled
4840	/// indexes from the 256-bit vector of [4 x i64] in \a i.
4841	///
4842	/// \code{.operation}
4843	/// FOR element := 0 to 3
4844	/// j := element*64
4845	/// k := element*64
4846	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4847	/// ENDFOR
4848	/// \endcode
4849	///
4850	/// \headerfile <immintrin.h>
4851	///
4852	/// \code
4853	/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4854	/// \endcode
4855	///
4856	/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4857	///
4858	/// \param m
4859	/// A pointer to the memory used for loading values.
4860	/// \param i
4861	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4862	/// \param s
4863	/// A literal constant scale factor for the indexes in \a i. Must be
4864	/// 1, 2, 4, or 8.
4865	/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4866	#define _mm256_i64gather_pd(m, i, s) \
4867	((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4868	(double const *)(m), \
4869	(__v4di)(__m256i)(i), \
4870	(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4871	_mm256_setzero_pd(), \
4872	_CMP_EQ_OQ), /* all-ones mask: every element is loaded */ \
4873	(s)))
4874
4875	/// Gathers four 32-bit floating-point values from memory \a m using scaled
4876	/// indexes from the 128-bit vector of [4 x i32] in \a i.
4877	///
4878	/// \code{.operation}
4879	/// FOR element := 0 to 3
4880	/// j := element*32
4881	/// k := element*32
4882	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4883	/// ENDFOR
4884	/// \endcode
4885	///
4886	/// \headerfile <immintrin.h>
4887	///
4888	/// \code
4889	/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4890	/// \endcode
4891	///
4892	/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4893	///
4894	/// \param m
4895	/// The base address; each value is loaded from \a m plus a scaled index.
4896	/// \param i
4897	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4898	/// \param s
4899	/// A literal constant scale factor for the indexes in \a i. Must be
4900	/// 1, 2, 4, or 8.
4901	/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4902	#define _mm_i32gather_ps(m, i, s) \
4903	((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4904	(float const *)(m), \
4905	(__v4si)(__m128i)(i), \
4906	(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4907	_mm_setzero_ps()), \
4908	(s)))
4909
4910	/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4911	/// indexes from the 256-bit vector of [8 x i32] in \a i.
4912	///
4913	/// \code{.operation}
4914	/// FOR element := 0 to 7
4915	/// j := element*32
4916	/// k := element*32
4917	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4918	/// ENDFOR
4919	/// \endcode
4920	///
4921	/// \headerfile <immintrin.h>
4922	///
4923	/// \code
4924	/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4925	/// \endcode
4926	///
4927	/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4928	///
4929	/// \param m
4930	/// The base address; each value is loaded from \a m plus a scaled index.
4931	/// \param i
4932	/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4933	/// \param s
4934	/// A literal constant scale factor for the indexes in \a i. Must be
4935	/// 1, 2, 4, or 8.
4936	/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4937	#define _mm256_i32gather_ps(m, i, s) \
4938	((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4939	(float const *)(m), \
4940	(__v8si)(__m256i)(i), \
4941	(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4942	_mm256_setzero_ps(), \
4943	_CMP_EQ_OQ), \
4944	(s)))
4945
4946	/// Gathers two 32-bit floating-point values from memory \a m using scaled
4947	/// indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4948	/// elements of the result are zeroed.
4949	///
4950	/// \code{.operation}
4951	/// FOR element := 0 to 1
4952	/// j := element*32
4953	/// k := element*64
4954	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4955	/// ENDFOR
4956	/// result[127:64] := 0
4957	/// \endcode
4958	///
4959	/// \headerfile <immintrin.h>
4960	///
4961	/// \code
4962	/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4963	/// \endcode
4964	///
4965	/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4966	///
4967	/// \param m
4968	/// The base address; each value is loaded from \a m plus a scaled index.
4969	/// \param i
4970	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4971	/// \param s
4972	/// A literal constant scale factor for the indexes in \a i. Must be
4973	/// 1, 2, 4, or 8.
4974	/// \returns A 128-bit vector of [4 x float]: two gathered values, then zeros.
4975	#define _mm_i64gather_ps(m, i, s) \
4976	((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4977	(float const *)(m), \
4978	(__v2di)(__m128i)(i), \
4979	(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4980	_mm_setzero_ps()), \
4981	(s)))
4982
4983	/// Gathers four 32-bit floating-point values from memory \a m using scaled
4984	/// indexes from the 256-bit vector of [4 x i64] in \a i.
4985	///
4986	/// \code{.operation}
4987	/// FOR element := 0 to 3
4988	/// j := element*32
4989	/// k := element*64
4990	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4991	/// ENDFOR
4992	/// \endcode
4993	///
4994	/// \headerfile <immintrin.h>
4995	///
4996	/// \code
4997	/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4998	/// \endcode
4999	///
5000	/// This intrinsic corresponds to the \c VGATHERQPS instruction.
5001	///
5002	/// \param m
5003	/// The base address; each value is loaded from \a m plus a scaled index.
5004	/// \param i
5005	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5006	/// \param s
5007	/// A literal constant scale factor for the indexes in \a i. Must be
5008	/// 1, 2, 4, or 8.
5009	/// \returns A 128-bit vector of [4 x float] containing the gathered values.
5010	#define _mm256_i64gather_ps(m, i, s) \
5011	((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5012	(float const *)(m), \
5013	(__v4di)(__m256i)(i), \
5014	(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5015	_mm_setzero_ps()), \
5016	(s)))
5017
5018	/// Gathers four 32-bit integer values from memory \a m using scaled
5019	/// indexes from the 128-bit vector of [4 x i32] in \a i.
5020	///
5021	/// \code{.operation}
5022	/// FOR element := 0 to 3
5023	/// j := element*32
5024	/// k := element*32
5025	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5026	/// ENDFOR
5027	/// \endcode
5028	///
5029	/// \headerfile <immintrin.h>
5030	///
5031	/// \code
5032	/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5033	/// \endcode
5034	///
5035	/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5036	///
5037	/// \param m
5038	/// The base address; each value is loaded from \a m plus a scaled index.
5039	/// \param i
5040	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5041	/// \param s
5042	/// A literal constant scale factor for the indexes in \a i. Must be
5043	/// 1, 2, 4, or 8.
5044	/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5045	#define _mm_i32gather_epi32(m, i, s) \
5046	((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5047	(int const *)(m), (__v4si)(__m128i)(i), \
5048	(__v4si)_mm_set1_epi32(-1), (s)))
5049
5050	/// Gathers eight 32-bit integer values from memory \a m using scaled
5051	/// indexes from the 256-bit vector of [8 x i32] in \a i.
5052	///
5053	/// \code{.operation}
5054	/// FOR element := 0 to 7
5055	/// j := element*32
5056	/// k := element*32
5057	/// result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5058	/// ENDFOR
5059	/// \endcode
5060	///
5061	/// \headerfile <immintrin.h>
5062	///
5063	/// \code
5064	/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5065	/// \endcode
5066	///
5067	/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5068	///
5069	/// \param m
5070	/// The base address; each value is loaded from \a m plus a scaled index.
5071	/// \param i
5072	/// A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5073	/// \param s
5074	/// A literal constant scale factor for the indexes in \a i. Must be
5075	/// 1, 2, 4, or 8.
5076	/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5077	#define _mm256_i32gather_epi32(m, i, s) \
5078	((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5079	(int const *)(m), (__v8si)(__m256i)(i), \
5080	(__v8si)_mm256_set1_epi32(-1), (s)))
5081
5082	/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5083	/// from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5084	/// of the result are zeroed.
5085	///
5086	/// \code{.operation}
5087	/// FOR element := 0 to 1
5088	/// j := element*32
5089	/// k := element*64
5090	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5091	/// ENDFOR
5092	/// result[127:64] := 0
5093	/// \endcode
5094	///
5095	/// \headerfile <immintrin.h>
5096	///
5097	/// \code
5098	/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5099	/// \endcode
5100	///
5101	/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5102	///
5103	/// \param m
5104	/// The base address; each value is loaded from \a m plus a scaled index.
5105	/// \param i
5106	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5107	/// \param s
5108	/// A literal constant scale factor for the indexes in \a i. Must be
5109	/// 1, 2, 4, or 8.
5110	/// \returns A 128-bit vector of [4 x i32]: two gathered values, then zeros.
5111	#define _mm_i64gather_epi32(m, i, s) \
5112	((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5113	(int const *)(m), (__v2di)(__m128i)(i), \
5114	(__v4si)_mm_set1_epi32(-1), (s)))
5115
5116	/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5117	/// from the 256-bit vector of [4 x i64] in \a i.
5118	///
5119	/// \code{.operation}
5120	/// FOR element := 0 to 3
5121	/// j := element*32
5122	/// k := element*64
5123	/// result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5124	/// ENDFOR
5125	/// \endcode
5126	///
5127	/// \headerfile <immintrin.h>
5128	///
5129	/// \code
5130	/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5131	/// \endcode
5132	///
5133	/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5134	///
5135	/// \param m
5136	/// The base address; each value is loaded from \a m plus a scaled index.
5137	/// \param i
5138	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5139	/// \param s
5140	/// A literal constant scale factor for the indexes in \a i. Must be
5141	/// 1, 2, 4, or 8.
5142	/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5143	#define _mm256_i64gather_epi32(m, i, s) \
5144	((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5145	(int const *)(m), (__v4di)(__m256i)(i), \
5146	(__v4si)_mm_set1_epi32(-1), (s)))
5147
5148	/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5149	/// from the first two elements of the 128-bit vector of [4 x i32] in \a i.
5150	///
5151	/// \code{.operation}
5152	/// FOR element := 0 to 1
5153	/// j := element*64
5154	/// k := element*32
5155	/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5156	/// ENDFOR
5157	/// \endcode
5158	///
5159	/// \headerfile <immintrin.h>
5160	///
5161	/// \code
5162	/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5163	/// \endcode
5164	///
5165	/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5166	///
5167	/// \param m
5168	/// The base address; each value is loaded from \a m plus a scaled index.
5169	/// \param i
5170	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5171	/// the first two elements are used.
5172	/// \param s
5173	/// A literal constant scale factor for the indexes in \a i. Must be
5174	/// 1, 2, 4, or 8.
5175	/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5176	#define _mm_i32gather_epi64(m, i, s) \
5177	((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5178	(long long const *)(m), \
5179	(__v4si)(__m128i)(i), \
5180	(__v2di)_mm_set1_epi64x(-1), (s)))
5181
5182	/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5183	/// from the 128-bit vector of [4 x i32] in \a i.
5184	///
5185	/// \code{.operation}
5186	/// FOR element := 0 to 3
5187	/// j := element*64
5188	/// k := element*32
5189	/// result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5190	/// ENDFOR
5191	/// \endcode
5192	///
5193	/// \headerfile <immintrin.h>
5194	///
5195	/// \code
5196	/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5197	/// \endcode
5198	///
5199	/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5200	///
5201	/// \param m
5202	/// The base address; each value is loaded from \a m plus a scaled index.
5203	/// \param i
5204	/// A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5205	/// \param s
5206	/// A literal constant scale factor for the indexes in \a i. Must be
5207	/// 1, 2, 4, or 8.
5208	/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5209	#define _mm256_i32gather_epi64(m, i, s) \
5210	((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5211	(long long const *)(m), \
5212	(__v4si)(__m128i)(i), \
5213	(__v4di)_mm256_set1_epi64x(-1), (s)))
5214
5215	/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5216	/// from the 128-bit vector of [2 x i64] in \a i.
5217	///
5218	/// \code{.operation}
5219	/// FOR element := 0 to 1
5220	/// j := element*64
5221	/// k := element*64
5222	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5223	/// ENDFOR
5224	/// \endcode
5225	///
5226	/// \headerfile <immintrin.h>
5227	///
5228	/// \code
5229	/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5230	/// \endcode
5231	///
5232	/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5233	///
5234	/// \param m
5235	/// The base address; each value is loaded from \a m plus a scaled index.
5236	/// \param i
5237	/// A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5238	/// \param s
5239	/// A literal constant scale factor for the indexes in \a i. Must be
5240	/// 1, 2, 4, or 8.
5241	/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5242	#define _mm_i64gather_epi64(m, i, s) \
5243	((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5244	(long long const *)(m), \
5245	(__v2di)(__m128i)(i), \
5246	(__v2di)_mm_set1_epi64x(-1), (s)))
5247
5248	/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5249	/// from the 256-bit vector of [4 x i64] in \a i.
5250	///
5251	/// \code{.operation}
5252	/// FOR element := 0 to 3
5253	/// j := element*64
5254	/// k := element*64
5255	/// result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5256	/// ENDFOR
5257	/// \endcode
5258	///
5259	/// \headerfile <immintrin.h>
5260	///
5261	/// \code
5262	/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5263	/// \endcode
5264	///
5265	/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5266	///
5267	/// \param m
5268	/// The base address; each value is loaded from \a m plus a scaled index.
5269	/// \param i
5270	/// A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5271	/// \param s
5272	/// A literal constant scale factor for the indexes in \a i. Must be
5273	/// 1, 2, 4, or 8.
5274	/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5275	#define _mm256_i64gather_epi64(m, i, s) \
5276	((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5277	(long long const *)(m), \
5278	(__v4di)(__m256i)(i), \
5279	(__v4di)_mm256_set1_epi64x(-1), (s)))
5280
5281	#undef __DEFAULT_FN_ATTRS256
5282	#undef __DEFAULT_FN_ATTRS128
5283
5284	#endif /* __AVX2INTRIN_H */
5285

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avx2intrin.h