Warning: This file is not a C or C++ file. It does not have highlighting.

/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
12#endif
13
14#ifndef __AVXINTRIN_H
15#define __AVXINTRIN_H
16
/* Internal 256-bit (32-byte) lane views used by the implementations below.
 * The suffix encodes lane count and element kind, e.g. __v4df is 4 doubles
 * and __v8si is 8 ints. */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

/* Unsigned types */
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v32qs __attribute__((__vector_size__(32)));
33
/* Public 256-bit vector types, aligned to 32 bytes. */
typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));

/* Same layouts with the alignment requirement lowered to 1 byte (the _u
 * suffix), so they can describe data at arbitrary addresses. */
typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
41
42#ifdef __SSE2__
43/* Both _Float16 and __bf16 require SSE2 being enabled. */
44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
47
48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
50#endif
51
/* Define the default attributes for the functions in this file.
 *
 * Both variants force inlining, suppress debug info, and compile the function
 * for the "avx,no-evex512" target; they differ only in the minimum vector
 * width they declare (256 bits vs. 128 bits). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(128)))
59
60/* Arithmetic */
61/// Adds two 256-bit vectors of [4 x double].
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
66///
67/// \param __a
68/// A 256-bit vector of [4 x double] containing one of the source operands.
69/// \param __b
70/// A 256-bit vector of [4 x double] containing one of the source operands.
71/// \returns A 256-bit vector of [4 x double] containing the sums of both
72/// operands.
73static __inline __m256d __DEFAULT_FN_ATTRS
74_mm256_add_pd(__m256d __a, __m256d __b)
75{
76 return (__m256d)((__v4df)__a+(__v4df)__b);
77}
78
79/// Adds two 256-bit vectors of [8 x float].
80///
81/// \headerfile <x86intrin.h>
82///
83/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
84///
85/// \param __a
86/// A 256-bit vector of [8 x float] containing one of the source operands.
87/// \param __b
88/// A 256-bit vector of [8 x float] containing one of the source operands.
89/// \returns A 256-bit vector of [8 x float] containing the sums of both
90/// operands.
91static __inline __m256 __DEFAULT_FN_ATTRS
92_mm256_add_ps(__m256 __a, __m256 __b)
93{
94 return (__m256)((__v8sf)__a+(__v8sf)__b);
95}
96
97/// Subtracts two 256-bit vectors of [4 x double].
98///
99/// \headerfile <x86intrin.h>
100///
101/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
102///
103/// \param __a
104/// A 256-bit vector of [4 x double] containing the minuend.
105/// \param __b
106/// A 256-bit vector of [4 x double] containing the subtrahend.
107/// \returns A 256-bit vector of [4 x double] containing the differences between
108/// both operands.
109static __inline __m256d __DEFAULT_FN_ATTRS
110_mm256_sub_pd(__m256d __a, __m256d __b)
111{
112 return (__m256d)((__v4df)__a-(__v4df)__b);
113}
114
115/// Subtracts two 256-bit vectors of [8 x float].
116///
117/// \headerfile <x86intrin.h>
118///
119/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
120///
121/// \param __a
122/// A 256-bit vector of [8 x float] containing the minuend.
123/// \param __b
124/// A 256-bit vector of [8 x float] containing the subtrahend.
125/// \returns A 256-bit vector of [8 x float] containing the differences between
126/// both operands.
127static __inline __m256 __DEFAULT_FN_ATTRS
128_mm256_sub_ps(__m256 __a, __m256 __b)
129{
130 return (__m256)((__v8sf)__a-(__v8sf)__b);
131}
132
133/// Adds the even-indexed values and subtracts the odd-indexed values of
134/// two 256-bit vectors of [4 x double].
135///
136/// \headerfile <x86intrin.h>
137///
138/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
139///
140/// \param __a
141/// A 256-bit vector of [4 x double] containing the left source operand.
142/// \param __b
143/// A 256-bit vector of [4 x double] containing the right source operand.
144/// \returns A 256-bit vector of [4 x double] containing the alternating sums
145/// and differences between both operands.
146static __inline __m256d __DEFAULT_FN_ATTRS
147_mm256_addsub_pd(__m256d __a, __m256d __b)
148{
149 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
150}
151
152/// Adds the even-indexed values and subtracts the odd-indexed values of
153/// two 256-bit vectors of [8 x float].
154///
155/// \headerfile <x86intrin.h>
156///
157/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
158///
159/// \param __a
160/// A 256-bit vector of [8 x float] containing the left source operand.
161/// \param __b
162/// A 256-bit vector of [8 x float] containing the right source operand.
163/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
164/// differences between both operands.
165static __inline __m256 __DEFAULT_FN_ATTRS
166_mm256_addsub_ps(__m256 __a, __m256 __b)
167{
168 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
169}
170
171/// Divides two 256-bit vectors of [4 x double].
172///
173/// \headerfile <x86intrin.h>
174///
175/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
176///
177/// \param __a
178/// A 256-bit vector of [4 x double] containing the dividend.
179/// \param __b
180/// A 256-bit vector of [4 x double] containing the divisor.
181/// \returns A 256-bit vector of [4 x double] containing the quotients of both
182/// operands.
183static __inline __m256d __DEFAULT_FN_ATTRS
184_mm256_div_pd(__m256d __a, __m256d __b)
185{
186 return (__m256d)((__v4df)__a/(__v4df)__b);
187}
188
189/// Divides two 256-bit vectors of [8 x float].
190///
191/// \headerfile <x86intrin.h>
192///
193/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
194///
195/// \param __a
196/// A 256-bit vector of [8 x float] containing the dividend.
197/// \param __b
198/// A 256-bit vector of [8 x float] containing the divisor.
199/// \returns A 256-bit vector of [8 x float] containing the quotients of both
200/// operands.
201static __inline __m256 __DEFAULT_FN_ATTRS
202_mm256_div_ps(__m256 __a, __m256 __b)
203{
204 return (__m256)((__v8sf)__a/(__v8sf)__b);
205}
206
207/// Compares two 256-bit vectors of [4 x double] and returns the greater
208/// of each pair of values.
209///
210/// \headerfile <x86intrin.h>
211///
212/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
213///
214/// \param __a
215/// A 256-bit vector of [4 x double] containing one of the operands.
216/// \param __b
217/// A 256-bit vector of [4 x double] containing one of the operands.
218/// \returns A 256-bit vector of [4 x double] containing the maximum values
219/// between both operands.
220static __inline __m256d __DEFAULT_FN_ATTRS
221_mm256_max_pd(__m256d __a, __m256d __b)
222{
223 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
224}
225
226/// Compares two 256-bit vectors of [8 x float] and returns the greater
227/// of each pair of values.
228///
229/// \headerfile <x86intrin.h>
230///
231/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
232///
233/// \param __a
234/// A 256-bit vector of [8 x float] containing one of the operands.
235/// \param __b
236/// A 256-bit vector of [8 x float] containing one of the operands.
237/// \returns A 256-bit vector of [8 x float] containing the maximum values
238/// between both operands.
239static __inline __m256 __DEFAULT_FN_ATTRS
240_mm256_max_ps(__m256 __a, __m256 __b)
241{
242 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
243}
244
245/// Compares two 256-bit vectors of [4 x double] and returns the lesser
246/// of each pair of values.
247///
248/// \headerfile <x86intrin.h>
249///
250/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
251///
252/// \param __a
253/// A 256-bit vector of [4 x double] containing one of the operands.
254/// \param __b
255/// A 256-bit vector of [4 x double] containing one of the operands.
256/// \returns A 256-bit vector of [4 x double] containing the minimum values
257/// between both operands.
258static __inline __m256d __DEFAULT_FN_ATTRS
259_mm256_min_pd(__m256d __a, __m256d __b)
260{
261 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
262}
263
264/// Compares two 256-bit vectors of [8 x float] and returns the lesser
265/// of each pair of values.
266///
267/// \headerfile <x86intrin.h>
268///
269/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
270///
271/// \param __a
272/// A 256-bit vector of [8 x float] containing one of the operands.
273/// \param __b
274/// A 256-bit vector of [8 x float] containing one of the operands.
275/// \returns A 256-bit vector of [8 x float] containing the minimum values
276/// between both operands.
277static __inline __m256 __DEFAULT_FN_ATTRS
278_mm256_min_ps(__m256 __a, __m256 __b)
279{
280 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
281}
282
283/// Multiplies two 256-bit vectors of [4 x double].
284///
285/// \headerfile <x86intrin.h>
286///
287/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
288///
289/// \param __a
290/// A 256-bit vector of [4 x double] containing one of the operands.
291/// \param __b
292/// A 256-bit vector of [4 x double] containing one of the operands.
293/// \returns A 256-bit vector of [4 x double] containing the products of both
294/// operands.
295static __inline __m256d __DEFAULT_FN_ATTRS
296_mm256_mul_pd(__m256d __a, __m256d __b)
297{
298 return (__m256d)((__v4df)__a * (__v4df)__b);
299}
300
301/// Multiplies two 256-bit vectors of [8 x float].
302///
303/// \headerfile <x86intrin.h>
304///
305/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
306///
307/// \param __a
308/// A 256-bit vector of [8 x float] containing one of the operands.
309/// \param __b
310/// A 256-bit vector of [8 x float] containing one of the operands.
311/// \returns A 256-bit vector of [8 x float] containing the products of both
312/// operands.
313static __inline __m256 __DEFAULT_FN_ATTRS
314_mm256_mul_ps(__m256 __a, __m256 __b)
315{
316 return (__m256)((__v8sf)__a * (__v8sf)__b);
317}
318
319/// Calculates the square roots of the values in a 256-bit vector of
320/// [4 x double].
321///
322/// \headerfile <x86intrin.h>
323///
324/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
325///
326/// \param __a
327/// A 256-bit vector of [4 x double].
328/// \returns A 256-bit vector of [4 x double] containing the square roots of the
329/// values in the operand.
330static __inline __m256d __DEFAULT_FN_ATTRS
331_mm256_sqrt_pd(__m256d __a)
332{
333 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
334}
335
336/// Calculates the square roots of the values in a 256-bit vector of
337/// [8 x float].
338///
339/// \headerfile <x86intrin.h>
340///
341/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
342///
343/// \param __a
344/// A 256-bit vector of [8 x float].
345/// \returns A 256-bit vector of [8 x float] containing the square roots of the
346/// values in the operand.
347static __inline __m256 __DEFAULT_FN_ATTRS
348_mm256_sqrt_ps(__m256 __a)
349{
350 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
351}
352
353/// Calculates the reciprocal square roots of the values in a 256-bit
354/// vector of [8 x float].
355///
356/// \headerfile <x86intrin.h>
357///
358/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
359///
360/// \param __a
361/// A 256-bit vector of [8 x float].
362/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
363/// roots of the values in the operand.
364static __inline __m256 __DEFAULT_FN_ATTRS
365_mm256_rsqrt_ps(__m256 __a)
366{
367 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
368}
369
370/// Calculates the reciprocals of the values in a 256-bit vector of
371/// [8 x float].
372///
373/// \headerfile <x86intrin.h>
374///
375/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
376///
377/// \param __a
378/// A 256-bit vector of [8 x float].
379/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
380/// values in the operand.
381static __inline __m256 __DEFAULT_FN_ATTRS
382_mm256_rcp_ps(__m256 __a)
383{
384 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
385}
386
/// Rounds the values in a 256-bit vector of [4 x double] as specified
///    by the byte operand. The source values are rounded to integer values and
///    returned as 64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_round_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
/* Macro rather than function: M is forwarded unchanged to the builtin.
 * NOTE(review): presumably M must be an integer constant expression --
 * confirm against the builtin's definition. */
#define _mm256_round_pd(V, M) \
  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
418
/// Rounds the values stored in a 256-bit vector of [8 x float] as
///    specified by the byte operand. The source values are rounded to integer
///    values and returned as floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_round_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used. \n
///      1: The PE field is not updated. \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M. \n
///      1: Use the current MXCSR setting. \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest. \n
///      01: Downward (toward negative infinity). \n
///      10: Upward (toward positive infinity). \n
///      11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
/* Macro rather than function: M is forwarded unchanged to the builtin.
 * NOTE(review): presumably M must be an integer constant expression --
 * confirm against the builtin's definition. */
#define _mm256_round_ps(V, M) \
  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
450
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
///    source values are rounded up to integer values and returned as 64-bit
///    double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
/* Shorthand for _mm256_round_pd with the _MM_FROUND_CEIL mode (that constant
 * is defined outside this section). */
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
467
/// Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
/* Shorthand for _mm256_round_pd with the _MM_FROUND_FLOOR mode (that constant
 * is defined outside this section). */
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
485
/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
/* Shorthand for _mm256_round_ps with the _MM_FROUND_CEIL mode (that constant
 * is defined outside this section). */
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
502
/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_floor_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
/* Shorthand for _mm256_round_ps with the _MM_FROUND_FLOOR mode (that constant
 * is defined outside this section). */
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
519
520/* Logical */
521/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
522///
523/// \headerfile <x86intrin.h>
524///
525/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
526///
527/// \param __a
528/// A 256-bit vector of [4 x double] containing one of the source operands.
529/// \param __b
530/// A 256-bit vector of [4 x double] containing one of the source operands.
531/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
532/// values between both operands.
533static __inline __m256d __DEFAULT_FN_ATTRS
534_mm256_and_pd(__m256d __a, __m256d __b)
535{
536 return (__m256d)((__v4du)__a & (__v4du)__b);
537}
538
539/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
540///
541/// \headerfile <x86intrin.h>
542///
543/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
544///
545/// \param __a
546/// A 256-bit vector of [8 x float] containing one of the source operands.
547/// \param __b
548/// A 256-bit vector of [8 x float] containing one of the source operands.
549/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
550/// values between both operands.
551static __inline __m256 __DEFAULT_FN_ATTRS
552_mm256_and_ps(__m256 __a, __m256 __b)
553{
554 return (__m256)((__v8su)__a & (__v8su)__b);
555}
556
557/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
558/// the one's complement of the values contained in the first source operand.
559///
560/// \headerfile <x86intrin.h>
561///
562/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
563///
564/// \param __a
565/// A 256-bit vector of [4 x double] containing the left source operand. The
566/// one's complement of this value is used in the bitwise AND.
567/// \param __b
568/// A 256-bit vector of [4 x double] containing the right source operand.
569/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
570/// values of the second operand and the one's complement of the first
571/// operand.
572static __inline __m256d __DEFAULT_FN_ATTRS
573_mm256_andnot_pd(__m256d __a, __m256d __b)
574{
575 return (__m256d)(~(__v4du)__a & (__v4du)__b);
576}
577
578/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
579/// the one's complement of the values contained in the first source operand.
580///
581/// \headerfile <x86intrin.h>
582///
583/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
584///
585/// \param __a
586/// A 256-bit vector of [8 x float] containing the left source operand. The
587/// one's complement of this value is used in the bitwise AND.
588/// \param __b
589/// A 256-bit vector of [8 x float] containing the right source operand.
590/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
591/// values of the second operand and the one's complement of the first
592/// operand.
593static __inline __m256 __DEFAULT_FN_ATTRS
594_mm256_andnot_ps(__m256 __a, __m256 __b)
595{
596 return (__m256)(~(__v8su)__a & (__v8su)__b);
597}
598
599/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
600///
601/// \headerfile <x86intrin.h>
602///
603/// This intrinsic corresponds to the <c> VORPD </c> instruction.
604///
605/// \param __a
606/// A 256-bit vector of [4 x double] containing one of the source operands.
607/// \param __b
608/// A 256-bit vector of [4 x double] containing one of the source operands.
609/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
610/// values between both operands.
611static __inline __m256d __DEFAULT_FN_ATTRS
612_mm256_or_pd(__m256d __a, __m256d __b)
613{
614 return (__m256d)((__v4du)__a | (__v4du)__b);
615}
616
617/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
618///
619/// \headerfile <x86intrin.h>
620///
621/// This intrinsic corresponds to the <c> VORPS </c> instruction.
622///
623/// \param __a
624/// A 256-bit vector of [8 x float] containing one of the source operands.
625/// \param __b
626/// A 256-bit vector of [8 x float] containing one of the source operands.
627/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
628/// values between both operands.
629static __inline __m256 __DEFAULT_FN_ATTRS
630_mm256_or_ps(__m256 __a, __m256 __b)
631{
632 return (__m256)((__v8su)__a | (__v8su)__b);
633}
634
635/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
636///
637/// \headerfile <x86intrin.h>
638///
639/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
640///
641/// \param __a
642/// A 256-bit vector of [4 x double] containing one of the source operands.
643/// \param __b
644/// A 256-bit vector of [4 x double] containing one of the source operands.
645/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
646/// values between both operands.
647static __inline __m256d __DEFAULT_FN_ATTRS
648_mm256_xor_pd(__m256d __a, __m256d __b)
649{
650 return (__m256d)((__v4du)__a ^ (__v4du)__b);
651}
652
653/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
654///
655/// \headerfile <x86intrin.h>
656///
657/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
658///
659/// \param __a
660/// A 256-bit vector of [8 x float] containing one of the source operands.
661/// \param __b
662/// A 256-bit vector of [8 x float] containing one of the source operands.
663/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
664/// values between both operands.
665static __inline __m256 __DEFAULT_FN_ATTRS
666_mm256_xor_ps(__m256 __a, __m256 __b)
667{
668 return (__m256)((__v8su)__a ^ (__v8su)__b);
669}
670
671/* Horizontal arithmetic */
672/// Horizontally adds the adjacent pairs of values contained in two
673/// 256-bit vectors of [4 x double].
674///
675/// \headerfile <x86intrin.h>
676///
677/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
678///
679/// \param __a
680/// A 256-bit vector of [4 x double] containing one of the source operands.
681/// The horizontal sums of the values are returned in the even-indexed
682/// elements of a vector of [4 x double].
683/// \param __b
684/// A 256-bit vector of [4 x double] containing one of the source operands.
685/// The horizontal sums of the values are returned in the odd-indexed
686/// elements of a vector of [4 x double].
687/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
688/// both operands.
689static __inline __m256d __DEFAULT_FN_ATTRS
690_mm256_hadd_pd(__m256d __a, __m256d __b)
691{
692 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
693}
694
695/// Horizontally adds the adjacent pairs of values contained in two
696/// 256-bit vectors of [8 x float].
697///
698/// \headerfile <x86intrin.h>
699///
700/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
701///
702/// \param __a
703/// A 256-bit vector of [8 x float] containing one of the source operands.
704/// The horizontal sums of the values are returned in the elements with
705/// index 0, 1, 4, 5 of a vector of [8 x float].
706/// \param __b
707/// A 256-bit vector of [8 x float] containing one of the source operands.
708/// The horizontal sums of the values are returned in the elements with
709/// index 2, 3, 6, 7 of a vector of [8 x float].
710/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
711/// both operands.
712static __inline __m256 __DEFAULT_FN_ATTRS
713_mm256_hadd_ps(__m256 __a, __m256 __b)
714{
715 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
716}
717
718/// Horizontally subtracts the adjacent pairs of values contained in two
719/// 256-bit vectors of [4 x double].
720///
721/// \headerfile <x86intrin.h>
722///
723/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
724///
725/// \param __a
726/// A 256-bit vector of [4 x double] containing one of the source operands.
727/// The horizontal differences between the values are returned in the
728/// even-indexed elements of a vector of [4 x double].
729/// \param __b
730/// A 256-bit vector of [4 x double] containing one of the source operands.
731/// The horizontal differences between the values are returned in the
732/// odd-indexed elements of a vector of [4 x double].
733/// \returns A 256-bit vector of [4 x double] containing the horizontal
734/// differences of both operands.
735static __inline __m256d __DEFAULT_FN_ATTRS
736_mm256_hsub_pd(__m256d __a, __m256d __b)
737{
738 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
739}
740
741/// Horizontally subtracts the adjacent pairs of values contained in two
742/// 256-bit vectors of [8 x float].
743///
744/// \headerfile <x86intrin.h>
745///
746/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
747///
748/// \param __a
749/// A 256-bit vector of [8 x float] containing one of the source operands.
750/// The horizontal differences between the values are returned in the
751/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
752/// \param __b
753/// A 256-bit vector of [8 x float] containing one of the source operands.
754/// The horizontal differences between the values are returned in the
755/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
756/// \returns A 256-bit vector of [8 x float] containing the horizontal
757/// differences of both operands.
758static __inline __m256 __DEFAULT_FN_ATTRS
759_mm256_hsub_ps(__m256 __a, __m256 __b)
760{
761 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
762}
763
764/* Vector permutations */
765/// Copies the values in a 128-bit vector of [2 x double] as specified
766/// by the 128-bit integer vector operand.
767///
768/// \headerfile <x86intrin.h>
769///
770/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
771///
772/// \param __a
773/// A 128-bit vector of [2 x double].
774/// \param __c
775/// A 128-bit integer vector operand specifying how the values are to be
776/// copied. \n
777/// Bit [1]: \n
778/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
779/// vector. \n
780/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
781/// returned vector. \n
782/// Bit [65]: \n
783/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
784/// returned vector. \n
785/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
786/// returned vector.
787/// \returns A 128-bit vector of [2 x double] containing the copied values.
788static __inline __m128d __DEFAULT_FN_ATTRS128
789_mm_permutevar_pd(__m128d __a, __m128i __c)
790{
791 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
792}
793
794/// Copies the values in a 256-bit vector of [4 x double] as specified
795/// by the 256-bit integer vector operand.
796///
797/// \headerfile <x86intrin.h>
798///
799/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
800///
801/// \param __a
802/// A 256-bit vector of [4 x double].
803/// \param __c
804/// A 256-bit integer vector operand specifying how the values are to be
805/// copied. \n
806/// Bit [1]: \n
807/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
808/// vector. \n
809/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
810/// returned vector. \n
811/// Bit [65]: \n
812/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
813/// returned vector. \n
814/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
815/// returned vector. \n
816/// Bit [129]: \n
817/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
818/// returned vector. \n
819/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
820/// returned vector. \n
821/// Bit [193]: \n
822/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
823/// returned vector. \n
824/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
825/// returned vector.
826/// \returns A 256-bit vector of [4 x double] containing the copied values.
827static __inline __m256d __DEFAULT_FN_ATTRS
828_mm256_permutevar_pd(__m256d __a, __m256i __c)
829{
830 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
831}
832
833/// Copies the values stored in a 128-bit vector of [4 x float] as
834/// specified by the 128-bit integer vector operand.
835/// \headerfile <x86intrin.h>
836///
837/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
838///
839/// \param __a
840/// A 128-bit vector of [4 x float].
841/// \param __c
842/// A 128-bit integer vector operand specifying how the values are to be
843/// copied. \n
844/// Bits [1:0]: \n
845/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
846/// returned vector. \n
847/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
848/// returned vector. \n
849/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
850/// returned vector. \n
851/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
852/// returned vector. \n
853/// Bits [33:32]: \n
854/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
855/// returned vector. \n
856/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
857/// returned vector. \n
858/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
859/// returned vector. \n
860/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
861/// returned vector. \n
862/// Bits [65:64]: \n
863/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
864/// returned vector. \n
865/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
866/// returned vector. \n
867/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
868/// returned vector. \n
869/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
870/// returned vector. \n
871/// Bits [97:96]: \n
872/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
873/// returned vector. \n
874/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
875/// returned vector. \n
876/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
877/// returned vector. \n
878/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
879/// returned vector.
880/// \returns A 128-bit vector of [4 x float] containing the copied values.
881static __inline __m128 __DEFAULT_FN_ATTRS128
882_mm_permutevar_ps(__m128 __a, __m128i __c)
883{
884 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
885}
886
887/// Copies the values stored in a 256-bit vector of [8 x float] as
888/// specified by the 256-bit integer vector operand.
889///
890/// \headerfile <x86intrin.h>
891///
892/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
893///
894/// \param __a
895/// A 256-bit vector of [8 x float].
896/// \param __c
897/// A 256-bit integer vector operand specifying how the values are to be
898/// copied. \n
899/// Bits [1:0]: \n
900/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
901/// returned vector. \n
902/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
903/// returned vector. \n
904/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
905/// returned vector. \n
906/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
907/// returned vector. \n
908/// Bits [33:32]: \n
909/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
910/// returned vector. \n
911/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
912/// returned vector. \n
913/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
914/// returned vector. \n
915/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
916/// returned vector. \n
917/// Bits [65:64]: \n
918/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
919/// returned vector. \n
920/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
921/// returned vector. \n
922/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
923/// returned vector. \n
924/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
925/// returned vector. \n
926/// Bits [97:96]: \n
927/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
928/// returned vector. \n
929/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
930/// returned vector. \n
931/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
932/// returned vector. \n
933/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
934/// returned vector. \n
935/// Bits [129:128]: \n
936/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
937/// returned vector. \n
938/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
939/// returned vector. \n
940/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
941/// returned vector. \n
942/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
943/// returned vector. \n
944/// Bits [161:160]: \n
945/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
946/// returned vector. \n
947/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
948/// returned vector. \n
949/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
950/// returned vector. \n
951/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
952/// returned vector. \n
953/// Bits [193:192]: \n
954/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
955/// returned vector. \n
956/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
957/// returned vector. \n
958/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
959/// returned vector. \n
960/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
961/// returned vector. \n
962/// Bits [225:224]: \n
963/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
964/// returned vector. \n
965/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
966/// returned vector. \n
967/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
968/// returned vector. \n
969/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
970/// returned vector.
971/// \returns A 256-bit vector of [8 x float] containing the copied values.
972static __inline __m256 __DEFAULT_FN_ATTRS
973_mm256_permutevar_ps(__m256 __a, __m256i __c)
974{
975 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
976}
977
978/// Copies the values in a 128-bit vector of [2 x double] as specified
979/// by the immediate integer operand.
980///
981/// \headerfile <x86intrin.h>
982///
983/// \code
984/// __m128d _mm_permute_pd(__m128d A, const int C);
985/// \endcode
986///
987/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
988///
989/// \param A
990/// A 128-bit vector of [2 x double].
991/// \param C
992/// An immediate integer operand specifying how the values are to be
993/// copied. \n
994/// Bit [0]: \n
995/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
996/// vector. \n
997/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
998/// returned vector. \n
999/// Bit [1]: \n
1000/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1001/// returned vector. \n
1002/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1003/// returned vector.
1004/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C)                                                   \
  ((__m128d)__builtin_ia32_vpermilpd(                                          \
      (__v2df)(__m128d)(A), /* source vector of [2 x double] */               \
      (int)(C)))            /* immediate selector */
1007
1008/// Copies the values in a 256-bit vector of [4 x double] as specified by
1009/// the immediate integer operand.
1010///
1011/// \headerfile <x86intrin.h>
1012///
1013/// \code
1014/// __m256d _mm256_permute_pd(__m256d A, const int C);
1015/// \endcode
1016///
1017/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
1018///
1019/// \param A
1020/// A 256-bit vector of [4 x double].
1021/// \param C
1022/// An immediate integer operand specifying how the values are to be
1023/// copied. \n
1024/// Bit [0]: \n
1025/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
1026/// vector. \n
1027/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
1028/// returned vector. \n
1029/// Bit [1]: \n
1030/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
1031/// returned vector. \n
1032/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
1033/// returned vector. \n
1034/// Bit [2]: \n
1035/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
1036/// returned vector. \n
1037/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
1038/// returned vector. \n
1039/// Bit [3]: \n
1040/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
1041/// returned vector. \n
1042/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
1043/// returned vector.
1044/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C)                                                \
  ((__m256d)__builtin_ia32_vpermilpd256(                                       \
      (__v4df)(__m256d)(A), /* source vector of [4 x double] */               \
      (int)(C)))            /* immediate selector */
1047
1048/// Copies the values in a 128-bit vector of [4 x float] as specified by
1049/// the immediate integer operand.
1050///
1051/// \headerfile <x86intrin.h>
1052///
1053/// \code
1054/// __m128 _mm_permute_ps(__m128 A, const int C);
1055/// \endcode
1056///
1057/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1058///
1059/// \param A
1060/// A 128-bit vector of [4 x float].
1061/// \param C
1062/// An immediate integer operand specifying how the values are to be
1063/// copied. \n
1064/// Bits [1:0]: \n
1065/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1066/// returned vector. \n
1067/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1068/// returned vector. \n
1069/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1070/// returned vector. \n
1071/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1072/// returned vector. \n
1073/// Bits [3:2]: \n
1074/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1075/// returned vector. \n
1076/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1077/// returned vector. \n
1078/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1079/// returned vector. \n
1080/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1081/// returned vector. \n
1082/// Bits [5:4]: \n
1083/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1084/// returned vector. \n
1085/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1086/// returned vector. \n
1087/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1088/// returned vector. \n
1089/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1090/// returned vector. \n
1091/// Bits [7:6]: \n
1092/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1093/// returned vector. \n
1094/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1095/// returned vector. \n
1096/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1097/// returned vector. \n
1098/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1099/// returned vector.
1100/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C)                                                   \
  ((__m128)__builtin_ia32_vpermilps(                                           \
      (__v4sf)(__m128)(A), /* source vector of [4 x float] */                 \
      (int)(C)))           /* immediate selector */
1103
1104/// Copies the values in a 256-bit vector of [8 x float] as specified by
1105/// the immediate integer operand.
1106///
1107/// \headerfile <x86intrin.h>
1108///
1109/// \code
1110/// __m256 _mm256_permute_ps(__m256 A, const int C);
1111/// \endcode
1112///
1113/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
1114///
1115/// \param A
1116/// A 256-bit vector of [8 x float].
1117/// \param C
1118/// An immediate integer operand specifying how the values are to be
1119/// copied. \n
1120/// Bits [1:0]: \n
1121/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
1122/// returned vector. \n
1123/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
1124/// returned vector. \n
1125/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
1126/// returned vector. \n
1127/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
1128/// returned vector. \n
1129/// Bits [3:2]: \n
1130/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
1131/// returned vector. \n
1132/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
1133/// returned vector. \n
1134/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
1135/// returned vector. \n
1136/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
1137/// returned vector. \n
1138/// Bits [5:4]: \n
1139/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
1140/// returned vector. \n
1141/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
1142/// returned vector. \n
1143/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
1144/// returned vector. \n
1145/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
1146/// returned vector. \n
1147/// Bits [7:6]: \n
1148/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
1149/// returned vector. \n
1150/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
1151/// returned vector. \n
1152/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
1153/// returned vector. \n
1154/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
1155/// returned vector. \n
1156/// Bits [1:0]: \n
1157/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
1158/// returned vector. \n
1159/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
1160/// returned vector. \n
1161/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
1162/// returned vector. \n
1163/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
1164/// returned vector. \n
1165/// Bits [3:2]: \n
1166/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
1167/// returned vector. \n
1168/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
1169/// returned vector. \n
1170/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
1171/// returned vector. \n
1172/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
1173/// returned vector. \n
1174/// Bits [5:4]: \n
1175/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
1176/// returned vector. \n
1177/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
1178/// returned vector. \n
1179/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
1180/// returned vector. \n
1181/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
1182/// returned vector. \n
1183/// Bits [7:6]: \n
1184/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
1185/// returned vector. \n
1186/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
1187/// returned vector. \n
1188/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
1189/// returned vector. \n
1190/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
1191/// returned vector.
1192/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C)                                                \
  ((__m256)__builtin_ia32_vpermilps256(                                        \
      (__v8sf)(__m256)(A), /* source vector of [8 x float] */                 \
      (int)(C)))           /* immediate selector */
1195
1196/// Permutes 128-bit data values stored in two 256-bit vectors of
1197/// [4 x double], as specified by the immediate integer operand.
1198///
1199/// \headerfile <x86intrin.h>
1200///
1201/// \code
1202/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
1203/// \endcode
1204///
1205/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1206///
1207/// \param V1
1208/// A 256-bit vector of [4 x double].
1209/// \param V2
/// A 256-bit vector of [4 x double].
1211/// \param M
1212/// An immediate integer operand specifying how the values are to be
1213/// permuted. \n
1214/// Bits [1:0]: \n
1215/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1216/// destination. \n
1217/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1218/// destination. \n
1219/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1220/// destination. \n
1221/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1222/// destination. \n
1223/// Bits [5:4]: \n
1224/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1225/// destination. \n
1226/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1227/// destination. \n
1228/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1229/// destination. \n
1230/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1231/// destination.
1232/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M)                                      \
  ((__m256d)__builtin_ia32_vperm2f128_pd256(                                   \
      (__v4df)(__m256d)(V1), /* first source of 128-bit halves */             \
      (__v4df)(__m256d)(V2), /* second source of 128-bit halves */            \
      (int)(M)))             /* immediate selector */
1236
1237/// Permutes 128-bit data values stored in two 256-bit vectors of
1238/// [8 x float], as specified by the immediate integer operand.
1239///
1240/// \headerfile <x86intrin.h>
1241///
1242/// \code
1243/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
1244/// \endcode
1245///
1246/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1247///
1248/// \param V1
1249/// A 256-bit vector of [8 x float].
1250/// \param V2
1251/// A 256-bit vector of [8 x float].
1252/// \param M
1253/// An immediate integer operand specifying how the values are to be
1254/// permuted. \n
1255/// Bits [1:0]: \n
1256/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1257/// destination. \n
1258/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1259/// destination. \n
1260/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1261/// destination. \n
1262/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1263/// destination. \n
1264/// Bits [5:4]: \n
1265/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1266/// destination. \n
1267/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1268/// destination. \n
1269/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1270/// destination. \n
1271/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1272/// destination.
1273/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M)                                      \
  ((__m256)__builtin_ia32_vperm2f128_ps256(                                    \
      (__v8sf)(__m256)(V1), /* first source of 128-bit halves */              \
      (__v8sf)(__m256)(V2), /* second source of 128-bit halves */             \
      (int)(M)))            /* immediate selector */
1277
1278/// Permutes 128-bit data values stored in two 256-bit integer vectors,
1279/// as specified by the immediate integer operand.
1280///
1281/// \headerfile <x86intrin.h>
1282///
1283/// \code
1284/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
1285/// \endcode
1286///
1287/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1288///
1289/// \param V1
1290/// A 256-bit integer vector.
1291/// \param V2
1292/// A 256-bit integer vector.
1293/// \param M
/// An immediate integer operand specifying how the values are to be
/// copied. \n
1295/// Bits [1:0]: \n
1296/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
1297/// destination. \n
1298/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
1299/// destination. \n
1300/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
1301/// destination. \n
1302/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
1303/// destination. \n
1304/// Bits [5:4]: \n
1305/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
1306/// destination. \n
1307/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
1308/// destination. \n
1309/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
1310/// destination. \n
1311/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
1312/// destination.
1313/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M)                                   \
  ((__m256i)__builtin_ia32_vperm2f128_si256(                                   \
      (__v8si)(__m256i)(V1), /* first source of 128-bit halves */             \
      (__v8si)(__m256i)(V2), /* second source of 128-bit halves */            \
      (int)(M)))             /* immediate selector */
1317
1318/* Vector Blend */
1319/// Merges 64-bit double-precision data values stored in either of the
1320/// two 256-bit vectors of [4 x double], as specified by the immediate
1321/// integer operand.
1322///
1323/// \headerfile <x86intrin.h>
1324///
1325/// \code
1326/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
1327/// \endcode
1328///
1329/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
1330///
1331/// \param V1
1332/// A 256-bit vector of [4 x double].
1333/// \param V2
1334/// A 256-bit vector of [4 x double].
1335/// \param M
1336/// An immediate integer operand, with mask bits [3:0] specifying how the
1337/// values are to be copied. The position of the mask bit corresponds to the
1338/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
1339/// element in operand \a V1 is copied to the same position in the
1340/// destination. When a mask bit is 1, the corresponding 64-bit element in
1341/// operand \a V2 is copied to the same position in the destination.
1342/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M)                                             \
  ((__m256d)__builtin_ia32_blendpd256(                                         \
      (__v4df)(__m256d)(V1), /* taken where the mask bit is 0 */              \
      (__v4df)(__m256d)(V2), /* taken where the mask bit is 1 */              \
      (int)(M)))             /* immediate mask */
1346
1347/// Merges 32-bit single-precision data values stored in either of the
1348/// two 256-bit vectors of [8 x float], as specified by the immediate
1349/// integer operand.
1350///
1351/// \headerfile <x86intrin.h>
1352///
1353/// \code
1354/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
1355/// \endcode
1356///
1357/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
1358///
1359/// \param V1
1360/// A 256-bit vector of [8 x float].
1361/// \param V2
1362/// A 256-bit vector of [8 x float].
1363/// \param M
1364/// An immediate integer operand, with mask bits [7:0] specifying how the
1365/// values are to be copied. The position of the mask bit corresponds to the
1366/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
1367/// element in operand \a V1 is copied to the same position in the
1368/// destination. When a mask bit is 1, the corresponding 32-bit element in
1369/// operand \a V2 is copied to the same position in the destination.
1370/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M)                                             \
  ((__m256)__builtin_ia32_blendps256(                                          \
      (__v8sf)(__m256)(V1), /* taken where the mask bit is 0 */               \
      (__v8sf)(__m256)(V2), /* taken where the mask bit is 1 */               \
      (int)(M)))            /* immediate mask */
1374
1375/// Merges 64-bit double-precision data values stored in either of the
1376/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1377/// operand.
1378///
1379/// \headerfile <x86intrin.h>
1380///
1381/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1382///
1383/// \param __a
1384/// A 256-bit vector of [4 x double].
1385/// \param __b
1386/// A 256-bit vector of [4 x double].
1387/// \param __c
1388/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1389/// how the values are to be copied. The position of the mask bit corresponds
1390/// to the most significant bit of a copied value. When a mask bit is 0, the
1391/// corresponding 64-bit element in operand \a __a is copied to the same
1392/// position in the destination. When a mask bit is 1, the corresponding
1393/// 64-bit element in operand \a __b is copied to the same position in the
1394/// destination.
1395/// \returns A 256-bit vector of [4 x double] containing the copied values.
1396static __inline __m256d __DEFAULT_FN_ATTRS
1397_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1398{
1399 return (__m256d)__builtin_ia32_blendvpd256(
1400 (__v4df)__a, (__v4df)__b, (__v4df)__c);
1401}
1402
1403/// Merges 32-bit single-precision data values stored in either of the
1404/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1405/// operand.
1406///
1407/// \headerfile <x86intrin.h>
1408///
1409/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1410///
1411/// \param __a
1412/// A 256-bit vector of [8 x float].
1413/// \param __b
1414/// A 256-bit vector of [8 x float].
1415/// \param __c
1416/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1417/// and 31 specifying how the values are to be copied. The position of the
1418/// mask bit corresponds to the most significant bit of a copied value. When
1419/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1420/// copied to the same position in the destination. When a mask bit is 1, the
1421/// corresponding 32-bit element in operand \a __b is copied to the same
1422/// position in the destination.
1423/// \returns A 256-bit vector of [8 x float] containing the copied values.
1424static __inline __m256 __DEFAULT_FN_ATTRS
1425_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1426{
1427 return (__m256)__builtin_ia32_blendvps256(
1428 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1429}
1430
1431/* Vector Dot Product */
1432/// Computes two dot products in parallel, using the lower and upper
1433/// halves of two [8 x float] vectors as input to the two computations, and
1434/// returning the two dot products in the lower and upper halves of the
1435/// [8 x float] result.
1436///
1437/// The immediate integer operand controls which input elements will
1438/// contribute to the dot product, and where the final results are returned.
1439/// In general, for each dot product, the four corresponding elements of the
1440/// input vectors are multiplied; the first two and second two products are
1441/// summed, then the two sums are added to form the final result.
1442///
1443/// \headerfile <x86intrin.h>
1444///
1445/// \code
1446/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
1447/// \endcode
1448///
1449/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
1450///
1451/// \param V1
1452/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1453/// \param V2
1454/// A vector of [8 x float] values, treated as two [4 x float] vectors.
1455/// \param M
1456/// An immediate integer argument. Bits [7:4] determine which elements of
1457/// the input vectors are used, with bit [4] corresponding to the lowest
1458/// element and bit [7] corresponding to the highest element of each [4 x
1459/// float] subvector. If a bit is set, the corresponding elements from the
1460/// two input vectors are used as an input for dot product; otherwise that
1461/// input is treated as zero. Bits [3:0] determine which elements of the
1462/// result will receive a copy of the final dot product, with bit [0]
1463/// corresponding to the lowest element and bit [3] corresponding to the
1464/// highest element of each [4 x float] subvector. If a bit is set, the dot
1465/// product is returned in the corresponding element; otherwise that element
1466/// is set to zero. The bitmask is applied in the same way to each of the
1467/// two parallel dot product computations.
1468/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M)                                                \
  ((__m256)__builtin_ia32_dpps256(                                             \
      (__v8sf)(__m256)(V1), /* first pair of [4 x float] inputs */            \
      (__v8sf)(__m256)(V2), /* second pair of [4 x float] inputs */           \
      (M)))                 /* immediate: input-select and result-store bits */
1472
1473/* Vector shuffle */
1474/// Selects 8 float values from the 256-bit operands of [8 x float], as
1475/// specified by the immediate value operand.
1476///
1477/// The four selected elements in each operand are copied to the destination
1478/// according to the bits specified in the immediate operand. The selected
1479/// elements from the first 256-bit operand are copied to bits [63:0] and
1480/// bits [191:128] of the destination, and the selected elements from the
1481/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1482/// the destination. For example, if bits [7:0] of the immediate operand
1483/// contain a value of 0xFF, the 256-bit destination vector would contain the
1484/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1485///
1486/// \headerfile <x86intrin.h>
1487///
1488/// \code
1489/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1490/// \endcode
1491///
1492/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1493///
1494/// \param a
1495/// A 256-bit vector of [8 x float]. The four selected elements in this
1496/// operand are copied to bits [63:0] and bits [191:128] in the destination,
1497/// according to the bits specified in the immediate operand.
1498/// \param b
1499/// A 256-bit vector of [8 x float]. The four selected elements in this
1500/// operand are copied to bits [127:64] and bits [255:192] in the
1501/// destination, according to the bits specified in the immediate operand.
1502/// \param mask
1503/// An immediate value containing an 8-bit value specifying which elements to
/// copy from \a a and \a b. \n
1505/// Bits [3:0] specify the values copied from operand \a a. \n
1506/// Bits [7:4] specify the values copied from operand \a b. \n
1507/// The destinations within the 256-bit destination are assigned values as
1508/// follows, according to the bit value assignments described below: \n
1509/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1510/// destination. \n
1511/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1512/// destination. \n
1513/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1514/// destination. \n
1515/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1516/// the destination. \n
1517/// Bit value assignments: \n
1518/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1519/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1520/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1521/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1522/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1523/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1524/// <c>[b6, b4, b2, b0]</c>.
1525/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask)                                          \
  ((__m256)__builtin_ia32_shufps256(                                           \
      (__v8sf)(__m256)(a), /* feeds bits [63:0] and [191:128] */              \
      (__v8sf)(__m256)(b), /* feeds bits [127:64] and [255:192] */            \
      (int)(mask)))        /* immediate element selector */
1529
/// Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand.
///
///    The selected elements from the first 256-bit operand are copied to bits
///    [63:0] and bits [191:128] in the destination, and the selected elements
///    from the second 256-bit operand are copied to bits [127:64] and bits
///    [255:192] in the destination. For example, if bits [3:0] of the immediate
///    operand contain a value of 0xF, the 256-bit destination vector would
///    contain the following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value whose bits [3:0] specify which elements to copy from
///    \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
                                     (__v4df)(__m256d)(b), (int)(mask)))
1575
/* Compare */
/* Comparison predicates for the immediate operand of the _mm*_cmp_* macros.
 * Each value selects one comparison operation via bits [4:0]. */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1609
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [2 x double] vector consisting of two doubles corresponding to
///    the two comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1669
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [4 x float] vector consisting of four floats corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1729
/// Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand.
///
///    Returns a [4 x double] vector consisting of four doubles corresponding to
///    the four comparison results: zero if the comparison is false, and all 1's
///    if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)))
1789
/// Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand.
///
///    Returns a [8 x float] vector consisting of eight floats corresponding to
///    the eight comparison results: zero if the comparison is false, and all
///    1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)))
1849
/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    If the result is true, all 64 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)))
1908
/// Compares each of the corresponding scalar values of two 128-bit
///    vectors of [4 x float], using the operation specified by the immediate
///    integer operand.
///
///    If the result is true, all 32 bits of the destination vector are set;
///    otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
///    0x08: Equal (unordered, non-signaling) \n
///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
///    0x0A: Not-greater-than (unordered, signaling) \n
///    0x0B: False (ordered, non-signaling) \n
///    0x0C: Not-equal (ordered, non-signaling) \n
///    0x0D: Greater-than-or-equal (ordered, signaling) \n
///    0x0E: Greater-than (ordered, signaling) \n
///    0x0F: True (unordered, non-signaling) \n
///    0x10: Equal (ordered, signaling) \n
///    0x11: Less-than (ordered, non-signaling) \n
///    0x12: Less-than-or-equal (ordered, non-signaling) \n
///    0x13: Unordered (signaling) \n
///    0x14: Not-equal (unordered, signaling) \n
///    0x15: Not-less-than (unordered, non-signaling) \n
///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17: Ordered (signaling) \n
///    0x18: Equal (unordered, signaling) \n
///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1A: Not-greater-than (unordered, non-signaling) \n
///    0x1B: False (ordered, signaling) \n
///    0x1C: Not-equal (ordered, signaling) \n
///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
///    0x1E: Greater-than (ordered, non-signaling) \n
///    0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)))
1967
/// Takes a [8 x i32] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit vector of [8 x i32].
/// \param N
///    An immediate integer operand with bits [2:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
///    packed data.
#define _mm256_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
1989
/// Takes a [16 x i16] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [16 x i16].
/// \param N
///    An immediate integer operand with bits [3:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
///    packed data.
#define _mm256_extract_epi16(X, N) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
                                                     (int)(N)))
2012
/// Takes a [32 x i8] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [32 x i8].
/// \param N
///    An immediate integer operand with bits [4:0] determining which vector
///    element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
///    packed data.
#define _mm256_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
                                                    (int)(N)))
2035
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
///    indexed by the immediate constant operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A 256-bit integer vector of [4 x i64].
/// \param N
///    An immediate integer operand with bits [1:0] determining which vector
///    element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
///    packed data.
#define _mm256_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
2059
/// Takes a [8 x i32] vector and replaces the vector element value
///    indexed by the immediate constant operand by a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [8 x i32] to be used by the insert operation.
/// \param I
///    An integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
                                        (int)(I), (int)(N)))
2085
2086
/// Takes a [16 x i16] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [16 x i16] to be used by the insert operation.
/// \param I
///    An i16 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2112
/// Takes a [32 x i8] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [32 x i8] to be used by the insert operation.
/// \param I
///    An i8 integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///    \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
                                         (int)(I), (int)(N)))
2138
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2166
2167/* Conversion */
2168/// Converts a vector of [4 x i32] into a vector of [4 x double].
2169///
2170/// \headerfile <x86intrin.h>
2171///
2172/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2173///
2174/// \param __a
2175/// A 128-bit integer vector of [4 x i32].
2176/// \returns A 256-bit vector of [4 x double] containing the converted values.
2177static __inline __m256d __DEFAULT_FN_ATTRS
2178_mm256_cvtepi32_pd(__m128i __a)
2179{
2180 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2181}
2182
2183/// Converts a vector of [8 x i32] into a vector of [8 x float].
2184///
2185/// \headerfile <x86intrin.h>
2186///
2187/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2188///
2189/// \param __a
2190/// A 256-bit integer vector.
2191/// \returns A 256-bit vector of [8 x float] containing the converted values.
2192static __inline __m256 __DEFAULT_FN_ATTRS
2193_mm256_cvtepi32_ps(__m256i __a)
2194{
2195 return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2196}
2197
2198/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2199/// [4 x float].
2200///
2201/// \headerfile <x86intrin.h>
2202///
2203/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2204///
2205/// \param __a
2206/// A 256-bit vector of [4 x double].
2207/// \returns A 128-bit vector of [4 x float] containing the converted values.
2208static __inline __m128 __DEFAULT_FN_ATTRS
2209_mm256_cvtpd_ps(__m256d __a)
2210{
2211 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2212}
2213
2214/// Converts a vector of [8 x float] into a vector of [8 x i32].
2215///
2216/// \headerfile <x86intrin.h>
2217///
2218/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2219///
2220/// \param __a
2221/// A 256-bit vector of [8 x float].
2222/// \returns A 256-bit integer vector containing the converted values.
2223static __inline __m256i __DEFAULT_FN_ATTRS
2224_mm256_cvtps_epi32(__m256 __a)
2225{
2226 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2227}
2228
2229/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2230/// x double].
2231///
2232/// \headerfile <x86intrin.h>
2233///
2234/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2235///
2236/// \param __a
2237/// A 128-bit vector of [4 x float].
2238/// \returns A 256-bit vector of [4 x double] containing the converted values.
2239static __inline __m256d __DEFAULT_FN_ATTRS
2240_mm256_cvtps_pd(__m128 __a)
2241{
2242 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2243}
2244
2245/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2246/// x i32], truncating the result by rounding towards zero when it is
2247/// inexact.
2248///
2249/// \headerfile <x86intrin.h>
2250///
2251/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2252///
2253/// \param __a
2254/// A 256-bit vector of [4 x double].
2255/// \returns A 128-bit integer vector containing the converted values.
2256static __inline __m128i __DEFAULT_FN_ATTRS
2257_mm256_cvttpd_epi32(__m256d __a)
2258{
2259 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2260}
2261
2262/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
2263/// x i32]. When a conversion is inexact, the value returned is rounded
2264/// according to the rounding control bits in the MXCSR register.
2265///
2266/// \headerfile <x86intrin.h>
2267///
2268/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2269///
2270/// \param __a
2271/// A 256-bit vector of [4 x double].
2272/// \returns A 128-bit integer vector containing the converted values.
2273static __inline __m128i __DEFAULT_FN_ATTRS
2274_mm256_cvtpd_epi32(__m256d __a)
2275{
2276 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2277}
2278
2279/// Converts a vector of [8 x float] into a vector of [8 x i32],
2280/// truncating the result by rounding towards zero when it is inexact.
2281///
2282/// \headerfile <x86intrin.h>
2283///
2284/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2285///
2286/// \param __a
2287/// A 256-bit vector of [8 x float].
2288/// \returns A 256-bit integer vector containing the converted values.
2289static __inline __m256i __DEFAULT_FN_ATTRS
2290_mm256_cvttps_epi32(__m256 __a)
2291{
2292 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2293}
2294
2295/// Returns the first element of the input vector of [4 x double].
2296///
2297/// \headerfile <x86intrin.h>
2298///
2299/// This intrinsic is a utility function and does not correspond to a specific
2300/// instruction.
2301///
2302/// \param __a
2303/// A 256-bit vector of [4 x double].
2304/// \returns A 64 bit double containing the first element of the input vector.
2305static __inline double __DEFAULT_FN_ATTRS
2306_mm256_cvtsd_f64(__m256d __a)
2307{
2308 return __a[0];
2309}
2310
2311/// Returns the first element of the input vector of [8 x i32].
2312///
2313/// \headerfile <x86intrin.h>
2314///
2315/// This intrinsic is a utility function and does not correspond to a specific
2316/// instruction.
2317///
2318/// \param __a
2319/// A 256-bit vector of [8 x i32].
2320/// \returns A 32 bit integer containing the first element of the input vector.
2321static __inline int __DEFAULT_FN_ATTRS
2322_mm256_cvtsi256_si32(__m256i __a)
2323{
2324 __v8si __b = (__v8si)__a;
2325 return __b[0];
2326}
2327
2328/// Returns the first element of the input vector of [8 x float].
2329///
2330/// \headerfile <x86intrin.h>
2331///
2332/// This intrinsic is a utility function and does not correspond to a specific
2333/// instruction.
2334///
2335/// \param __a
2336/// A 256-bit vector of [8 x float].
2337/// \returns A 32 bit float containing the first element of the input vector.
2338static __inline float __DEFAULT_FN_ATTRS
2339_mm256_cvtss_f32(__m256 __a)
2340{
2341 return __a[0];
2342}
2343
2344/* Vector replicate */
2345/// Moves and duplicates odd-indexed values from a 256-bit vector of
2346/// [8 x float] to float values in a 256-bit vector of [8 x float].
2347///
2348/// \headerfile <x86intrin.h>
2349///
2350/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2351///
2352/// \param __a
2353/// A 256-bit vector of [8 x float]. \n
2354/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2355/// the return value. \n
2356/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2357/// the return value. \n
2358/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2359/// return value. \n
2360/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2361/// return value.
2362/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2363/// values.
2364static __inline __m256 __DEFAULT_FN_ATTRS
2365_mm256_movehdup_ps(__m256 __a)
2366{
2367 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2368}
2369
2370/// Moves and duplicates even-indexed values from a 256-bit vector of
2371/// [8 x float] to float values in a 256-bit vector of [8 x float].
2372///
2373/// \headerfile <x86intrin.h>
2374///
2375/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2376///
2377/// \param __a
2378/// A 256-bit vector of [8 x float]. \n
2379/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2380/// the return value. \n
2381/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2382/// the return value. \n
2383/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2384/// return value. \n
2385/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2386/// return value.
2387/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2388/// values.
2389static __inline __m256 __DEFAULT_FN_ATTRS
2390_mm256_moveldup_ps(__m256 __a)
2391{
2392 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2393}
2394
2395/// Moves and duplicates double-precision floating point values from a
2396/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
2397/// vector of [4 x double].
2398///
2399/// \headerfile <x86intrin.h>
2400///
2401/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2402///
2403/// \param __a
2404/// A 256-bit vector of [4 x double]. \n
2405/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2406/// return value. \n
2407/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2408/// the return value.
2409/// \returns A 256-bit vector of [4 x double] containing the moved and
2410/// duplicated values.
2411static __inline __m256d __DEFAULT_FN_ATTRS
2412_mm256_movedup_pd(__m256d __a)
2413{
2414 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2415}
2416
2417/* Unpack and Interleave */
2418/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2419/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2420///
2421/// \headerfile <x86intrin.h>
2422///
2423/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2424///
2425/// \param __a
2426/// A 256-bit floating-point vector of [4 x double]. \n
2427/// Bits [127:64] are written to bits [63:0] of the return value. \n
2428/// Bits [255:192] are written to bits [191:128] of the return value. \n
2429/// \param __b
2430/// A 256-bit floating-point vector of [4 x double]. \n
2431/// Bits [127:64] are written to bits [127:64] of the return value. \n
2432/// Bits [255:192] are written to bits [255:192] of the return value. \n
2433/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2434static __inline __m256d __DEFAULT_FN_ATTRS
2435_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2436{
2437 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2438}
2439
2440/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2441/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2442///
2443/// \headerfile <x86intrin.h>
2444///
2445/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2446///
2447/// \param __a
2448/// A 256-bit floating-point vector of [4 x double]. \n
2449/// Bits [63:0] are written to bits [63:0] of the return value. \n
2450/// Bits [191:128] are written to bits [191:128] of the return value.
2451/// \param __b
2452/// A 256-bit floating-point vector of [4 x double]. \n
2453/// Bits [63:0] are written to bits [127:64] of the return value. \n
2454/// Bits [191:128] are written to bits [255:192] of the return value. \n
2455/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2456static __inline __m256d __DEFAULT_FN_ATTRS
2457_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2458{
2459 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2460}
2461
2462/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2463/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2464/// vector of [8 x float].
2465///
2466/// \headerfile <x86intrin.h>
2467///
2468/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2469///
2470/// \param __a
2471/// A 256-bit vector of [8 x float]. \n
2472/// Bits [95:64] are written to bits [31:0] of the return value. \n
2473/// Bits [127:96] are written to bits [95:64] of the return value. \n
2474/// Bits [223:192] are written to bits [159:128] of the return value. \n
2475/// Bits [255:224] are written to bits [223:192] of the return value.
2476/// \param __b
2477/// A 256-bit vector of [8 x float]. \n
2478/// Bits [95:64] are written to bits [63:32] of the return value. \n
2479/// Bits [127:96] are written to bits [127:96] of the return value. \n
2480/// Bits [223:192] are written to bits [191:160] of the return value. \n
2481/// Bits [255:224] are written to bits [255:224] of the return value.
2482/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2483static __inline __m256 __DEFAULT_FN_ATTRS
2484_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2485{
2486 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2487}
2488
2489/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2490/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2491/// vector of [8 x float].
2492///
2493/// \headerfile <x86intrin.h>
2494///
2495/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2496///
2497/// \param __a
2498/// A 256-bit vector of [8 x float]. \n
2499/// Bits [31:0] are written to bits [31:0] of the return value. \n
2500/// Bits [63:32] are written to bits [95:64] of the return value. \n
2501/// Bits [159:128] are written to bits [159:128] of the return value. \n
2502/// Bits [191:160] are written to bits [223:192] of the return value.
2503/// \param __b
2504/// A 256-bit vector of [8 x float]. \n
2505/// Bits [31:0] are written to bits [63:32] of the return value. \n
2506/// Bits [63:32] are written to bits [127:96] of the return value. \n
2507/// Bits [159:128] are written to bits [191:160] of the return value. \n
2508/// Bits [191:160] are written to bits [255:224] of the return value.
2509/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2510static __inline __m256 __DEFAULT_FN_ATTRS
2511_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2512{
2513 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2514}
2515
2516/* Bit Test */
2517/// Given two 128-bit floating-point vectors of [2 x double], perform an
2518/// element-by-element comparison of the double-precision element in the
2519/// first source vector and the corresponding element in the second source
2520/// vector.
2521///
2522/// The EFLAGS register is updated as follows: \n
2523/// If there is at least one pair of double-precision elements where the
2524/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2525/// ZF flag is set to 1. \n
2526/// If there is at least one pair of double-precision elements where the
2527/// sign-bit of the first element is 0 and the sign-bit of the second element
2528/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2529/// This intrinsic returns the value of the ZF flag.
2530///
2531/// \headerfile <x86intrin.h>
2532///
2533/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2534///
2535/// \param __a
2536/// A 128-bit vector of [2 x double].
2537/// \param __b
2538/// A 128-bit vector of [2 x double].
2539/// \returns the ZF flag in the EFLAGS register.
2540static __inline int __DEFAULT_FN_ATTRS128
2541_mm_testz_pd(__m128d __a, __m128d __b)
2542{
2543 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2544}
2545
2546/// Given two 128-bit floating-point vectors of [2 x double], perform an
2547/// element-by-element comparison of the double-precision element in the
2548/// first source vector and the corresponding element in the second source
2549/// vector.
2550///
2551/// The EFLAGS register is updated as follows: \n
2552/// If there is at least one pair of double-precision elements where the
2553/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2554/// ZF flag is set to 1. \n
2555/// If there is at least one pair of double-precision elements where the
2556/// sign-bit of the first element is 0 and the sign-bit of the second element
2557/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2558/// This intrinsic returns the value of the CF flag.
2559///
2560/// \headerfile <x86intrin.h>
2561///
2562/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2563///
2564/// \param __a
2565/// A 128-bit vector of [2 x double].
2566/// \param __b
2567/// A 128-bit vector of [2 x double].
2568/// \returns the CF flag in the EFLAGS register.
2569static __inline int __DEFAULT_FN_ATTRS128
2570_mm_testc_pd(__m128d __a, __m128d __b)
2571{
2572 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2573}
2574
2575/// Given two 128-bit floating-point vectors of [2 x double], perform an
2576/// element-by-element comparison of the double-precision element in the
2577/// first source vector and the corresponding element in the second source
2578/// vector.
2579///
2580/// The EFLAGS register is updated as follows: \n
2581/// If there is at least one pair of double-precision elements where the
2582/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2583/// ZF flag is set to 1. \n
2584/// If there is at least one pair of double-precision elements where the
2585/// sign-bit of the first element is 0 and the sign-bit of the second element
2586/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2587/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2588/// otherwise it returns 0.
2589///
2590/// \headerfile <x86intrin.h>
2591///
2592/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2593///
2594/// \param __a
2595/// A 128-bit vector of [2 x double].
2596/// \param __b
2597/// A 128-bit vector of [2 x double].
2598/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2599static __inline int __DEFAULT_FN_ATTRS128
2600_mm_testnzc_pd(__m128d __a, __m128d __b)
2601{
2602 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2603}
2604
2605/// Given two 128-bit floating-point vectors of [4 x float], perform an
2606/// element-by-element comparison of the single-precision element in the
2607/// first source vector and the corresponding element in the second source
2608/// vector.
2609///
2610/// The EFLAGS register is updated as follows: \n
2611/// If there is at least one pair of single-precision elements where the
2612/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2613/// ZF flag is set to 1. \n
2614/// If there is at least one pair of single-precision elements where the
2615/// sign-bit of the first element is 0 and the sign-bit of the second element
2616/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2617/// This intrinsic returns the value of the ZF flag.
2618///
2619/// \headerfile <x86intrin.h>
2620///
2621/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2622///
2623/// \param __a
2624/// A 128-bit vector of [4 x float].
2625/// \param __b
2626/// A 128-bit vector of [4 x float].
2627/// \returns the ZF flag.
2628static __inline int __DEFAULT_FN_ATTRS128
2629_mm_testz_ps(__m128 __a, __m128 __b)
2630{
2631 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2632}
2633
2634/// Given two 128-bit floating-point vectors of [4 x float], perform an
2635/// element-by-element comparison of the single-precision element in the
2636/// first source vector and the corresponding element in the second source
2637/// vector.
2638///
2639/// The EFLAGS register is updated as follows: \n
2640/// If there is at least one pair of single-precision elements where the
2641/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2642/// ZF flag is set to 1. \n
2643/// If there is at least one pair of single-precision elements where the
2644/// sign-bit of the first element is 0 and the sign-bit of the second element
2645/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2646/// This intrinsic returns the value of the CF flag.
2647///
2648/// \headerfile <x86intrin.h>
2649///
2650/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2651///
2652/// \param __a
2653/// A 128-bit vector of [4 x float].
2654/// \param __b
2655/// A 128-bit vector of [4 x float].
2656/// \returns the CF flag.
2657static __inline int __DEFAULT_FN_ATTRS128
2658_mm_testc_ps(__m128 __a, __m128 __b)
2659{
2660 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2661}
2662
2663/// Given two 128-bit floating-point vectors of [4 x float], perform an
2664/// element-by-element comparison of the single-precision element in the
2665/// first source vector and the corresponding element in the second source
2666/// vector.
2667///
2668/// The EFLAGS register is updated as follows: \n
2669/// If there is at least one pair of single-precision elements where the
2670/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2671/// ZF flag is set to 1. \n
2672/// If there is at least one pair of single-precision elements where the
2673/// sign-bit of the first element is 0 and the sign-bit of the second element
2674/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2675/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2676/// otherwise it returns 0.
2677///
2678/// \headerfile <x86intrin.h>
2679///
2680/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2681///
2682/// \param __a
2683/// A 128-bit vector of [4 x float].
2684/// \param __b
2685/// A 128-bit vector of [4 x float].
2686/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2687static __inline int __DEFAULT_FN_ATTRS128
2688_mm_testnzc_ps(__m128 __a, __m128 __b)
2689{
2690 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2691}
2692
2693/// Given two 256-bit floating-point vectors of [4 x double], perform an
2694/// element-by-element comparison of the double-precision elements in the
2695/// first source vector and the corresponding elements in the second source
2696/// vector.
2697///
2698/// The EFLAGS register is updated as follows: \n
2699/// If there is at least one pair of double-precision elements where the
2700/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2701/// ZF flag is set to 1. \n
2702/// If there is at least one pair of double-precision elements where the
2703/// sign-bit of the first element is 0 and the sign-bit of the second element
2704/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2705/// This intrinsic returns the value of the ZF flag.
2706///
2707/// \headerfile <x86intrin.h>
2708///
2709/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2710///
2711/// \param __a
2712/// A 256-bit vector of [4 x double].
2713/// \param __b
2714/// A 256-bit vector of [4 x double].
2715/// \returns the ZF flag.
2716static __inline int __DEFAULT_FN_ATTRS
2717_mm256_testz_pd(__m256d __a, __m256d __b)
2718{
2719 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2720}
2721
2722/// Given two 256-bit floating-point vectors of [4 x double], perform an
2723/// element-by-element comparison of the double-precision elements in the
2724/// first source vector and the corresponding elements in the second source
2725/// vector.
2726///
2727/// The EFLAGS register is updated as follows: \n
2728/// If there is at least one pair of double-precision elements where the
2729/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2730/// ZF flag is set to 1. \n
2731/// If there is at least one pair of double-precision elements where the
2732/// sign-bit of the first element is 0 and the sign-bit of the second element
2733/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2734/// This intrinsic returns the value of the CF flag.
2735///
2736/// \headerfile <x86intrin.h>
2737///
2738/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2739///
2740/// \param __a
2741/// A 256-bit vector of [4 x double].
2742/// \param __b
2743/// A 256-bit vector of [4 x double].
2744/// \returns the CF flag.
2745static __inline int __DEFAULT_FN_ATTRS
2746_mm256_testc_pd(__m256d __a, __m256d __b)
2747{
2748 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2749}
2750
2751/// Given two 256-bit floating-point vectors of [4 x double], perform an
2752/// element-by-element comparison of the double-precision elements in the
2753/// first source vector and the corresponding elements in the second source
2754/// vector.
2755///
2756/// The EFLAGS register is updated as follows: \n
2757/// If there is at least one pair of double-precision elements where the
2758/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2759/// ZF flag is set to 1. \n
2760/// If there is at least one pair of double-precision elements where the
2761/// sign-bit of the first element is 0 and the sign-bit of the second element
2762/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2763/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2764/// otherwise it returns 0.
2765///
2766/// \headerfile <x86intrin.h>
2767///
2768/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2769///
2770/// \param __a
2771/// A 256-bit vector of [4 x double].
2772/// \param __b
2773/// A 256-bit vector of [4 x double].
2774/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2775static __inline int __DEFAULT_FN_ATTRS
2776_mm256_testnzc_pd(__m256d __a, __m256d __b)
2777{
2778 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2779}
2780
2781/// Given two 256-bit floating-point vectors of [8 x float], perform an
2782/// element-by-element comparison of the single-precision element in the
2783/// first source vector and the corresponding element in the second source
2784/// vector.
2785///
2786/// The EFLAGS register is updated as follows: \n
2787/// If there is at least one pair of single-precision elements where the
2788/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2789/// ZF flag is set to 1. \n
2790/// If there is at least one pair of single-precision elements where the
2791/// sign-bit of the first element is 0 and the sign-bit of the second element
2792/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2793/// This intrinsic returns the value of the ZF flag.
2794///
2795/// \headerfile <x86intrin.h>
2796///
2797/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2798///
2799/// \param __a
2800/// A 256-bit vector of [8 x float].
2801/// \param __b
2802/// A 256-bit vector of [8 x float].
2803/// \returns the ZF flag.
2804static __inline int __DEFAULT_FN_ATTRS
2805_mm256_testz_ps(__m256 __a, __m256 __b)
2806{
2807 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2808}
2809
2810/// Given two 256-bit floating-point vectors of [8 x float], perform an
2811/// element-by-element comparison of the single-precision element in the
2812/// first source vector and the corresponding element in the second source
2813/// vector.
2814///
2815/// The EFLAGS register is updated as follows: \n
2816/// If there is at least one pair of single-precision elements where the
2817/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2818/// ZF flag is set to 1. \n
2819/// If there is at least one pair of single-precision elements where the
2820/// sign-bit of the first element is 0 and the sign-bit of the second element
2821/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2822/// This intrinsic returns the value of the CF flag.
2823///
2824/// \headerfile <x86intrin.h>
2825///
2826/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2827///
2828/// \param __a
2829/// A 256-bit vector of [8 x float].
2830/// \param __b
2831/// A 256-bit vector of [8 x float].
2832/// \returns the CF flag.
2833static __inline int __DEFAULT_FN_ATTRS
2834_mm256_testc_ps(__m256 __a, __m256 __b)
2835{
2836 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2837}
2838
2839/// Given two 256-bit floating-point vectors of [8 x float], perform an
2840/// element-by-element comparison of the single-precision elements in the
2841/// first source vector and the corresponding elements in the second source
2842/// vector.
2843///
2844/// The EFLAGS register is updated as follows: \n
2845/// If there is at least one pair of single-precision elements where the
2846/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2847/// ZF flag is set to 1. \n
2848/// If there is at least one pair of single-precision elements where the
2849/// sign-bit of the first element is 0 and the sign-bit of the second element
2850/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2851/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2852/// otherwise it returns 0.
2853///
2854/// \headerfile <x86intrin.h>
2855///
2856/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2857///
2858/// \param __a
2859/// A 256-bit vector of [8 x float].
2860/// \param __b
2861/// A 256-bit vector of [8 x float].
2862/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2863static __inline int __DEFAULT_FN_ATTRS
2864_mm256_testnzc_ps(__m256 __a, __m256 __b)
2865{
2866 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2867}
2868
2869/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2870/// of the two source vectors.
2871///
2872/// The EFLAGS register is updated as follows: \n
2873/// If there is at least one pair of bits where both bits are 1, the ZF flag
2874/// is set to 0. Otherwise the ZF flag is set to 1. \n
2875/// If there is at least one pair of bits where the bit from the first source
2876/// vector is 0 and the bit from the second source vector is 1, the CF flag
2877/// is set to 0. Otherwise the CF flag is set to 1. \n
2878/// This intrinsic returns the value of the ZF flag.
2879///
2880/// \headerfile <x86intrin.h>
2881///
2882/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2883///
2884/// \param __a
2885/// A 256-bit integer vector.
2886/// \param __b
2887/// A 256-bit integer vector.
2888/// \returns the ZF flag.
2889static __inline int __DEFAULT_FN_ATTRS
2890_mm256_testz_si256(__m256i __a, __m256i __b)
2891{
2892 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2893}
2894
2895/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2896/// of the two source vectors.
2897///
2898/// The EFLAGS register is updated as follows: \n
2899/// If there is at least one pair of bits where both bits are 1, the ZF flag
2900/// is set to 0. Otherwise the ZF flag is set to 1. \n
2901/// If there is at least one pair of bits where the bit from the first source
2902/// vector is 0 and the bit from the second source vector is 1, the CF flag
2903/// is set to 0. Otherwise the CF flag is set to 1. \n
2904/// This intrinsic returns the value of the CF flag.
2905///
2906/// \headerfile <x86intrin.h>
2907///
2908/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2909///
2910/// \param __a
2911/// A 256-bit integer vector.
2912/// \param __b
2913/// A 256-bit integer vector.
2914/// \returns the CF flag.
2915static __inline int __DEFAULT_FN_ATTRS
2916_mm256_testc_si256(__m256i __a, __m256i __b)
2917{
2918 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2919}
2920
2921/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2922/// of the two source vectors.
2923///
2924/// The EFLAGS register is updated as follows: \n
2925/// If there is at least one pair of bits where both bits are 1, the ZF flag
2926/// is set to 0. Otherwise the ZF flag is set to 1. \n
2927/// If there is at least one pair of bits where the bit from the first source
2928/// vector is 0 and the bit from the second source vector is 1, the CF flag
2929/// is set to 0. Otherwise the CF flag is set to 1. \n
2930/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2931/// otherwise it returns 0.
2932///
2933/// \headerfile <x86intrin.h>
2934///
2935/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2936///
2937/// \param __a
2938/// A 256-bit integer vector.
2939/// \param __b
2940/// A 256-bit integer vector.
2941/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2942static __inline int __DEFAULT_FN_ATTRS
2943_mm256_testnzc_si256(__m256i __a, __m256i __b)
2944{
2945 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2946}
2947
2948/* Vector extract sign mask */
2949/// Extracts the sign bits of double-precision floating point elements
2950/// in a 256-bit vector of [4 x double] and writes them to the lower order
2951/// bits of the return value.
2952///
2953/// \headerfile <x86intrin.h>
2954///
2955/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2956///
2957/// \param __a
2958/// A 256-bit vector of [4 x double] containing the double-precision
2959/// floating point values with sign bits to be extracted.
2960/// \returns The sign bits from the operand, written to bits [3:0].
2961static __inline int __DEFAULT_FN_ATTRS
2962_mm256_movemask_pd(__m256d __a)
2963{
2964 return __builtin_ia32_movmskpd256((__v4df)__a);
2965}
2966
2967/// Extracts the sign bits of single-precision floating point elements
2968/// in a 256-bit vector of [8 x float] and writes them to the lower order
2969/// bits of the return value.
2970///
2971/// \headerfile <x86intrin.h>
2972///
2973/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
2974///
2975/// \param __a
2976/// A 256-bit vector of [8 x float] containing the single-precision floating
2977/// point values with sign bits to be extracted.
2978/// \returns The sign bits from the operand, written to bits [7:0].
2979static __inline int __DEFAULT_FN_ATTRS
2980_mm256_movemask_ps(__m256 __a)
2981{
2982 return __builtin_ia32_movmskps256((__v8sf)__a);
2983}
2984
2985/* Vector __zero */
/// Clears the contents of all XMM or YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}
2996
/// Clears the upper 128 bits (bits 255:128) of all YMM registers to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}
3007
3008/* Vector load with broadcast */
3009/// Loads a scalar single-precision floating point value from the
3010/// specified address pointed to by \a __a and broadcasts it to the elements
3011/// of a [4 x float] vector.
3012///
3013/// \headerfile <x86intrin.h>
3014///
3015/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3016///
3017/// \param __a
3018/// The single-precision floating point value to be broadcast.
3019/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3020/// equal to the broadcast value.
3021static __inline __m128 __DEFAULT_FN_ATTRS128
3022_mm_broadcast_ss(float const *__a)
3023{
3024 struct __mm_broadcast_ss_struct {
3025 float __f;
3026 } __attribute__((__packed__, __may_alias__));
3027 float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3028 return __extension__ (__m128){ __f, __f, __f, __f };
3029}
3030
3031/// Loads a scalar double-precision floating point value from the
3032/// specified address pointed to by \a __a and broadcasts it to the elements
3033/// of a [4 x double] vector.
3034///
3035/// \headerfile <x86intrin.h>
3036///
3037/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3038///
3039/// \param __a
3040/// The double-precision floating point value to be broadcast.
3041/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3042/// equal to the broadcast value.
3043static __inline __m256d __DEFAULT_FN_ATTRS
3044_mm256_broadcast_sd(double const *__a)
3045{
3046 struct __mm256_broadcast_sd_struct {
3047 double __d;
3048 } __attribute__((__packed__, __may_alias__));
3049 double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3050 return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3051}
3052
3053/// Loads a scalar single-precision floating point value from the
3054/// specified address pointed to by \a __a and broadcasts it to the elements
3055/// of a [8 x float] vector.
3056///
3057/// \headerfile <x86intrin.h>
3058///
3059/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3060///
3061/// \param __a
3062/// The single-precision floating point value to be broadcast.
3063/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3064/// equal to the broadcast value.
3065static __inline __m256 __DEFAULT_FN_ATTRS
3066_mm256_broadcast_ss(float const *__a)
3067{
3068 struct __mm256_broadcast_ss_struct {
3069 float __f;
3070 } __attribute__((__packed__, __may_alias__));
3071 float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3072 return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3073}
3074
3075/// Loads the data from a 128-bit vector of [2 x double] from the
3076/// specified address pointed to by \a __a and broadcasts it to 128-bit
3077/// elements in a 256-bit vector of [4 x double].
3078///
3079/// \headerfile <x86intrin.h>
3080///
3081/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3082///
3083/// \param __a
3084/// The 128-bit vector of [2 x double] to be broadcast.
3085/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3086/// equal to the broadcast value.
3087static __inline __m256d __DEFAULT_FN_ATTRS
3088_mm256_broadcast_pd(__m128d const *__a)
3089{
3090 __m128d __b = _mm_loadu_pd((const double *)__a);
3091 return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3092 0, 1, 0, 1);
3093}
3094
3095/// Loads the data from a 128-bit vector of [4 x float] from the
3096/// specified address pointed to by \a __a and broadcasts it to 128-bit
3097/// elements in a 256-bit vector of [8 x float].
3098///
3099/// \headerfile <x86intrin.h>
3100///
3101/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3102///
3103/// \param __a
3104/// The 128-bit vector of [4 x float] to be broadcast.
3105/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3106/// equal to the broadcast value.
3107static __inline __m256 __DEFAULT_FN_ATTRS
3108_mm256_broadcast_ps(__m128 const *__a)
3109{
3110 __m128 __b = _mm_loadu_ps((const float *)__a);
3111 return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3112 0, 1, 2, 3, 0, 1, 2, 3);
3113}
3114
3115/* SIMD load ops */
3116/// Loads 4 double-precision floating point values from a 32-byte aligned
3117/// memory location pointed to by \a __p into a vector of [4 x double].
3118///
3119/// \headerfile <x86intrin.h>
3120///
3121/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3122///
3123/// \param __p
3124/// A 32-byte aligned pointer to a memory location containing
3125/// double-precision floating point values.
3126/// \returns A 256-bit vector of [4 x double] containing the moved values.
3127static __inline __m256d __DEFAULT_FN_ATTRS
3128_mm256_load_pd(double const *__p)
3129{
3130 return *(const __m256d *)__p;
3131}
3132
3133/// Loads 8 single-precision floating point values from a 32-byte aligned
3134/// memory location pointed to by \a __p into a vector of [8 x float].
3135///
3136/// \headerfile <x86intrin.h>
3137///
3138/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3139///
3140/// \param __p
3141/// A 32-byte aligned pointer to a memory location containing float values.
3142/// \returns A 256-bit vector of [8 x float] containing the moved values.
3143static __inline __m256 __DEFAULT_FN_ATTRS
3144_mm256_load_ps(float const *__p)
3145{
3146 return *(const __m256 *)__p;
3147}
3148
3149/// Loads 4 double-precision floating point values from an unaligned
3150/// memory location pointed to by \a __p into a vector of [4 x double].
3151///
3152/// \headerfile <x86intrin.h>
3153///
3154/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3155///
3156/// \param __p
3157/// A pointer to a memory location containing double-precision floating
3158/// point values.
3159/// \returns A 256-bit vector of [4 x double] containing the moved values.
3160static __inline __m256d __DEFAULT_FN_ATTRS
3161_mm256_loadu_pd(double const *__p)
3162{
3163 struct __loadu_pd {
3164 __m256d_u __v;
3165 } __attribute__((__packed__, __may_alias__));
3166 return ((const struct __loadu_pd*)__p)->__v;
3167}
3168
3169/// Loads 8 single-precision floating point values from an unaligned
3170/// memory location pointed to by \a __p into a vector of [8 x float].
3171///
3172/// \headerfile <x86intrin.h>
3173///
3174/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3175///
3176/// \param __p
3177/// A pointer to a memory location containing single-precision floating
3178/// point values.
3179/// \returns A 256-bit vector of [8 x float] containing the moved values.
3180static __inline __m256 __DEFAULT_FN_ATTRS
3181_mm256_loadu_ps(float const *__p)
3182{
3183 struct __loadu_ps {
3184 __m256_u __v;
3185 } __attribute__((__packed__, __may_alias__));
3186 return ((const struct __loadu_ps*)__p)->__v;
3187}
3188
3189/// Loads 256 bits of integer data from a 32-byte aligned memory
3190/// location pointed to by \a __p into elements of a 256-bit integer vector.
3191///
3192/// \headerfile <x86intrin.h>
3193///
3194/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3195///
3196/// \param __p
3197/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
3198/// values.
3199/// \returns A 256-bit integer vector containing the moved values.
3200static __inline __m256i __DEFAULT_FN_ATTRS
3201_mm256_load_si256(__m256i const *__p)
3202{
3203 return *__p;
3204}
3205
3206/// Loads 256 bits of integer data from an unaligned memory location
3207/// pointed to by \a __p into a 256-bit integer vector.
3208///
3209/// \headerfile <x86intrin.h>
3210///
3211/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3212///
3213/// \param __p
3214/// A pointer to a 256-bit integer vector containing integer values.
3215/// \returns A 256-bit integer vector containing the moved values.
3216static __inline __m256i __DEFAULT_FN_ATTRS
3217_mm256_loadu_si256(__m256i_u const *__p)
3218{
3219 struct __loadu_si256 {
3220 __m256i_u __v;
3221 } __attribute__((__packed__, __may_alias__));
3222 return ((const struct __loadu_si256*)__p)->__v;
3223}
3224
3225/// Loads 256 bits of integer data from an unaligned memory location
3226/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3227/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
3228/// line boundary.
3229///
3230/// \headerfile <x86intrin.h>
3231///
3232/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3233///
3234/// \param __p
3235/// A pointer to a 256-bit integer vector containing integer values.
3236/// \returns A 256-bit integer vector containing the moved values.
3237static __inline __m256i __DEFAULT_FN_ATTRS
3238_mm256_lddqu_si256(__m256i_u const *__p)
3239{
3240 return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3241}
3242
3243/* SIMD store ops */
3244/// Stores double-precision floating point values from a 256-bit vector
3245/// of [4 x double] to a 32-byte aligned memory location pointed to by
3246/// \a __p.
3247///
3248/// \headerfile <x86intrin.h>
3249///
3250/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3251///
3252/// \param __p
3253/// A 32-byte aligned pointer to a memory location that will receive the
/// double-precision floating point values.
3255/// \param __a
3256/// A 256-bit vector of [4 x double] containing the values to be moved.
3257static __inline void __DEFAULT_FN_ATTRS
3258_mm256_store_pd(double *__p, __m256d __a)
3259{
3260 *(__m256d *)__p = __a;
3261}
3262
3263/// Stores single-precision floating point values from a 256-bit vector
3264/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3265///
3266/// \headerfile <x86intrin.h>
3267///
3268/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3269///
3270/// \param __p
3271/// A 32-byte aligned pointer to a memory location that will receive the
3272/// float values.
3273/// \param __a
3274/// A 256-bit vector of [8 x float] containing the values to be moved.
3275static __inline void __DEFAULT_FN_ATTRS
3276_mm256_store_ps(float *__p, __m256 __a)
3277{
3278 *(__m256 *)__p = __a;
3279}
3280
3281/// Stores double-precision floating point values from a 256-bit vector
3282/// of [4 x double] to an unaligned memory location pointed to by \a __p.
3283///
3284/// \headerfile <x86intrin.h>
3285///
3286/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3287///
3288/// \param __p
3289/// A pointer to a memory location that will receive the double-precision
3290/// floating point values.
3291/// \param __a
3292/// A 256-bit vector of [4 x double] containing the values to be moved.
3293static __inline void __DEFAULT_FN_ATTRS
3294_mm256_storeu_pd(double *__p, __m256d __a)
3295{
3296 struct __storeu_pd {
3297 __m256d_u __v;
3298 } __attribute__((__packed__, __may_alias__));
3299 ((struct __storeu_pd*)__p)->__v = __a;
3300}
3301
3302/// Stores single-precision floating point values from a 256-bit vector
3303/// of [8 x float] to an unaligned memory location pointed to by \a __p.
3304///
3305/// \headerfile <x86intrin.h>
3306///
3307/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3308///
3309/// \param __p
3310/// A pointer to a memory location that will receive the float values.
3311/// \param __a
3312/// A 256-bit vector of [8 x float] containing the values to be moved.
3313static __inline void __DEFAULT_FN_ATTRS
3314_mm256_storeu_ps(float *__p, __m256 __a)
3315{
3316 struct __storeu_ps {
3317 __m256_u __v;
3318 } __attribute__((__packed__, __may_alias__));
3319 ((struct __storeu_ps*)__p)->__v = __a;
3320}
3321
3322/// Stores integer values from a 256-bit integer vector to a 32-byte
3323/// aligned memory location pointed to by \a __p.
3324///
3325/// \headerfile <x86intrin.h>
3326///
3327/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3328///
3329/// \param __p
3330/// A 32-byte aligned pointer to a memory location that will receive the
3331/// integer values.
3332/// \param __a
3333/// A 256-bit integer vector containing the values to be moved.
3334static __inline void __DEFAULT_FN_ATTRS
3335_mm256_store_si256(__m256i *__p, __m256i __a)
3336{
3337 *__p = __a;
3338}
3339
3340/// Stores integer values from a 256-bit integer vector to an unaligned
3341/// memory location pointed to by \a __p.
3342///
3343/// \headerfile <x86intrin.h>
3344///
3345/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3346///
3347/// \param __p
3348/// A pointer to a memory location that will receive the integer values.
3349/// \param __a
3350/// A 256-bit integer vector containing the values to be moved.
3351static __inline void __DEFAULT_FN_ATTRS
3352_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3353{
3354 struct __storeu_si256 {
3355 __m256i_u __v;
3356 } __attribute__((__packed__, __may_alias__));
3357 ((struct __storeu_si256*)__p)->__v = __a;
3358}
3359
3360/* Conditional load ops */
3361/// Conditionally loads double-precision floating point elements from a
3362/// memory location pointed to by \a __p into a 128-bit vector of
3363/// [2 x double], depending on the mask bits associated with each data
3364/// element.
3365///
3366/// \headerfile <x86intrin.h>
3367///
3368/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3369///
3370/// \param __p
3371/// A pointer to a memory location that contains the double-precision
3372/// floating point values.
3373/// \param __m
3374/// A 128-bit integer vector containing the mask. The most significant bit of
3375/// each data element represents the mask bits. If a mask bit is zero, the
3376/// corresponding value in the memory location is not loaded and the
3377/// corresponding field in the return value is set to zero.
3378/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3379static __inline __m128d __DEFAULT_FN_ATTRS128
3380_mm_maskload_pd(double const *__p, __m128i __m)
3381{
3382 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3383}
3384
3385/// Conditionally loads double-precision floating point elements from a
3386/// memory location pointed to by \a __p into a 256-bit vector of
3387/// [4 x double], depending on the mask bits associated with each data
3388/// element.
3389///
3390/// \headerfile <x86intrin.h>
3391///
3392/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3393///
3394/// \param __p
3395/// A pointer to a memory location that contains the double-precision
3396/// floating point values.
3397/// \param __m
3398/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3399/// significant bit of each quadword element represents the mask bits. If a
3400/// mask bit is zero, the corresponding value in the memory location is not
3401/// loaded and the corresponding field in the return value is set to zero.
3402/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3403static __inline __m256d __DEFAULT_FN_ATTRS
3404_mm256_maskload_pd(double const *__p, __m256i __m)
3405{
3406 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3407 (__v4di)__m);
3408}
3409
3410/// Conditionally loads single-precision floating point elements from a
3411/// memory location pointed to by \a __p into a 128-bit vector of
3412/// [4 x float], depending on the mask bits associated with each data
3413/// element.
3414///
3415/// \headerfile <x86intrin.h>
3416///
3417/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3418///
3419/// \param __p
3420/// A pointer to a memory location that contains the single-precision
3421/// floating point values.
3422/// \param __m
3423/// A 128-bit integer vector containing the mask. The most significant bit of
3424/// each data element represents the mask bits. If a mask bit is zero, the
3425/// corresponding value in the memory location is not loaded and the
3426/// corresponding field in the return value is set to zero.
3427/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3428static __inline __m128 __DEFAULT_FN_ATTRS128
3429_mm_maskload_ps(float const *__p, __m128i __m)
3430{
3431 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3432}
3433
3434/// Conditionally loads single-precision floating point elements from a
3435/// memory location pointed to by \a __p into a 256-bit vector of
3436/// [8 x float], depending on the mask bits associated with each data
3437/// element.
3438///
3439/// \headerfile <x86intrin.h>
3440///
3441/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3442///
3443/// \param __p
3444/// A pointer to a memory location that contains the single-precision
3445/// floating point values.
3446/// \param __m
3447/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3448/// significant bit of each dword element represents the mask bits. If a mask
3449/// bit is zero, the corresponding value in the memory location is not loaded
3450/// and the corresponding field in the return value is set to zero.
3451/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3452static __inline __m256 __DEFAULT_FN_ATTRS
3453_mm256_maskload_ps(float const *__p, __m256i __m)
3454{
3455 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3456}
3457
3458/* Conditional store ops */
3459/// Moves single-precision floating point values from a 256-bit vector
3460/// of [8 x float] to a memory location pointed to by \a __p, according to
3461/// the specified mask.
3462///
3463/// \headerfile <x86intrin.h>
3464///
3465/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3466///
3467/// \param __p
3468/// A pointer to a memory location that will receive the float values.
3469/// \param __m
3470/// A 256-bit integer vector of [8 x dword] containing the mask. The most
3471/// significant bit of each dword element in the mask vector represents the
3472/// mask bits. If a mask bit is zero, the corresponding value from vector
3473/// \a __a is not stored and the corresponding field in the memory location
3474/// pointed to by \a __p is not changed.
3475/// \param __a
3476/// A 256-bit vector of [8 x float] containing the values to be stored.
3477static __inline void __DEFAULT_FN_ATTRS
3478_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3479{
3480 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3481}
3482
3483/// Moves double-precision values from a 128-bit vector of [2 x double]
3484/// to a memory location pointed to by \a __p, according to the specified
3485/// mask.
3486///
3487/// \headerfile <x86intrin.h>
3488///
3489/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3490///
3491/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
3493/// \param __m
3494/// A 128-bit integer vector containing the mask. The most significant bit of
3495/// each field in the mask vector represents the mask bits. If a mask bit is
3496/// zero, the corresponding value from vector \a __a is not stored and the
3497/// corresponding field in the memory location pointed to by \a __p is not
3498/// changed.
3499/// \param __a
3500/// A 128-bit vector of [2 x double] containing the values to be stored.
3501static __inline void __DEFAULT_FN_ATTRS128
3502_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3503{
3504 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3505}
3506
3507/// Moves double-precision values from a 256-bit vector of [4 x double]
3508/// to a memory location pointed to by \a __p, according to the specified
3509/// mask.
3510///
3511/// \headerfile <x86intrin.h>
3512///
3513/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3514///
3515/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
3517/// \param __m
3518/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
3519/// significant bit of each quadword element in the mask vector represents
3520/// the mask bits. If a mask bit is zero, the corresponding value from vector
/// \a __a is not stored and the corresponding field in the memory location
3522/// pointed to by \a __p is not changed.
3523/// \param __a
3524/// A 256-bit vector of [4 x double] containing the values to be stored.
3525static __inline void __DEFAULT_FN_ATTRS
3526_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3527{
3528 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3529}
3530
3531/// Moves single-precision floating point values from a 128-bit vector
3532/// of [4 x float] to a memory location pointed to by \a __p, according to
3533/// the specified mask.
3534///
3535/// \headerfile <x86intrin.h>
3536///
3537/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3538///
3539/// \param __p
3540/// A pointer to a memory location that will receive the float values.
3541/// \param __m
3542/// A 128-bit integer vector containing the mask. The most significant bit of
3543/// each field in the mask vector represents the mask bits. If a mask bit is
/// zero, the corresponding value from vector \a __a is not stored and the
3545/// corresponding field in the memory location pointed to by \a __p is not
3546/// changed.
3547/// \param __a
3548/// A 128-bit vector of [4 x float] containing the values to be stored.
3549static __inline void __DEFAULT_FN_ATTRS128
3550_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3551{
3552 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3553}
3554
3555/* Cacheability support ops */
3556/// Moves integer data from a 256-bit integer vector to a 32-byte
3557/// aligned memory location. To minimize caching, the data is flagged as
3558/// non-temporal (unlikely to be used again soon).
3559///
3560/// \headerfile <x86intrin.h>
3561///
3562/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3563///
3564/// \param __a
3565/// A pointer to a 32-byte aligned memory location that will receive the
3566/// integer values.
3567/// \param __b
3568/// A 256-bit integer vector containing the values to be moved.
3569static __inline void __DEFAULT_FN_ATTRS
3570_mm256_stream_si256(void *__a, __m256i __b)
3571{
3572 typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3573 __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3574}
3575
3576/// Moves double-precision values from a 256-bit vector of [4 x double]
3577/// to a 32-byte aligned memory location. To minimize caching, the data is
3578/// flagged as non-temporal (unlikely to be used again soon).
3579///
3580/// \headerfile <x86intrin.h>
3581///
3582/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3583///
3584/// \param __a
3585/// A pointer to a 32-byte aligned memory location that will receive the
3586/// double-precision floating-point values.
3587/// \param __b
3588/// A 256-bit vector of [4 x double] containing the values to be moved.
3589static __inline void __DEFAULT_FN_ATTRS
3590_mm256_stream_pd(void *__a, __m256d __b)
3591{
3592 typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3593 __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3594}
3595
3596/// Moves single-precision floating point values from a 256-bit vector
3597/// of [8 x float] to a 32-byte aligned memory location. To minimize
3598/// caching, the data is flagged as non-temporal (unlikely to be used again
3599/// soon).
3600///
3601/// \headerfile <x86intrin.h>
3602///
3603/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3604///
3605/// \param __p
3606/// A pointer to a 32-byte aligned memory location that will receive the
3607/// single-precision floating point values.
3608/// \param __a
3609/// A 256-bit vector of [8 x float] containing the values to be moved.
3610static __inline void __DEFAULT_FN_ATTRS
3611_mm256_stream_ps(void *__p, __m256 __a)
3612{
3613 typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3614 __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3615}
3616
3617/* Create vectors */
3618/// Create a 256-bit vector of [4 x double] with undefined values.
3619///
3620/// \headerfile <x86intrin.h>
3621///
3622/// This intrinsic has no corresponding instruction.
3623///
3624/// \returns A 256-bit vector of [4 x double] containing undefined values.
3625static __inline__ __m256d __DEFAULT_FN_ATTRS
3626_mm256_undefined_pd(void)
3627{
3628 return (__m256d)__builtin_ia32_undef256();
3629}
3630
3631/// Create a 256-bit vector of [8 x float] with undefined values.
3632///
3633/// \headerfile <x86intrin.h>
3634///
3635/// This intrinsic has no corresponding instruction.
3636///
3637/// \returns A 256-bit vector of [8 x float] containing undefined values.
3638static __inline__ __m256 __DEFAULT_FN_ATTRS
3639_mm256_undefined_ps(void)
3640{
3641 return (__m256)__builtin_ia32_undef256();
3642}
3643
3644/// Create a 256-bit integer vector with undefined values.
3645///
3646/// \headerfile <x86intrin.h>
3647///
3648/// This intrinsic has no corresponding instruction.
3649///
3650/// \returns A 256-bit integer vector containing undefined values.
3651static __inline__ __m256i __DEFAULT_FN_ATTRS
3652_mm256_undefined_si256(void)
3653{
3654 return (__m256i)__builtin_ia32_undef256();
3655}
3656
3657/// Constructs a 256-bit floating-point vector of [4 x double]
3658/// initialized with the specified double-precision floating-point values.
3659///
3660/// \headerfile <x86intrin.h>
3661///
3662/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3663/// instruction.
3664///
3665/// \param __a
3666/// A double-precision floating-point value used to initialize bits [255:192]
3667/// of the result.
3668/// \param __b
3669/// A double-precision floating-point value used to initialize bits [191:128]
3670/// of the result.
3671/// \param __c
3672/// A double-precision floating-point value used to initialize bits [127:64]
3673/// of the result.
3674/// \param __d
3675/// A double-precision floating-point value used to initialize bits [63:0]
3676/// of the result.
3677/// \returns An initialized 256-bit floating-point vector of [4 x double].
3678static __inline __m256d __DEFAULT_FN_ATTRS
3679_mm256_set_pd(double __a, double __b, double __c, double __d)
3680{
3681 return __extension__ (__m256d){ __d, __c, __b, __a };
3682}
3683
3684/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3685/// with the specified single-precision floating-point values.
3686///
3687/// \headerfile <x86intrin.h>
3688///
3689/// This intrinsic is a utility function and does not correspond to a specific
3690/// instruction.
3691///
3692/// \param __a
3693/// A single-precision floating-point value used to initialize bits [255:224]
3694/// of the result.
3695/// \param __b
3696/// A single-precision floating-point value used to initialize bits [223:192]
3697/// of the result.
3698/// \param __c
3699/// A single-precision floating-point value used to initialize bits [191:160]
3700/// of the result.
3701/// \param __d
3702/// A single-precision floating-point value used to initialize bits [159:128]
3703/// of the result.
3704/// \param __e
3705/// A single-precision floating-point value used to initialize bits [127:96]
3706/// of the result.
3707/// \param __f
3708/// A single-precision floating-point value used to initialize bits [95:64]
3709/// of the result.
3710/// \param __g
3711/// A single-precision floating-point value used to initialize bits [63:32]
3712/// of the result.
3713/// \param __h
3714/// A single-precision floating-point value used to initialize bits [31:0]
3715/// of the result.
3716/// \returns An initialized 256-bit floating-point vector of [8 x float].
3717static __inline __m256 __DEFAULT_FN_ATTRS
3718_mm256_set_ps(float __a, float __b, float __c, float __d,
3719 float __e, float __f, float __g, float __h)
3720{
3721 return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3722}
3723
3724/// Constructs a 256-bit integer vector initialized with the specified
3725/// 32-bit integral values.
3726///
3727/// \headerfile <x86intrin.h>
3728///
3729/// This intrinsic is a utility function and does not correspond to a specific
3730/// instruction.
3731///
3732/// \param __i0
3733/// A 32-bit integral value used to initialize bits [255:224] of the result.
3734/// \param __i1
3735/// A 32-bit integral value used to initialize bits [223:192] of the result.
3736/// \param __i2
3737/// A 32-bit integral value used to initialize bits [191:160] of the result.
3738/// \param __i3
3739/// A 32-bit integral value used to initialize bits [159:128] of the result.
3740/// \param __i4
3741/// A 32-bit integral value used to initialize bits [127:96] of the result.
3742/// \param __i5
3743/// A 32-bit integral value used to initialize bits [95:64] of the result.
3744/// \param __i6
3745/// A 32-bit integral value used to initialize bits [63:32] of the result.
3746/// \param __i7
3747/// A 32-bit integral value used to initialize bits [31:0] of the result.
3748/// \returns An initialized 256-bit integer vector.
3749static __inline __m256i __DEFAULT_FN_ATTRS
3750_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3751 int __i4, int __i5, int __i6, int __i7)
3752{
3753 return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3754}
3755
3756/// Constructs a 256-bit integer vector initialized with the specified
3757/// 16-bit integral values.
3758///
3759/// \headerfile <x86intrin.h>
3760///
3761/// This intrinsic is a utility function and does not correspond to a specific
3762/// instruction.
3763///
3764/// \param __w15
3765/// A 16-bit integral value used to initialize bits [255:240] of the result.
3766/// \param __w14
3767/// A 16-bit integral value used to initialize bits [239:224] of the result.
3768/// \param __w13
3769/// A 16-bit integral value used to initialize bits [223:208] of the result.
3770/// \param __w12
3771/// A 16-bit integral value used to initialize bits [207:192] of the result.
3772/// \param __w11
3773/// A 16-bit integral value used to initialize bits [191:176] of the result.
3774/// \param __w10
3775/// A 16-bit integral value used to initialize bits [175:160] of the result.
3776/// \param __w09
3777/// A 16-bit integral value used to initialize bits [159:144] of the result.
3778/// \param __w08
3779/// A 16-bit integral value used to initialize bits [143:128] of the result.
3780/// \param __w07
3781/// A 16-bit integral value used to initialize bits [127:112] of the result.
3782/// \param __w06
3783/// A 16-bit integral value used to initialize bits [111:96] of the result.
3784/// \param __w05
3785/// A 16-bit integral value used to initialize bits [95:80] of the result.
3786/// \param __w04
3787/// A 16-bit integral value used to initialize bits [79:64] of the result.
3788/// \param __w03
3789/// A 16-bit integral value used to initialize bits [63:48] of the result.
3790/// \param __w02
3791/// A 16-bit integral value used to initialize bits [47:32] of the result.
3792/// \param __w01
3793/// A 16-bit integral value used to initialize bits [31:16] of the result.
3794/// \param __w00
3795/// A 16-bit integral value used to initialize bits [15:0] of the result.
3796/// \returns An initialized 256-bit integer vector.
3797static __inline __m256i __DEFAULT_FN_ATTRS
3798_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3799 short __w11, short __w10, short __w09, short __w08,
3800 short __w07, short __w06, short __w05, short __w04,
3801 short __w03, short __w02, short __w01, short __w00)
3802{
3803 return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3804 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3805}
3806
3807/// Constructs a 256-bit integer vector initialized with the specified
3808/// 8-bit integral values.
3809///
3810/// \headerfile <x86intrin.h>
3811///
3812/// This intrinsic is a utility function and does not correspond to a specific
3813/// instruction.
3814///
3815/// \param __b31
3816/// An 8-bit integral value used to initialize bits [255:248] of the result.
3817/// \param __b30
3818/// An 8-bit integral value used to initialize bits [247:240] of the result.
3819/// \param __b29
3820/// An 8-bit integral value used to initialize bits [239:232] of the result.
3821/// \param __b28
3822/// An 8-bit integral value used to initialize bits [231:224] of the result.
3823/// \param __b27
3824/// An 8-bit integral value used to initialize bits [223:216] of the result.
3825/// \param __b26
3826/// An 8-bit integral value used to initialize bits [215:208] of the result.
3827/// \param __b25
3828/// An 8-bit integral value used to initialize bits [207:200] of the result.
3829/// \param __b24
3830/// An 8-bit integral value used to initialize bits [199:192] of the result.
3831/// \param __b23
3832/// An 8-bit integral value used to initialize bits [191:184] of the result.
3833/// \param __b22
3834/// An 8-bit integral value used to initialize bits [183:176] of the result.
3835/// \param __b21
3836/// An 8-bit integral value used to initialize bits [175:168] of the result.
3837/// \param __b20
3838/// An 8-bit integral value used to initialize bits [167:160] of the result.
3839/// \param __b19
3840/// An 8-bit integral value used to initialize bits [159:152] of the result.
3841/// \param __b18
3842/// An 8-bit integral value used to initialize bits [151:144] of the result.
3843/// \param __b17
3844/// An 8-bit integral value used to initialize bits [143:136] of the result.
3845/// \param __b16
3846/// An 8-bit integral value used to initialize bits [135:128] of the result.
3847/// \param __b15
3848/// An 8-bit integral value used to initialize bits [127:120] of the result.
3849/// \param __b14
3850/// An 8-bit integral value used to initialize bits [119:112] of the result.
3851/// \param __b13
3852/// An 8-bit integral value used to initialize bits [111:104] of the result.
3853/// \param __b12
3854/// An 8-bit integral value used to initialize bits [103:96] of the result.
3855/// \param __b11
3856/// An 8-bit integral value used to initialize bits [95:88] of the result.
3857/// \param __b10
3858/// An 8-bit integral value used to initialize bits [87:80] of the result.
3859/// \param __b09
3860/// An 8-bit integral value used to initialize bits [79:72] of the result.
3861/// \param __b08
3862/// An 8-bit integral value used to initialize bits [71:64] of the result.
3863/// \param __b07
3864/// An 8-bit integral value used to initialize bits [63:56] of the result.
3865/// \param __b06
3866/// An 8-bit integral value used to initialize bits [55:48] of the result.
3867/// \param __b05
3868/// An 8-bit integral value used to initialize bits [47:40] of the result.
3869/// \param __b04
3870/// An 8-bit integral value used to initialize bits [39:32] of the result.
3871/// \param __b03
3872/// An 8-bit integral value used to initialize bits [31:24] of the result.
3873/// \param __b02
3874/// An 8-bit integral value used to initialize bits [23:16] of the result.
3875/// \param __b01
3876/// An 8-bit integral value used to initialize bits [15:8] of the result.
3877/// \param __b00
3878/// An 8-bit integral value used to initialize bits [7:0] of the result.
3879/// \returns An initialized 256-bit integer vector.
3880static __inline __m256i __DEFAULT_FN_ATTRS
3881_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3882 char __b27, char __b26, char __b25, char __b24,
3883 char __b23, char __b22, char __b21, char __b20,
3884 char __b19, char __b18, char __b17, char __b16,
3885 char __b15, char __b14, char __b13, char __b12,
3886 char __b11, char __b10, char __b09, char __b08,
3887 char __b07, char __b06, char __b05, char __b04,
3888 char __b03, char __b02, char __b01, char __b00)
3889{
3890 return __extension__ (__m256i)(__v32qi){
3891 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3892 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3893 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3894 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3895 };
3896}
3897
3898/// Constructs a 256-bit integer vector initialized with the specified
3899/// 64-bit integral values.
3900///
3901/// \headerfile <x86intrin.h>
3902///
3903/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3904/// instruction.
3905///
3906/// \param __a
3907/// A 64-bit integral value used to initialize bits [255:192] of the result.
3908/// \param __b
3909/// A 64-bit integral value used to initialize bits [191:128] of the result.
3910/// \param __c
3911/// A 64-bit integral value used to initialize bits [127:64] of the result.
3912/// \param __d
3913/// A 64-bit integral value used to initialize bits [63:0] of the result.
3914/// \returns An initialized 256-bit integer vector.
3915static __inline __m256i __DEFAULT_FN_ATTRS
3916_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3917{
3918 return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3919}
3920
3921/* Create vectors with elements in reverse order */
3922/// Constructs a 256-bit floating-point vector of [4 x double],
3923/// initialized in reverse order with the specified double-precision
3924/// floating-point values.
3925///
3926/// \headerfile <x86intrin.h>
3927///
3928/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3929/// instruction.
3930///
3931/// \param __a
3932/// A double-precision floating-point value used to initialize bits [63:0]
3933/// of the result.
3934/// \param __b
3935/// A double-precision floating-point value used to initialize bits [127:64]
3936/// of the result.
3937/// \param __c
3938/// A double-precision floating-point value used to initialize bits [191:128]
3939/// of the result.
3940/// \param __d
3941/// A double-precision floating-point value used to initialize bits [255:192]
3942/// of the result.
3943/// \returns An initialized 256-bit floating-point vector of [4 x double].
3944static __inline __m256d __DEFAULT_FN_ATTRS
3945_mm256_setr_pd(double __a, double __b, double __c, double __d)
3946{
3947 return _mm256_set_pd(__d, __c, __b, __a);
3948}
3949
3950/// Constructs a 256-bit floating-point vector of [8 x float],
3951/// initialized in reverse order with the specified single-precision
/// floating-point values.
3953///
3954/// \headerfile <x86intrin.h>
3955///
3956/// This intrinsic is a utility function and does not correspond to a specific
3957/// instruction.
3958///
3959/// \param __a
3960/// A single-precision floating-point value used to initialize bits [31:0]
3961/// of the result.
3962/// \param __b
3963/// A single-precision floating-point value used to initialize bits [63:32]
3964/// of the result.
3965/// \param __c
3966/// A single-precision floating-point value used to initialize bits [95:64]
3967/// of the result.
3968/// \param __d
3969/// A single-precision floating-point value used to initialize bits [127:96]
3970/// of the result.
3971/// \param __e
3972/// A single-precision floating-point value used to initialize bits [159:128]
3973/// of the result.
3974/// \param __f
3975/// A single-precision floating-point value used to initialize bits [191:160]
3976/// of the result.
3977/// \param __g
3978/// A single-precision floating-point value used to initialize bits [223:192]
3979/// of the result.
3980/// \param __h
3981/// A single-precision floating-point value used to initialize bits [255:224]
3982/// of the result.
3983/// \returns An initialized 256-bit floating-point vector of [8 x float].
3984static __inline __m256 __DEFAULT_FN_ATTRS
3985_mm256_setr_ps(float __a, float __b, float __c, float __d,
3986 float __e, float __f, float __g, float __h)
3987{
3988 return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
3989}
3990
3991/// Constructs a 256-bit integer vector, initialized in reverse order
3992/// with the specified 32-bit integral values.
3993///
3994/// \headerfile <x86intrin.h>
3995///
3996/// This intrinsic is a utility function and does not correspond to a specific
3997/// instruction.
3998///
3999/// \param __i0
4000/// A 32-bit integral value used to initialize bits [31:0] of the result.
4001/// \param __i1
4002/// A 32-bit integral value used to initialize bits [63:32] of the result.
4003/// \param __i2
4004/// A 32-bit integral value used to initialize bits [95:64] of the result.
4005/// \param __i3
4006/// A 32-bit integral value used to initialize bits [127:96] of the result.
4007/// \param __i4
4008/// A 32-bit integral value used to initialize bits [159:128] of the result.
4009/// \param __i5
4010/// A 32-bit integral value used to initialize bits [191:160] of the result.
4011/// \param __i6
4012/// A 32-bit integral value used to initialize bits [223:192] of the result.
4013/// \param __i7
4014/// A 32-bit integral value used to initialize bits [255:224] of the result.
4015/// \returns An initialized 256-bit integer vector.
4016static __inline __m256i __DEFAULT_FN_ATTRS
4017_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4018 int __i4, int __i5, int __i6, int __i7)
4019{
4020 return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4021}
4022
4023/// Constructs a 256-bit integer vector, initialized in reverse order
4024/// with the specified 16-bit integral values.
4025///
4026/// \headerfile <x86intrin.h>
4027///
4028/// This intrinsic is a utility function and does not correspond to a specific
4029/// instruction.
4030///
4031/// \param __w15
4032/// A 16-bit integral value used to initialize bits [15:0] of the result.
4033/// \param __w14
4034/// A 16-bit integral value used to initialize bits [31:16] of the result.
4035/// \param __w13
4036/// A 16-bit integral value used to initialize bits [47:32] of the result.
4037/// \param __w12
4038/// A 16-bit integral value used to initialize bits [63:48] of the result.
4039/// \param __w11
4040/// A 16-bit integral value used to initialize bits [79:64] of the result.
4041/// \param __w10
4042/// A 16-bit integral value used to initialize bits [95:80] of the result.
4043/// \param __w09
4044/// A 16-bit integral value used to initialize bits [111:96] of the result.
4045/// \param __w08
4046/// A 16-bit integral value used to initialize bits [127:112] of the result.
4047/// \param __w07
4048/// A 16-bit integral value used to initialize bits [143:128] of the result.
4049/// \param __w06
4050/// A 16-bit integral value used to initialize bits [159:144] of the result.
4051/// \param __w05
4052/// A 16-bit integral value used to initialize bits [175:160] of the result.
4053/// \param __w04
4054/// A 16-bit integral value used to initialize bits [191:176] of the result.
4055/// \param __w03
4056/// A 16-bit integral value used to initialize bits [207:192] of the result.
4057/// \param __w02
4058/// A 16-bit integral value used to initialize bits [223:208] of the result.
4059/// \param __w01
4060/// A 16-bit integral value used to initialize bits [239:224] of the result.
4061/// \param __w00
4062/// A 16-bit integral value used to initialize bits [255:240] of the result.
4063/// \returns An initialized 256-bit integer vector.
4064static __inline __m256i __DEFAULT_FN_ATTRS
4065_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4066 short __w11, short __w10, short __w09, short __w08,
4067 short __w07, short __w06, short __w05, short __w04,
4068 short __w03, short __w02, short __w01, short __w00)
4069{
4070 return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4071 __w04, __w05, __w06, __w07,
4072 __w08, __w09, __w10, __w11,
4073 __w12, __w13, __w14, __w15);
4074}
4075
4076/// Constructs a 256-bit integer vector, initialized in reverse order
4077/// with the specified 8-bit integral values.
4078///
4079/// \headerfile <x86intrin.h>
4080///
4081/// This intrinsic is a utility function and does not correspond to a specific
4082/// instruction.
4083///
4084/// \param __b31
4085/// An 8-bit integral value used to initialize bits [7:0] of the result.
4086/// \param __b30
4087/// An 8-bit integral value used to initialize bits [15:8] of the result.
4088/// \param __b29
4089/// An 8-bit integral value used to initialize bits [23:16] of the result.
4090/// \param __b28
4091/// An 8-bit integral value used to initialize bits [31:24] of the result.
4092/// \param __b27
4093/// An 8-bit integral value used to initialize bits [39:32] of the result.
4094/// \param __b26
4095/// An 8-bit integral value used to initialize bits [47:40] of the result.
4096/// \param __b25
4097/// An 8-bit integral value used to initialize bits [55:48] of the result.
4098/// \param __b24
4099/// An 8-bit integral value used to initialize bits [63:56] of the result.
4100/// \param __b23
4101/// An 8-bit integral value used to initialize bits [71:64] of the result.
4102/// \param __b22
4103/// An 8-bit integral value used to initialize bits [79:72] of the result.
4104/// \param __b21
4105/// An 8-bit integral value used to initialize bits [87:80] of the result.
4106/// \param __b20
4107/// An 8-bit integral value used to initialize bits [95:88] of the result.
4108/// \param __b19
4109/// An 8-bit integral value used to initialize bits [103:96] of the result.
4110/// \param __b18
4111/// An 8-bit integral value used to initialize bits [111:104] of the result.
4112/// \param __b17
4113/// An 8-bit integral value used to initialize bits [119:112] of the result.
4114/// \param __b16
4115/// An 8-bit integral value used to initialize bits [127:120] of the result.
4116/// \param __b15
4117/// An 8-bit integral value used to initialize bits [135:128] of the result.
4118/// \param __b14
4119/// An 8-bit integral value used to initialize bits [143:136] of the result.
4120/// \param __b13
4121/// An 8-bit integral value used to initialize bits [151:144] of the result.
4122/// \param __b12
4123/// An 8-bit integral value used to initialize bits [159:152] of the result.
4124/// \param __b11
4125/// An 8-bit integral value used to initialize bits [167:160] of the result.
4126/// \param __b10
4127/// An 8-bit integral value used to initialize bits [175:168] of the result.
4128/// \param __b09
4129/// An 8-bit integral value used to initialize bits [183:176] of the result.
4130/// \param __b08
4131/// An 8-bit integral value used to initialize bits [191:184] of the result.
4132/// \param __b07
4133/// An 8-bit integral value used to initialize bits [199:192] of the result.
4134/// \param __b06
4135/// An 8-bit integral value used to initialize bits [207:200] of the result.
4136/// \param __b05
4137/// An 8-bit integral value used to initialize bits [215:208] of the result.
4138/// \param __b04
4139/// An 8-bit integral value used to initialize bits [223:216] of the result.
4140/// \param __b03
4141/// An 8-bit integral value used to initialize bits [231:224] of the result.
4142/// \param __b02
4143/// An 8-bit integral value used to initialize bits [239:232] of the result.
4144/// \param __b01
4145/// An 8-bit integral value used to initialize bits [247:240] of the result.
4146/// \param __b00
4147/// An 8-bit integral value used to initialize bits [255:248] of the result.
4148/// \returns An initialized 256-bit integer vector.
4149static __inline __m256i __DEFAULT_FN_ATTRS
4150_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4151 char __b27, char __b26, char __b25, char __b24,
4152 char __b23, char __b22, char __b21, char __b20,
4153 char __b19, char __b18, char __b17, char __b16,
4154 char __b15, char __b14, char __b13, char __b12,
4155 char __b11, char __b10, char __b09, char __b08,
4156 char __b07, char __b06, char __b05, char __b04,
4157 char __b03, char __b02, char __b01, char __b00)
4158{
4159 return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4160 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4161 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4162 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4163}
4164
4165/// Constructs a 256-bit integer vector, initialized in reverse order
4166/// with the specified 64-bit integral values.
4167///
4168/// \headerfile <x86intrin.h>
4169///
4170/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4171/// instruction.
4172///
4173/// \param __a
4174/// A 64-bit integral value used to initialize bits [63:0] of the result.
4175/// \param __b
4176/// A 64-bit integral value used to initialize bits [127:64] of the result.
4177/// \param __c
4178/// A 64-bit integral value used to initialize bits [191:128] of the result.
4179/// \param __d
4180/// A 64-bit integral value used to initialize bits [255:192] of the result.
4181/// \returns An initialized 256-bit integer vector.
4182static __inline __m256i __DEFAULT_FN_ATTRS
4183_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4184{
4185 return _mm256_set_epi64x(__d, __c, __b, __a);
4186}
4187
4188/* Create vectors with repeated elements */
4189/// Constructs a 256-bit floating-point vector of [4 x double], with each
4190/// of the four double-precision floating-point vector elements set to the
4191/// specified double-precision floating-point value.
4192///
4193/// \headerfile <x86intrin.h>
4194///
4195/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4196///
4197/// \param __w
4198/// A double-precision floating-point value used to initialize each vector
4199/// element of the result.
4200/// \returns An initialized 256-bit floating-point vector of [4 x double].
4201static __inline __m256d __DEFAULT_FN_ATTRS
4202_mm256_set1_pd(double __w)
4203{
4204 return _mm256_set_pd(__w, __w, __w, __w);
4205}
4206
4207/// Constructs a 256-bit floating-point vector of [8 x float], with each
4208/// of the eight single-precision floating-point vector elements set to the
4209/// specified single-precision floating-point value.
4210///
4211/// \headerfile <x86intrin.h>
4212///
4213/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4214/// instruction.
4215///
4216/// \param __w
4217/// A single-precision floating-point value used to initialize each vector
4218/// element of the result.
4219/// \returns An initialized 256-bit floating-point vector of [8 x float].
4220static __inline __m256 __DEFAULT_FN_ATTRS
4221_mm256_set1_ps(float __w)
4222{
4223 return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4224}
4225
4226/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4227/// 32-bit integral vector elements set to the specified 32-bit integral
4228/// value.
4229///
4230/// \headerfile <x86intrin.h>
4231///
4232/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4233/// instruction.
4234///
4235/// \param __i
4236/// A 32-bit integral value used to initialize each vector element of the
4237/// result.
4238/// \returns An initialized 256-bit integer vector of [8 x i32].
4239static __inline __m256i __DEFAULT_FN_ATTRS
4240_mm256_set1_epi32(int __i)
4241{
4242 return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4243}
4244
4245/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4246/// 16-bit integral vector elements set to the specified 16-bit integral
4247/// value.
4248///
4249/// \headerfile <x86intrin.h>
4250///
4251/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4252///
4253/// \param __w
4254/// A 16-bit integral value used to initialize each vector element of the
4255/// result.
4256/// \returns An initialized 256-bit integer vector of [16 x i16].
4257static __inline __m256i __DEFAULT_FN_ATTRS
4258_mm256_set1_epi16(short __w)
4259{
4260 return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4261 __w, __w, __w, __w, __w, __w, __w, __w);
4262}
4263
4264/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4265/// 8-bit integral vector elements set to the specified 8-bit integral value.
4266///
4267/// \headerfile <x86intrin.h>
4268///
4269/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4270///
4271/// \param __b
4272/// An 8-bit integral value used to initialize each vector element of the
4273/// result.
4274/// \returns An initialized 256-bit integer vector of [32 x i8].
4275static __inline __m256i __DEFAULT_FN_ATTRS
4276_mm256_set1_epi8(char __b)
4277{
4278 return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4279 __b, __b, __b, __b, __b, __b, __b, __b,
4280 __b, __b, __b, __b, __b, __b, __b, __b,
4281 __b, __b, __b, __b, __b, __b, __b, __b);
4282}
4283
4284/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4285/// 64-bit integral vector elements set to the specified 64-bit integral
4286/// value.
4287///
4288/// \headerfile <x86intrin.h>
4289///
4290/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4291///
4292/// \param __q
4293/// A 64-bit integral value used to initialize each vector element of the
4294/// result.
4295/// \returns An initialized 256-bit integer vector of [4 x i64].
4296static __inline __m256i __DEFAULT_FN_ATTRS
4297_mm256_set1_epi64x(long long __q)
4298{
4299 return _mm256_set_epi64x(__q, __q, __q, __q);
4300}
4301
/* Create zeroed vectors */
4303/// Constructs a 256-bit floating-point vector of [4 x double] with all
4304/// vector elements initialized to zero.
4305///
4306/// \headerfile <x86intrin.h>
4307///
4308/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4309///
4310/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4311static __inline __m256d __DEFAULT_FN_ATTRS
4312_mm256_setzero_pd(void)
4313{
4314 return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
4315}
4316
4317/// Constructs a 256-bit floating-point vector of [8 x float] with all
4318/// vector elements initialized to zero.
4319///
4320/// \headerfile <x86intrin.h>
4321///
4322/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4323///
4324/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4325static __inline __m256 __DEFAULT_FN_ATTRS
4326_mm256_setzero_ps(void)
4327{
4328 return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4329}
4330
4331/// Constructs a 256-bit integer vector initialized to zero.
4332///
4333/// \headerfile <x86intrin.h>
4334///
4335/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4336///
4337/// \returns A 256-bit integer vector initialized to zero.
4338static __inline __m256i __DEFAULT_FN_ATTRS
4339_mm256_setzero_si256(void)
4340{
4341 return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4342}
4343
4344/* Cast between vector types */
4345/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4346/// floating-point vector of [8 x float].
4347///
4348/// \headerfile <x86intrin.h>
4349///
4350/// This intrinsic has no corresponding instruction.
4351///
4352/// \param __a
4353/// A 256-bit floating-point vector of [4 x double].
4354/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4355/// bitwise pattern as the parameter.
4356static __inline __m256 __DEFAULT_FN_ATTRS
4357_mm256_castpd_ps(__m256d __a)
4358{
4359 return (__m256)__a;
4360}
4361
4362/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4363/// integer vector.
4364///
4365/// \headerfile <x86intrin.h>
4366///
4367/// This intrinsic has no corresponding instruction.
4368///
4369/// \param __a
4370/// A 256-bit floating-point vector of [4 x double].
4371/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4372/// parameter.
4373static __inline __m256i __DEFAULT_FN_ATTRS
4374_mm256_castpd_si256(__m256d __a)
4375{
4376 return (__m256i)__a;
4377}
4378
4379/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4380/// floating-point vector of [4 x double].
4381///
4382/// \headerfile <x86intrin.h>
4383///
4384/// This intrinsic has no corresponding instruction.
4385///
4386/// \param __a
4387/// A 256-bit floating-point vector of [8 x float].
4388/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4389/// bitwise pattern as the parameter.
4390static __inline __m256d __DEFAULT_FN_ATTRS
4391_mm256_castps_pd(__m256 __a)
4392{
4393 return (__m256d)__a;
4394}
4395
4396/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4397/// integer vector.
4398///
4399/// \headerfile <x86intrin.h>
4400///
4401/// This intrinsic has no corresponding instruction.
4402///
4403/// \param __a
4404/// A 256-bit floating-point vector of [8 x float].
4405/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4406/// parameter.
4407static __inline __m256i __DEFAULT_FN_ATTRS
4408_mm256_castps_si256(__m256 __a)
4409{
4410 return (__m256i)__a;
4411}
4412
4413/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4414/// of [8 x float].
4415///
4416/// \headerfile <x86intrin.h>
4417///
4418/// This intrinsic has no corresponding instruction.
4419///
4420/// \param __a
4421/// A 256-bit integer vector.
4422/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4423/// bitwise pattern as the parameter.
4424static __inline __m256 __DEFAULT_FN_ATTRS
4425_mm256_castsi256_ps(__m256i __a)
4426{
4427 return (__m256)__a;
4428}
4429
4430/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4431/// of [4 x double].
4432///
4433/// \headerfile <x86intrin.h>
4434///
4435/// This intrinsic has no corresponding instruction.
4436///
4437/// \param __a
4438/// A 256-bit integer vector.
4439/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4440/// bitwise pattern as the parameter.
4441static __inline __m256d __DEFAULT_FN_ATTRS
4442_mm256_castsi256_pd(__m256i __a)
4443{
4444 return (__m256d)__a;
4445}
4446
4447/// Returns the lower 128 bits of a 256-bit floating-point vector of
4448/// [4 x double] as a 128-bit floating-point vector of [2 x double].
4449///
4450/// \headerfile <x86intrin.h>
4451///
4452/// This intrinsic has no corresponding instruction.
4453///
4454/// \param __a
4455/// A 256-bit floating-point vector of [4 x double].
4456/// \returns A 128-bit floating-point vector of [2 x double] containing the
4457/// lower 128 bits of the parameter.
4458static __inline __m128d __DEFAULT_FN_ATTRS
4459_mm256_castpd256_pd128(__m256d __a)
4460{
4461 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4462}
4463
4464/// Returns the lower 128 bits of a 256-bit floating-point vector of
4465/// [8 x float] as a 128-bit floating-point vector of [4 x float].
4466///
4467/// \headerfile <x86intrin.h>
4468///
4469/// This intrinsic has no corresponding instruction.
4470///
4471/// \param __a
4472/// A 256-bit floating-point vector of [8 x float].
4473/// \returns A 128-bit floating-point vector of [4 x float] containing the
4474/// lower 128 bits of the parameter.
4475static __inline __m128 __DEFAULT_FN_ATTRS
4476_mm256_castps256_ps128(__m256 __a)
4477{
4478 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4479}
4480
4481/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4482///
4483/// \headerfile <x86intrin.h>
4484///
4485/// This intrinsic has no corresponding instruction.
4486///
4487/// \param __a
4488/// A 256-bit integer vector.
4489/// \returns A 128-bit integer vector containing the lower 128 bits of the
4490/// parameter.
4491static __inline __m128i __DEFAULT_FN_ATTRS
4492_mm256_castsi256_si128(__m256i __a)
4493{
4494 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4495}
4496
4497/// Constructs a 256-bit floating-point vector of [4 x double] from a
4498/// 128-bit floating-point vector of [2 x double].
4499///
4500/// The lower 128 bits contain the value of the source vector. The contents
4501/// of the upper 128 bits are undefined.
4502///
4503/// \headerfile <x86intrin.h>
4504///
4505/// This intrinsic has no corresponding instruction.
4506///
4507/// \param __a
4508/// A 128-bit vector of [2 x double].
4509/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4510/// contain the value of the parameter. The contents of the upper 128 bits
4511/// are undefined.
4512static __inline __m256d __DEFAULT_FN_ATTRS
4513_mm256_castpd128_pd256(__m128d __a)
4514{
4515 return __builtin_shufflevector(
4516 (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4517}
4518
4519/// Constructs a 256-bit floating-point vector of [8 x float] from a
4520/// 128-bit floating-point vector of [4 x float].
4521///
4522/// The lower 128 bits contain the value of the source vector. The contents
4523/// of the upper 128 bits are undefined.
4524///
4525/// \headerfile <x86intrin.h>
4526///
4527/// This intrinsic has no corresponding instruction.
4528///
4529/// \param __a
4530/// A 128-bit vector of [4 x float].
4531/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4532/// contain the value of the parameter. The contents of the upper 128 bits
4533/// are undefined.
4534static __inline __m256 __DEFAULT_FN_ATTRS
4535_mm256_castps128_ps256(__m128 __a)
4536{
4537 return __builtin_shufflevector((__v4sf)__a,
4538 (__v4sf)__builtin_nondeterministic_value(__a),
4539 0, 1, 2, 3, 4, 5, 6, 7);
4540}
4541
4542/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4543///
4544/// The lower 128 bits contain the value of the source vector. The contents
4545/// of the upper 128 bits are undefined.
4546///
4547/// \headerfile <x86intrin.h>
4548///
4549/// This intrinsic has no corresponding instruction.
4550///
4551/// \param __a
4552/// A 128-bit integer vector.
4553/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4554/// the parameter. The contents of the upper 128 bits are undefined.
4555static __inline __m256i __DEFAULT_FN_ATTRS
4556_mm256_castsi128_si256(__m128i __a)
4557{
4558 return __builtin_shufflevector(
4559 (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4560}
4561
4562/// Constructs a 256-bit floating-point vector of [4 x double] from a
4563/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
4564/// contain the value of the source vector. The upper 128 bits are set
4565/// to zero.
4566///
4567/// \headerfile <x86intrin.h>
4568///
4569/// This intrinsic has no corresponding instruction.
4570///
4571/// \param __a
4572/// A 128-bit vector of [2 x double].
4573/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4574/// contain the value of the parameter. The upper 128 bits are set to zero.
4575static __inline __m256d __DEFAULT_FN_ATTRS
4576_mm256_zextpd128_pd256(__m128d __a)
4577{
4578 return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4579}
4580
4581/// Constructs a 256-bit floating-point vector of [8 x float] from a
4582/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4583/// the value of the source vector. The upper 128 bits are set to zero.
4584///
4585/// \headerfile <x86intrin.h>
4586///
4587/// This intrinsic has no corresponding instruction.
4588///
4589/// \param __a
4590/// A 128-bit vector of [4 x float].
4591/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4592/// contain the value of the parameter. The upper 128 bits are set to zero.
4593static __inline __m256 __DEFAULT_FN_ATTRS
4594_mm256_zextps128_ps256(__m128 __a)
4595{
4596 return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4597}
4598
4599/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4600/// The lower 128 bits contain the value of the source vector. The upper
4601/// 128 bits are set to zero.
4602///
4603/// \headerfile <x86intrin.h>
4604///
4605/// This intrinsic has no corresponding instruction.
4606///
4607/// \param __a
4608/// A 128-bit integer vector.
4609/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4610/// the parameter. The upper 128 bits are set to zero.
4611static __inline __m256i __DEFAULT_FN_ATTRS
4612_mm256_zextsi128_si256(__m128i __a)
4613{
4614 return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4615}
4616
4617/*
4618 Vector insert.
4619 We use macros rather than inlines because we only want to accept
4620 invocations where the immediate M is a constant expression.
4621*/
4622/// Constructs a new 256-bit vector of [8 x float] by first duplicating
4623/// a 256-bit vector of [8 x float] given in the first parameter, and then
4624/// replacing either the upper or the lower 128 bits with the contents of a
4625/// 128-bit vector of [4 x float] in the second parameter.
4626///
4627/// The immediate integer parameter determines between the upper or the lower
4628/// 128 bits.
4629///
4630/// \headerfile <x86intrin.h>
4631///
4632/// \code
4633/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
4634/// \endcode
4635///
4636/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4637///
4638/// \param V1
4639/// A 256-bit vector of [8 x float]. This vector is copied to the result
4640/// first, and then either the upper or the lower 128 bits of the result will
4641/// be replaced by the contents of \a V2.
4642/// \param V2
4643/// A 128-bit vector of [4 x float]. The contents of this parameter are
4644/// written to either the upper or the lower 128 bits of the result depending
4645/// on the value of parameter \a M.
4646/// \param M
4647/// An immediate integer. The least significant bit determines how the values
4648/// from the two parameters are interleaved: \n
/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
/// result. \n
/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
/// result.
4655/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
  /* A macro, not an inline function, so M must be a constant expression. */ \
  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
                                            (__v4sf)(__m128)(V2), (int)(M)))
4659
/// Builds a 256-bit vector of [4 x double] from a copy of the 256-bit
///    vector of [4 x double] given in the first parameter, with either its
///    lower or its upper 128 bits overwritten by the 128-bit vector of
///    [2 x double] given in the second parameter.
///
///    The immediate integer parameter selects which 128-bit half is
///    overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double] that supplies the initial contents of
///    the result; one of its 128-bit halves is then replaced by \a V2.
/// \param V2
///    A 128-bit vector of [2 x double]. It is written to either the lower or
///    the upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer whose least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256( \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4697
/// Builds a 256-bit integer vector from a copy of the 256-bit integer
///    vector given in the first parameter, with either its lower or its upper
///    128 bits overwritten by the 128-bit integer vector given in the second
///    parameter.
///
///    The immediate integer parameter selects which 128-bit half is
///    overwritten.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector that supplies the initial contents of the
///    result; one of its 128-bit halves is then replaced by \a V2.
/// \param V2
///    A 128-bit integer vector. It is written to either the lower or the
///    upper 128 bits of the result, as selected by \a M.
/// \param M
///    An immediate integer whose least significant bit selects the half that
///    is replaced: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256( \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4735
4736/*
4737 Vector extract.
4738 We use macros rather than inlines because we only want to accept
4739 invocations where the immediate M is a constant expression.
4740*/
/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a
///    128-bit vector of [4 x float]. The immediate integer parameter selects
///    whether the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float] to extract from.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), (int)(M)))
4764
/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a
///    128-bit vector of [2 x double]. The immediate integer parameter selects
///    whether the lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] to extract from.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), (int)(M)))
4788
/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit
///    integer vector. The immediate integer parameter selects whether the
///    lower or the upper half is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector to extract from.
/// \param M
///    An immediate integer whose least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
///    result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256( \
      (__v8si)(__m256i)(V), (int)(M)))
4812
4813/// Constructs a 256-bit floating-point vector of [8 x float] by
4814/// concatenating two 128-bit floating-point vectors of [4 x float].
4815///
4816/// \headerfile <x86intrin.h>
4817///
4818/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4819///
4820/// \param __hi
4821/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4822/// 128 bits of the result.
4823/// \param __lo
4824/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4825/// 128 bits of the result.
4826/// \returns A 256-bit floating-point vector of [8 x float] containing the
4827/// concatenated result.
4828static __inline __m256 __DEFAULT_FN_ATTRS
4829_mm256_set_m128 (__m128 __hi, __m128 __lo)
4830{
4831 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4832}
4833
4834/// Constructs a 256-bit floating-point vector of [4 x double] by
4835/// concatenating two 128-bit floating-point vectors of [2 x double].
4836///
4837/// \headerfile <x86intrin.h>
4838///
4839/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4840///
4841/// \param __hi
4842/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4843/// 128 bits of the result.
4844/// \param __lo
4845/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4846/// 128 bits of the result.
4847/// \returns A 256-bit floating-point vector of [4 x double] containing the
4848/// concatenated result.
4849static __inline __m256d __DEFAULT_FN_ATTRS
4850_mm256_set_m128d (__m128d __hi, __m128d __lo)
4851{
4852 return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4853}
4854
4855/// Constructs a 256-bit integer vector by concatenating two 128-bit
4856/// integer vectors.
4857///
4858/// \headerfile <x86intrin.h>
4859///
4860/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4861///
4862/// \param __hi
4863/// A 128-bit integer vector to be copied to the upper 128 bits of the
4864/// result.
4865/// \param __lo
4866/// A 128-bit integer vector to be copied to the lower 128 bits of the
4867/// result.
4868/// \returns A 256-bit integer vector containing the concatenated result.
4869static __inline __m256i __DEFAULT_FN_ATTRS
4870_mm256_set_m128i (__m128i __hi, __m128i __lo)
4871{
4872 return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4873}
4874
4875/// Constructs a 256-bit floating-point vector of [8 x float] by
4876/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
4877/// similar to _mm256_set_m128, but the order of the input parameters is
4878/// swapped.
4879///
4880/// \headerfile <x86intrin.h>
4881///
4882/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4883///
4884/// \param __lo
4885/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
4886/// 128 bits of the result.
4887/// \param __hi
4888/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
4889/// 128 bits of the result.
4890/// \returns A 256-bit floating-point vector of [8 x float] containing the
4891/// concatenated result.
4892static __inline __m256 __DEFAULT_FN_ATTRS
4893_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4894{
4895 return _mm256_set_m128(__hi, __lo);
4896}
4897
4898/// Constructs a 256-bit floating-point vector of [4 x double] by
4899/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
4900/// similar to _mm256_set_m128d, but the order of the input parameters is
4901/// swapped.
4902///
4903/// \headerfile <x86intrin.h>
4904///
4905/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4906///
4907/// \param __lo
4908/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
4909/// 128 bits of the result.
4910/// \param __hi
4911/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
4912/// 128 bits of the result.
4913/// \returns A 256-bit floating-point vector of [4 x double] containing the
4914/// concatenated result.
4915static __inline __m256d __DEFAULT_FN_ATTRS
4916_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4917{
4918 return (__m256d)_mm256_set_m128d(__hi, __lo);
4919}
4920
4921/// Constructs a 256-bit integer vector by concatenating two 128-bit
4922/// integer vectors. This is similar to _mm256_set_m128i, but the order of
4923/// the input parameters is swapped.
4924///
4925/// \headerfile <x86intrin.h>
4926///
4927/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4928///
4929/// \param __lo
4930/// A 128-bit integer vector to be copied to the lower 128 bits of the
4931/// result.
4932/// \param __hi
4933/// A 128-bit integer vector to be copied to the upper 128 bits of the
4934/// result.
4935/// \returns A 256-bit integer vector containing the concatenated result.
4936static __inline __m256i __DEFAULT_FN_ATTRS
4937_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4938{
4939 return (__m256i)_mm256_set_m128i(__hi, __lo);
4940}
4941
4942/* SIMD load ops (unaligned) */
4943/// Loads two 128-bit floating-point vectors of [4 x float] from
4944/// unaligned memory locations and constructs a 256-bit floating-point vector
4945/// of [8 x float] by concatenating the two 128-bit vectors.
4946///
4947/// \headerfile <x86intrin.h>
4948///
4949/// This intrinsic corresponds to load instructions followed by the
4950/// <c> VINSERTF128 </c> instruction.
4951///
4952/// \param __addr_hi
4953/// A pointer to a 128-bit memory location containing 4 consecutive
4954/// single-precision floating-point values. These values are to be copied to
4955/// bits[255:128] of the result. The address of the memory location does not
4956/// have to be aligned.
4957/// \param __addr_lo
4958/// A pointer to a 128-bit memory location containing 4 consecutive
4959/// single-precision floating-point values. These values are to be copied to
4960/// bits[127:0] of the result. The address of the memory location does not
4961/// have to be aligned.
4962/// \returns A 256-bit floating-point vector of [8 x float] containing the
4963/// concatenated result.
4964static __inline __m256 __DEFAULT_FN_ATTRS
4965_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4966{
4967 return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4968}
4969
4970/// Loads two 128-bit floating-point vectors of [2 x double] from
4971/// unaligned memory locations and constructs a 256-bit floating-point vector
4972/// of [4 x double] by concatenating the two 128-bit vectors.
4973///
4974/// \headerfile <x86intrin.h>
4975///
4976/// This intrinsic corresponds to load instructions followed by the
4977/// <c> VINSERTF128 </c> instruction.
4978///
4979/// \param __addr_hi
4980/// A pointer to a 128-bit memory location containing two consecutive
4981/// double-precision floating-point values. These values are to be copied to
4982/// bits[255:128] of the result. The address of the memory location does not
4983/// have to be aligned.
4984/// \param __addr_lo
4985/// A pointer to a 128-bit memory location containing two consecutive
4986/// double-precision floating-point values. These values are to be copied to
4987/// bits[127:0] of the result. The address of the memory location does not
4988/// have to be aligned.
4989/// \returns A 256-bit floating-point vector of [4 x double] containing the
4990/// concatenated result.
4991static __inline __m256d __DEFAULT_FN_ATTRS
4992_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
4993{
4994 return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
4995}
4996
4997/// Loads two 128-bit integer vectors from unaligned memory locations and
4998/// constructs a 256-bit integer vector by concatenating the two 128-bit
4999/// vectors.
5000///
5001/// \headerfile <x86intrin.h>
5002///
5003/// This intrinsic corresponds to load instructions followed by the
5004/// <c> VINSERTF128 </c> instruction.
5005///
5006/// \param __addr_hi
5007/// A pointer to a 128-bit memory location containing a 128-bit integer
5008/// vector. This vector is to be copied to bits[255:128] of the result. The
5009/// address of the memory location does not have to be aligned.
5010/// \param __addr_lo
5011/// A pointer to a 128-bit memory location containing a 128-bit integer
5012/// vector. This vector is to be copied to bits[127:0] of the result. The
5013/// address of the memory location does not have to be aligned.
5014/// \returns A 256-bit integer vector containing the concatenated result.
5015static __inline __m256i __DEFAULT_FN_ATTRS
5016_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5017{
5018 return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5019}
5020
5021/* SIMD store ops (unaligned) */
5022/// Stores the upper and lower 128 bits of a 256-bit floating-point
5023/// vector of [8 x float] into two different unaligned memory locations.
5024///
5025/// \headerfile <x86intrin.h>
5026///
5027/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5028/// store instructions.
5029///
5030/// \param __addr_hi
5031/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5032/// copied to this memory location. The address of this memory location does
5033/// not have to be aligned.
5034/// \param __addr_lo
5035/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5036/// copied to this memory location. The address of this memory location does
5037/// not have to be aligned.
5038/// \param __a
5039/// A 256-bit floating-point vector of [8 x float].
5040static __inline void __DEFAULT_FN_ATTRS
5041_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5042{
5043 __m128 __v128;
5044
5045 __v128 = _mm256_castps256_ps128(__a);
5046 _mm_storeu_ps(__addr_lo, __v128);
5047 __v128 = _mm256_extractf128_ps(__a, 1);
5048 _mm_storeu_ps(__addr_hi, __v128);
5049}
5050
5051/// Stores the upper and lower 128 bits of a 256-bit floating-point
5052/// vector of [4 x double] into two different unaligned memory locations.
5053///
5054/// \headerfile <x86intrin.h>
5055///
5056/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5057/// store instructions.
5058///
5059/// \param __addr_hi
5060/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5061/// copied to this memory location. The address of this memory location does
5062/// not have to be aligned.
5063/// \param __addr_lo
5064/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5065/// copied to this memory location. The address of this memory location does
5066/// not have to be aligned.
5067/// \param __a
5068/// A 256-bit floating-point vector of [4 x double].
5069static __inline void __DEFAULT_FN_ATTRS
5070_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5071{
5072 __m128d __v128;
5073
5074 __v128 = _mm256_castpd256_pd128(__a);
5075 _mm_storeu_pd(__addr_lo, __v128);
5076 __v128 = _mm256_extractf128_pd(__a, 1);
5077 _mm_storeu_pd(__addr_hi, __v128);
5078}
5079
5080/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5081/// two different unaligned memory locations.
5082///
5083/// \headerfile <x86intrin.h>
5084///
5085/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5086/// store instructions.
5087///
5088/// \param __addr_hi
5089/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5090/// copied to this memory location. The address of this memory location does
5091/// not have to be aligned.
5092/// \param __addr_lo
5093/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5094/// copied to this memory location. The address of this memory location does
5095/// not have to be aligned.
5096/// \param __a
5097/// A 256-bit integer vector.
5098static __inline void __DEFAULT_FN_ATTRS
5099_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5100{
5101 __m128i __v128;
5102
5103 __v128 = _mm256_castsi256_si128(__a);
5104 _mm_storeu_si128(__addr_lo, __v128);
5105 __v128 = _mm256_extractf128_si256(__a, 1);
5106 _mm_storeu_si128(__addr_hi, __v128);
5107}
5108
5109#undef __DEFAULT_FN_ATTRS
5110#undef __DEFAULT_FN_ATTRS128
5111
5112#endif /* __AVXINTRIN_H */
5113

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avxintrin.h