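/* Listing-viewer notice (not part of the header itself): "This file is not a C or C++ file. It does not have highlighting." The text that follows is the C header xmmintrin.h. */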

/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __XMMINTRIN_H
11#define __XMMINTRIN_H
12
/* Stop the build on any target that defines neither __i386__ nor
 * __x86_64__. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <mmintrin.h>

/* 16-byte vectors viewed as four 32-bit lanes of a given element type. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* 128-bit vector of float, declared with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same size and element type as __m128, but declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));

/* This header should only be included in a hosted environment as it depends on
 * a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS:     always-inline, nodebug, target "sse,no-evex512",
 *                         minimum vector width 128.
 * __DEFAULT_FN_ATTRS_MMX: always-inline, nodebug, target "mmx,sse,no-evex512",
 *                         minimum vector width 64.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))

42/// Adds the 32-bit float values in the low-order bits of the operands.
43///
44/// \headerfile <x86intrin.h>
45///
46/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
47///
48/// \param __a
49/// A 128-bit vector of [4 x float] containing one of the source operands.
50/// The lower 32 bits of this operand are used in the calculation.
51/// \param __b
52/// A 128-bit vector of [4 x float] containing one of the source operands.
53/// The lower 32 bits of this operand are used in the calculation.
54/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
55/// of the lower 32 bits of both operands. The upper 96 bits are copied from
56/// the upper 96 bits of the first source operand.
57static __inline__ __m128 __DEFAULT_FN_ATTRS
58_mm_add_ss(__m128 __a, __m128 __b)
59{
60 __a[0] += __b[0];
61 return __a;
62}
63
64/// Adds two 128-bit vectors of [4 x float], and returns the results of
65/// the addition.
66///
67/// \headerfile <x86intrin.h>
68///
69/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
70///
71/// \param __a
72/// A 128-bit vector of [4 x float] containing one of the source operands.
73/// \param __b
74/// A 128-bit vector of [4 x float] containing one of the source operands.
75/// \returns A 128-bit vector of [4 x float] containing the sums of both
76/// operands.
77static __inline__ __m128 __DEFAULT_FN_ATTRS
78_mm_add_ps(__m128 __a, __m128 __b)
79{
80 return (__m128)((__v4sf)__a + (__v4sf)__b);
81}
82
83/// Subtracts the 32-bit float value in the low-order bits of the second
84/// operand from the corresponding value in the first operand.
85///
86/// \headerfile <x86intrin.h>
87///
88/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
89///
90/// \param __a
91/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
92/// of this operand are used in the calculation.
93/// \param __b
94/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
95/// bits of this operand are used in the calculation.
96/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
97/// difference of the lower 32 bits of both operands. The upper 96 bits are
98/// copied from the upper 96 bits of the first source operand.
99static __inline__ __m128 __DEFAULT_FN_ATTRS
100_mm_sub_ss(__m128 __a, __m128 __b)
101{
102 __a[0] -= __b[0];
103 return __a;
104}
105
106/// Subtracts each of the values of the second operand from the first
107/// operand, both of which are 128-bit vectors of [4 x float] and returns
108/// the results of the subtraction.
109///
110/// \headerfile <x86intrin.h>
111///
112/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
113///
114/// \param __a
115/// A 128-bit vector of [4 x float] containing the minuend.
116/// \param __b
117/// A 128-bit vector of [4 x float] containing the subtrahend.
118/// \returns A 128-bit vector of [4 x float] containing the differences between
119/// both operands.
120static __inline__ __m128 __DEFAULT_FN_ATTRS
121_mm_sub_ps(__m128 __a, __m128 __b)
122{
123 return (__m128)((__v4sf)__a - (__v4sf)__b);
124}
125
126/// Multiplies two 32-bit float values in the low-order bits of the
127/// operands.
128///
129/// \headerfile <x86intrin.h>
130///
131/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
132///
133/// \param __a
134/// A 128-bit vector of [4 x float] containing one of the source operands.
135/// The lower 32 bits of this operand are used in the calculation.
136/// \param __b
137/// A 128-bit vector of [4 x float] containing one of the source operands.
138/// The lower 32 bits of this operand are used in the calculation.
139/// \returns A 128-bit vector of [4 x float] containing the product of the lower
140/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
141/// bits of the first source operand.
142static __inline__ __m128 __DEFAULT_FN_ATTRS
143_mm_mul_ss(__m128 __a, __m128 __b)
144{
145 __a[0] *= __b[0];
146 return __a;
147}
148
149/// Multiplies two 128-bit vectors of [4 x float] and returns the
150/// results of the multiplication.
151///
152/// \headerfile <x86intrin.h>
153///
154/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
155///
156/// \param __a
157/// A 128-bit vector of [4 x float] containing one of the source operands.
158/// \param __b
159/// A 128-bit vector of [4 x float] containing one of the source operands.
160/// \returns A 128-bit vector of [4 x float] containing the products of both
161/// operands.
162static __inline__ __m128 __DEFAULT_FN_ATTRS
163_mm_mul_ps(__m128 __a, __m128 __b)
164{
165 return (__m128)((__v4sf)__a * (__v4sf)__b);
166}
167
168/// Divides the value in the low-order 32 bits of the first operand by
169/// the corresponding value in the second operand.
170///
171/// \headerfile <x86intrin.h>
172///
173/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
174///
175/// \param __a
176/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
177/// bits of this operand are used in the calculation.
178/// \param __b
179/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
180/// of this operand are used in the calculation.
181/// \returns A 128-bit vector of [4 x float] containing the quotients of the
182/// lower 32 bits of both operands. The upper 96 bits are copied from the
183/// upper 96 bits of the first source operand.
184static __inline__ __m128 __DEFAULT_FN_ATTRS
185_mm_div_ss(__m128 __a, __m128 __b)
186{
187 __a[0] /= __b[0];
188 return __a;
189}
190
191/// Divides two 128-bit vectors of [4 x float].
192///
193/// \headerfile <x86intrin.h>
194///
195/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
196///
197/// \param __a
198/// A 128-bit vector of [4 x float] containing the dividend.
199/// \param __b
200/// A 128-bit vector of [4 x float] containing the divisor.
201/// \returns A 128-bit vector of [4 x float] containing the quotients of both
202/// operands.
203static __inline__ __m128 __DEFAULT_FN_ATTRS
204_mm_div_ps(__m128 __a, __m128 __b)
205{
206 return (__m128)((__v4sf)__a / (__v4sf)__b);
207}
208
209/// Calculates the square root of the value stored in the low-order bits
210/// of a 128-bit vector of [4 x float].
211///
212/// \headerfile <x86intrin.h>
213///
214/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
215///
216/// \param __a
217/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
218/// used in the calculation.
219/// \returns A 128-bit vector of [4 x float] containing the square root of the
220/// value in the low-order bits of the operand.
221static __inline__ __m128 __DEFAULT_FN_ATTRS
222_mm_sqrt_ss(__m128 __a)
223{
224 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
225}
226
227/// Calculates the square roots of the values stored in a 128-bit vector
228/// of [4 x float].
229///
230/// \headerfile <x86intrin.h>
231///
232/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
233///
234/// \param __a
235/// A 128-bit vector of [4 x float].
236/// \returns A 128-bit vector of [4 x float] containing the square roots of the
237/// values in the operand.
238static __inline__ __m128 __DEFAULT_FN_ATTRS
239_mm_sqrt_ps(__m128 __a)
240{
241 return __builtin_ia32_sqrtps((__v4sf)__a);
242}
243
244/// Calculates the approximate reciprocal of the value stored in the
245/// low-order bits of a 128-bit vector of [4 x float].
246///
247/// \headerfile <x86intrin.h>
248///
249/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
250///
251/// \param __a
252/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
253/// used in the calculation.
254/// \returns A 128-bit vector of [4 x float] containing the approximate
255/// reciprocal of the value in the low-order bits of the operand.
256static __inline__ __m128 __DEFAULT_FN_ATTRS
257_mm_rcp_ss(__m128 __a)
258{
259 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
260}
261
262/// Calculates the approximate reciprocals of the values stored in a
263/// 128-bit vector of [4 x float].
264///
265/// \headerfile <x86intrin.h>
266///
267/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
268///
269/// \param __a
270/// A 128-bit vector of [4 x float].
271/// \returns A 128-bit vector of [4 x float] containing the approximate
272/// reciprocals of the values in the operand.
273static __inline__ __m128 __DEFAULT_FN_ATTRS
274_mm_rcp_ps(__m128 __a)
275{
276 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
277}
278
279/// Calculates the approximate reciprocal of the square root of the value
280/// stored in the low-order bits of a 128-bit vector of [4 x float].
281///
282/// \headerfile <x86intrin.h>
283///
284/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
285///
286/// \param __a
287/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
288/// used in the calculation.
289/// \returns A 128-bit vector of [4 x float] containing the approximate
290/// reciprocal of the square root of the value in the low-order bits of the
291/// operand.
292static __inline__ __m128 __DEFAULT_FN_ATTRS
293_mm_rsqrt_ss(__m128 __a)
294{
295 return __builtin_ia32_rsqrtss((__v4sf)__a);
296}
297
298/// Calculates the approximate reciprocals of the square roots of the
299/// values stored in a 128-bit vector of [4 x float].
300///
301/// \headerfile <x86intrin.h>
302///
303/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
304///
305/// \param __a
306/// A 128-bit vector of [4 x float].
307/// \returns A 128-bit vector of [4 x float] containing the approximate
308/// reciprocals of the square roots of the values in the operand.
309static __inline__ __m128 __DEFAULT_FN_ATTRS
310_mm_rsqrt_ps(__m128 __a)
311{
312 return __builtin_ia32_rsqrtps((__v4sf)__a);
313}
314
315/// Compares two 32-bit float values in the low-order bits of both
316/// operands and returns the lesser value in the low-order bits of the
317/// vector of [4 x float].
318///
319/// \headerfile <x86intrin.h>
320///
321/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
322///
323/// \param __a
324/// A 128-bit vector of [4 x float] containing one of the operands. The lower
325/// 32 bits of this operand are used in the comparison.
326/// \param __b
327/// A 128-bit vector of [4 x float] containing one of the operands. The lower
328/// 32 bits of this operand are used in the comparison.
329/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
330/// minimum value between both operands. The upper 96 bits are copied from
331/// the upper 96 bits of the first source operand.
332static __inline__ __m128 __DEFAULT_FN_ATTRS
333_mm_min_ss(__m128 __a, __m128 __b)
334{
335 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
336}
337
338/// Compares two 128-bit vectors of [4 x float] and returns the lesser
339/// of each pair of values.
340///
341/// \headerfile <x86intrin.h>
342///
343/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
344///
345/// \param __a
346/// A 128-bit vector of [4 x float] containing one of the operands.
347/// \param __b
348/// A 128-bit vector of [4 x float] containing one of the operands.
349/// \returns A 128-bit vector of [4 x float] containing the minimum values
350/// between both operands.
351static __inline__ __m128 __DEFAULT_FN_ATTRS
352_mm_min_ps(__m128 __a, __m128 __b)
353{
354 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
355}
356
357/// Compares two 32-bit float values in the low-order bits of both
358/// operands and returns the greater value in the low-order bits of a 128-bit
359/// vector of [4 x float].
360///
361/// \headerfile <x86intrin.h>
362///
363/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
364///
365/// \param __a
366/// A 128-bit vector of [4 x float] containing one of the operands. The lower
367/// 32 bits of this operand are used in the comparison.
368/// \param __b
369/// A 128-bit vector of [4 x float] containing one of the operands. The lower
370/// 32 bits of this operand are used in the comparison.
371/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
372/// maximum value between both operands. The upper 96 bits are copied from
373/// the upper 96 bits of the first source operand.
374static __inline__ __m128 __DEFAULT_FN_ATTRS
375_mm_max_ss(__m128 __a, __m128 __b)
376{
377 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
378}
379
380/// Compares two 128-bit vectors of [4 x float] and returns the greater
381/// of each pair of values.
382///
383/// \headerfile <x86intrin.h>
384///
385/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
386///
387/// \param __a
388/// A 128-bit vector of [4 x float] containing one of the operands.
389/// \param __b
390/// A 128-bit vector of [4 x float] containing one of the operands.
391/// \returns A 128-bit vector of [4 x float] containing the maximum values
392/// between both operands.
393static __inline__ __m128 __DEFAULT_FN_ATTRS
394_mm_max_ps(__m128 __a, __m128 __b)
395{
396 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
397}
398
399/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
400///
401/// \headerfile <x86intrin.h>
402///
403/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
404///
405/// \param __a
406/// A 128-bit vector containing one of the source operands.
407/// \param __b
408/// A 128-bit vector containing one of the source operands.
409/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
410/// values between both operands.
411static __inline__ __m128 __DEFAULT_FN_ATTRS
412_mm_and_ps(__m128 __a, __m128 __b)
413{
414 return (__m128)((__v4su)__a & (__v4su)__b);
415}
416
417/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
418/// the one's complement of the values contained in the first source
419/// operand.
420///
421/// \headerfile <x86intrin.h>
422///
423/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
424///
425/// \param __a
426/// A 128-bit vector of [4 x float] containing the first source operand. The
427/// one's complement of this value is used in the bitwise AND.
428/// \param __b
429/// A 128-bit vector of [4 x float] containing the second source operand.
430/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
431/// one's complement of the first operand and the values in the second
432/// operand.
433static __inline__ __m128 __DEFAULT_FN_ATTRS
434_mm_andnot_ps(__m128 __a, __m128 __b)
435{
436 return (__m128)(~(__v4su)__a & (__v4su)__b);
437}
438
439/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
440///
441/// \headerfile <x86intrin.h>
442///
443/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
444///
445/// \param __a
446/// A 128-bit vector of [4 x float] containing one of the source operands.
447/// \param __b
448/// A 128-bit vector of [4 x float] containing one of the source operands.
449/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
450/// values between both operands.
451static __inline__ __m128 __DEFAULT_FN_ATTRS
452_mm_or_ps(__m128 __a, __m128 __b)
453{
454 return (__m128)((__v4su)__a | (__v4su)__b);
455}
456
457/// Performs a bitwise exclusive OR of two 128-bit vectors of
458/// [4 x float].
459///
460/// \headerfile <x86intrin.h>
461///
462/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
463///
464/// \param __a
465/// A 128-bit vector of [4 x float] containing one of the source operands.
466/// \param __b
467/// A 128-bit vector of [4 x float] containing one of the source operands.
468/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
469/// of the values between both operands.
470static __inline__ __m128 __DEFAULT_FN_ATTRS
471_mm_xor_ps(__m128 __a, __m128 __b)
472{
473 return (__m128)((__v4su)__a ^ (__v4su)__b);
474}
475
476/// Compares two 32-bit float values in the low-order bits of both
477/// operands for equality and returns the result of the comparison in the
478/// low-order bits of a vector [4 x float].
479///
480/// \headerfile <x86intrin.h>
481///
482/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
483///
484/// \param __a
485/// A 128-bit vector of [4 x float] containing one of the operands. The lower
486/// 32 bits of this operand are used in the comparison.
487/// \param __b
488/// A 128-bit vector of [4 x float] containing one of the operands. The lower
489/// 32 bits of this operand are used in the comparison.
490/// \returns A 128-bit vector of [4 x float] containing the comparison results
491/// in the low-order bits.
492static __inline__ __m128 __DEFAULT_FN_ATTRS
493_mm_cmpeq_ss(__m128 __a, __m128 __b)
494{
495 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
496}
497
498/// Compares each of the corresponding 32-bit float values of the
499/// 128-bit vectors of [4 x float] for equality.
500///
501/// \headerfile <x86intrin.h>
502///
503/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
504///
505/// \param __a
506/// A 128-bit vector of [4 x float].
507/// \param __b
508/// A 128-bit vector of [4 x float].
509/// \returns A 128-bit vector of [4 x float] containing the comparison results.
510static __inline__ __m128 __DEFAULT_FN_ATTRS
511_mm_cmpeq_ps(__m128 __a, __m128 __b)
512{
513 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
514}
515
516/// Compares two 32-bit float values in the low-order bits of both
517/// operands to determine if the value in the first operand is less than the
518/// corresponding value in the second operand and returns the result of the
519/// comparison in the low-order bits of a vector of [4 x float].
520///
521/// \headerfile <x86intrin.h>
522///
523/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
524///
525/// \param __a
526/// A 128-bit vector of [4 x float] containing one of the operands. The lower
527/// 32 bits of this operand are used in the comparison.
528/// \param __b
529/// A 128-bit vector of [4 x float] containing one of the operands. The lower
530/// 32 bits of this operand are used in the comparison.
531/// \returns A 128-bit vector of [4 x float] containing the comparison results
532/// in the low-order bits.
533static __inline__ __m128 __DEFAULT_FN_ATTRS
534_mm_cmplt_ss(__m128 __a, __m128 __b)
535{
536 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
537}
538
539/// Compares each of the corresponding 32-bit float values of the
540/// 128-bit vectors of [4 x float] to determine if the values in the first
541/// operand are less than those in the second operand.
542///
543/// \headerfile <x86intrin.h>
544///
545/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
546///
547/// \param __a
548/// A 128-bit vector of [4 x float].
549/// \param __b
550/// A 128-bit vector of [4 x float].
551/// \returns A 128-bit vector of [4 x float] containing the comparison results.
552static __inline__ __m128 __DEFAULT_FN_ATTRS
553_mm_cmplt_ps(__m128 __a, __m128 __b)
554{
555 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
556}
557
558/// Compares two 32-bit float values in the low-order bits of both
559/// operands to determine if the value in the first operand is less than or
560/// equal to the corresponding value in the second operand and returns the
561/// result of the comparison in the low-order bits of a vector of
562/// [4 x float].
563///
564/// \headerfile <x86intrin.h>
565///
566/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
567///
568/// \param __a
569/// A 128-bit vector of [4 x float] containing one of the operands. The lower
570/// 32 bits of this operand are used in the comparison.
571/// \param __b
572/// A 128-bit vector of [4 x float] containing one of the operands. The lower
573/// 32 bits of this operand are used in the comparison.
574/// \returns A 128-bit vector of [4 x float] containing the comparison results
575/// in the low-order bits.
576static __inline__ __m128 __DEFAULT_FN_ATTRS
577_mm_cmple_ss(__m128 __a, __m128 __b)
578{
579 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
580}
581
582/// Compares each of the corresponding 32-bit float values of the
583/// 128-bit vectors of [4 x float] to determine if the values in the first
584/// operand are less than or equal to those in the second operand.
585///
586/// \headerfile <x86intrin.h>
587///
588/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
589///
590/// \param __a
591/// A 128-bit vector of [4 x float].
592/// \param __b
593/// A 128-bit vector of [4 x float].
594/// \returns A 128-bit vector of [4 x float] containing the comparison results.
595static __inline__ __m128 __DEFAULT_FN_ATTRS
596_mm_cmple_ps(__m128 __a, __m128 __b)
597{
598 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
599}
600
601/// Compares two 32-bit float values in the low-order bits of both
602/// operands to determine if the value in the first operand is greater than
603/// the corresponding value in the second operand and returns the result of
604/// the comparison in the low-order bits of a vector of [4 x float].
605///
606/// \headerfile <x86intrin.h>
607///
608/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
609///
610/// \param __a
611/// A 128-bit vector of [4 x float] containing one of the operands. The lower
612/// 32 bits of this operand are used in the comparison.
613/// \param __b
614/// A 128-bit vector of [4 x float] containing one of the operands. The lower
615/// 32 bits of this operand are used in the comparison.
616/// \returns A 128-bit vector of [4 x float] containing the comparison results
617/// in the low-order bits.
618static __inline__ __m128 __DEFAULT_FN_ATTRS
619_mm_cmpgt_ss(__m128 __a, __m128 __b)
620{
621 return (__m128)__builtin_shufflevector((__v4sf)__a,
622 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
623 4, 1, 2, 3);
624}
625
626/// Compares each of the corresponding 32-bit float values of the
627/// 128-bit vectors of [4 x float] to determine if the values in the first
628/// operand are greater than those in the second operand.
629///
630/// \headerfile <x86intrin.h>
631///
632/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
633///
634/// \param __a
635/// A 128-bit vector of [4 x float].
636/// \param __b
637/// A 128-bit vector of [4 x float].
638/// \returns A 128-bit vector of [4 x float] containing the comparison results.
639static __inline__ __m128 __DEFAULT_FN_ATTRS
640_mm_cmpgt_ps(__m128 __a, __m128 __b)
641{
642 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
643}
644
645/// Compares two 32-bit float values in the low-order bits of both
646/// operands to determine if the value in the first operand is greater than
647/// or equal to the corresponding value in the second operand and returns
648/// the result of the comparison in the low-order bits of a vector of
649/// [4 x float].
650///
651/// \headerfile <x86intrin.h>
652///
653/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
654///
655/// \param __a
656/// A 128-bit vector of [4 x float] containing one of the operands. The lower
657/// 32 bits of this operand are used in the comparison.
658/// \param __b
659/// A 128-bit vector of [4 x float] containing one of the operands. The lower
660/// 32 bits of this operand are used in the comparison.
661/// \returns A 128-bit vector of [4 x float] containing the comparison results
662/// in the low-order bits.
663static __inline__ __m128 __DEFAULT_FN_ATTRS
664_mm_cmpge_ss(__m128 __a, __m128 __b)
665{
666 return (__m128)__builtin_shufflevector((__v4sf)__a,
667 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
668 4, 1, 2, 3);
669}
670
671/// Compares each of the corresponding 32-bit float values of the
672/// 128-bit vectors of [4 x float] to determine if the values in the first
673/// operand are greater than or equal to those in the second operand.
674///
675/// \headerfile <x86intrin.h>
676///
677/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
678///
679/// \param __a
680/// A 128-bit vector of [4 x float].
681/// \param __b
682/// A 128-bit vector of [4 x float].
683/// \returns A 128-bit vector of [4 x float] containing the comparison results.
684static __inline__ __m128 __DEFAULT_FN_ATTRS
685_mm_cmpge_ps(__m128 __a, __m128 __b)
686{
687 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
688}
689
690/// Compares two 32-bit float values in the low-order bits of both
691/// operands for inequality and returns the result of the comparison in the
692/// low-order bits of a vector of [4 x float].
693///
694/// \headerfile <x86intrin.h>
695///
696/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
697/// instructions.
698///
699/// \param __a
700/// A 128-bit vector of [4 x float] containing one of the operands. The lower
701/// 32 bits of this operand are used in the comparison.
702/// \param __b
703/// A 128-bit vector of [4 x float] containing one of the operands. The lower
704/// 32 bits of this operand are used in the comparison.
705/// \returns A 128-bit vector of [4 x float] containing the comparison results
706/// in the low-order bits.
707static __inline__ __m128 __DEFAULT_FN_ATTRS
708_mm_cmpneq_ss(__m128 __a, __m128 __b)
709{
710 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
711}
712
713/// Compares each of the corresponding 32-bit float values of the
714/// 128-bit vectors of [4 x float] for inequality.
715///
716/// \headerfile <x86intrin.h>
717///
718/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
719/// instructions.
720///
721/// \param __a
722/// A 128-bit vector of [4 x float].
723/// \param __b
724/// A 128-bit vector of [4 x float].
725/// \returns A 128-bit vector of [4 x float] containing the comparison results.
726static __inline__ __m128 __DEFAULT_FN_ATTRS
727_mm_cmpneq_ps(__m128 __a, __m128 __b)
728{
729 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
730}
731
732/// Compares two 32-bit float values in the low-order bits of both
733/// operands to determine if the value in the first operand is not less than
734/// the corresponding value in the second operand and returns the result of
735/// the comparison in the low-order bits of a vector of [4 x float].
736///
737/// \headerfile <x86intrin.h>
738///
739/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
740/// instructions.
741///
742/// \param __a
743/// A 128-bit vector of [4 x float] containing one of the operands. The lower
744/// 32 bits of this operand are used in the comparison.
745/// \param __b
746/// A 128-bit vector of [4 x float] containing one of the operands. The lower
747/// 32 bits of this operand are used in the comparison.
748/// \returns A 128-bit vector of [4 x float] containing the comparison results
749/// in the low-order bits.
750static __inline__ __m128 __DEFAULT_FN_ATTRS
751_mm_cmpnlt_ss(__m128 __a, __m128 __b)
752{
753 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
754}
755
756/// Compares each of the corresponding 32-bit float values of the
757/// 128-bit vectors of [4 x float] to determine if the values in the first
758/// operand are not less than those in the second operand.
759///
760/// \headerfile <x86intrin.h>
761///
762/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
763/// instructions.
764///
765/// \param __a
766/// A 128-bit vector of [4 x float].
767/// \param __b
768/// A 128-bit vector of [4 x float].
769/// \returns A 128-bit vector of [4 x float] containing the comparison results.
770static __inline__ __m128 __DEFAULT_FN_ATTRS
771_mm_cmpnlt_ps(__m128 __a, __m128 __b)
772{
773 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
774}
775
776/// Compares two 32-bit float values in the low-order bits of both
777/// operands to determine if the value in the first operand is not less than
778/// or equal to the corresponding value in the second operand and returns
779/// the result of the comparison in the low-order bits of a vector of
780/// [4 x float].
781///
782/// \headerfile <x86intrin.h>
783///
784/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
785/// instructions.
786///
787/// \param __a
788/// A 128-bit vector of [4 x float] containing one of the operands. The lower
789/// 32 bits of this operand are used in the comparison.
790/// \param __b
791/// A 128-bit vector of [4 x float] containing one of the operands. The lower
792/// 32 bits of this operand are used in the comparison.
793/// \returns A 128-bit vector of [4 x float] containing the comparison results
794/// in the low-order bits.
795static __inline__ __m128 __DEFAULT_FN_ATTRS
796_mm_cmpnle_ss(__m128 __a, __m128 __b)
797{
798 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
799}
800
801/// Compares each of the corresponding 32-bit float values of the
802/// 128-bit vectors of [4 x float] to determine if the values in the first
803/// operand are not less than or equal to those in the second operand.
804///
805/// \headerfile <x86intrin.h>
806///
807/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
808/// instructions.
809///
810/// \param __a
811/// A 128-bit vector of [4 x float].
812/// \param __b
813/// A 128-bit vector of [4 x float].
814/// \returns A 128-bit vector of [4 x float] containing the comparison results.
815static __inline__ __m128 __DEFAULT_FN_ATTRS
816_mm_cmpnle_ps(__m128 __a, __m128 __b)
817{
818 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
819}
820
821/// Compares two 32-bit float values in the low-order bits of both
822/// operands to determine if the value in the first operand is not greater
823/// than the corresponding value in the second operand and returns the
824/// result of the comparison in the low-order bits of a vector of
825/// [4 x float].
826///
827/// \headerfile <x86intrin.h>
828///
829/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
830/// instructions.
831///
832/// \param __a
833/// A 128-bit vector of [4 x float] containing one of the operands. The lower
834/// 32 bits of this operand are used in the comparison.
835/// \param __b
836/// A 128-bit vector of [4 x float] containing one of the operands. The lower
837/// 32 bits of this operand are used in the comparison.
838/// \returns A 128-bit vector of [4 x float] containing the comparison results
839/// in the low-order bits.
840static __inline__ __m128 __DEFAULT_FN_ATTRS
841_mm_cmpngt_ss(__m128 __a, __m128 __b)
842{
843 return (__m128)__builtin_shufflevector((__v4sf)__a,
844 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
845 4, 1, 2, 3);
846}
847
848/// Compares each of the corresponding 32-bit float values of the
849/// 128-bit vectors of [4 x float] to determine if the values in the first
850/// operand are not greater than those in the second operand.
851///
852/// \headerfile <x86intrin.h>
853///
854/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
855/// instructions.
856///
857/// \param __a
858/// A 128-bit vector of [4 x float].
859/// \param __b
860/// A 128-bit vector of [4 x float].
861/// \returns A 128-bit vector of [4 x float] containing the comparison results.
862static __inline__ __m128 __DEFAULT_FN_ATTRS
863_mm_cmpngt_ps(__m128 __a, __m128 __b)
864{
865 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
866}
867
868/// Compares two 32-bit float values in the low-order bits of both
869/// operands to determine if the value in the first operand is not greater
870/// than or equal to the corresponding value in the second operand and
871/// returns the result of the comparison in the low-order bits of a vector
872/// of [4 x float].
873///
874/// \headerfile <x86intrin.h>
875///
876/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
877/// instructions.
878///
879/// \param __a
880/// A 128-bit vector of [4 x float] containing one of the operands. The lower
881/// 32 bits of this operand are used in the comparison.
882/// \param __b
883/// A 128-bit vector of [4 x float] containing one of the operands. The lower
884/// 32 bits of this operand are used in the comparison.
885/// \returns A 128-bit vector of [4 x float] containing the comparison results
886/// in the low-order bits.
887static __inline__ __m128 __DEFAULT_FN_ATTRS
888_mm_cmpnge_ss(__m128 __a, __m128 __b)
889{
890 return (__m128)__builtin_shufflevector((__v4sf)__a,
891 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
892 4, 1, 2, 3);
893}
894
895/// Compares each of the corresponding 32-bit float values of the
896/// 128-bit vectors of [4 x float] to determine if the values in the first
897/// operand are not greater than or equal to those in the second operand.
898///
899/// \headerfile <x86intrin.h>
900///
901/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
902/// instructions.
903///
904/// \param __a
905/// A 128-bit vector of [4 x float].
906/// \param __b
907/// A 128-bit vector of [4 x float].
908/// \returns A 128-bit vector of [4 x float] containing the comparison results.
909static __inline__ __m128 __DEFAULT_FN_ATTRS
910_mm_cmpnge_ps(__m128 __a, __m128 __b)
911{
912 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
913}
914
915/// Compares two 32-bit float values in the low-order bits of both
916/// operands to determine if the value in the first operand is ordered with
917/// respect to the corresponding value in the second operand and returns the
918/// result of the comparison in the low-order bits of a vector of
919/// [4 x float].
920///
921/// \headerfile <x86intrin.h>
922///
923/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
924/// instructions.
925///
926/// \param __a
927/// A 128-bit vector of [4 x float] containing one of the operands. The lower
928/// 32 bits of this operand are used in the comparison.
929/// \param __b
930/// A 128-bit vector of [4 x float] containing one of the operands. The lower
931/// 32 bits of this operand are used in the comparison.
932/// \returns A 128-bit vector of [4 x float] containing the comparison results
933/// in the low-order bits.
934static __inline__ __m128 __DEFAULT_FN_ATTRS
935_mm_cmpord_ss(__m128 __a, __m128 __b)
936{
937 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
938}
939
940/// Compares each of the corresponding 32-bit float values of the
941/// 128-bit vectors of [4 x float] to determine if the values in the first
942/// operand are ordered with respect to those in the second operand.
943///
944/// \headerfile <x86intrin.h>
945///
946/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
947/// instructions.
948///
949/// \param __a
950/// A 128-bit vector of [4 x float].
951/// \param __b
952/// A 128-bit vector of [4 x float].
953/// \returns A 128-bit vector of [4 x float] containing the comparison results.
954static __inline__ __m128 __DEFAULT_FN_ATTRS
955_mm_cmpord_ps(__m128 __a, __m128 __b)
956{
957 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
958}
959
960/// Compares two 32-bit float values in the low-order bits of both
961/// operands to determine if the value in the first operand is unordered
962/// with respect to the corresponding value in the second operand and
963/// returns the result of the comparison in the low-order bits of a vector
964/// of [4 x float].
965///
966/// \headerfile <x86intrin.h>
967///
968/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
969/// instructions.
970///
971/// \param __a
972/// A 128-bit vector of [4 x float] containing one of the operands. The lower
973/// 32 bits of this operand are used in the comparison.
974/// \param __b
975/// A 128-bit vector of [4 x float] containing one of the operands. The lower
976/// 32 bits of this operand are used in the comparison.
977/// \returns A 128-bit vector of [4 x float] containing the comparison results
978/// in the low-order bits.
979static __inline__ __m128 __DEFAULT_FN_ATTRS
980_mm_cmpunord_ss(__m128 __a, __m128 __b)
981{
982 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
983}
984
985/// Compares each of the corresponding 32-bit float values of the
986/// 128-bit vectors of [4 x float] to determine if the values in the first
987/// operand are unordered with respect to those in the second operand.
988///
989/// \headerfile <x86intrin.h>
990///
991/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
992/// instructions.
993///
994/// \param __a
995/// A 128-bit vector of [4 x float].
996/// \param __b
997/// A 128-bit vector of [4 x float].
998/// \returns A 128-bit vector of [4 x float] containing the comparison results.
999static __inline__ __m128 __DEFAULT_FN_ATTRS
1000_mm_cmpunord_ps(__m128 __a, __m128 __b)
1001{
1002 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1003}
1004
1005/// Compares two 32-bit float values in the low-order bits of both
1006/// operands for equality and returns the result of the comparison.
1007///
1008/// If either of the two lower 32-bit values is NaN, 0 is returned.
1009///
1010/// \headerfile <x86intrin.h>
1011///
1012/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1013/// instructions.
1014///
1015/// \param __a
1016/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1017/// used in the comparison.
1018/// \param __b
1019/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1020/// used in the comparison.
1021/// \returns An integer containing the comparison results. If either of the
1022/// two lower 32-bit values is NaN, 0 is returned.
1023static __inline__ int __DEFAULT_FN_ATTRS
1024_mm_comieq_ss(__m128 __a, __m128 __b)
1025{
1026 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1027}
1028
1029/// Compares two 32-bit float values in the low-order bits of both
1030/// operands to determine if the first operand is less than the second
1031/// operand and returns the result of the comparison.
1032///
1033/// If either of the two lower 32-bit values is NaN, 0 is returned.
1034///
1035/// \headerfile <x86intrin.h>
1036///
1037/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1038/// instructions.
1039///
1040/// \param __a
1041/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1042/// used in the comparison.
1043/// \param __b
1044/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1045/// used in the comparison.
1046/// \returns An integer containing the comparison results. If either of the two
1047/// lower 32-bit values is NaN, 0 is returned.
1048static __inline__ int __DEFAULT_FN_ATTRS
1049_mm_comilt_ss(__m128 __a, __m128 __b)
1050{
1051 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1052}
1053
1054/// Compares two 32-bit float values in the low-order bits of both
1055/// operands to determine if the first operand is less than or equal to the
1056/// second operand and returns the result of the comparison.
1057///
1058/// If either of the two lower 32-bit values is NaN, 0 is returned.
1059///
1060/// \headerfile <x86intrin.h>
1061///
1062/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1063///
1064/// \param __a
1065/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1066/// used in the comparison.
1067/// \param __b
1068/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1069/// used in the comparison.
1070/// \returns An integer containing the comparison results. If either of the two
1071/// lower 32-bit values is NaN, 0 is returned.
1072static __inline__ int __DEFAULT_FN_ATTRS
1073_mm_comile_ss(__m128 __a, __m128 __b)
1074{
1075 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1076}
1077
1078/// Compares two 32-bit float values in the low-order bits of both
1079/// operands to determine if the first operand is greater than the second
1080/// operand and returns the result of the comparison.
1081///
1082/// If either of the two lower 32-bit values is NaN, 0 is returned.
1083///
1084/// \headerfile <x86intrin.h>
1085///
1086/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1087///
1088/// \param __a
1089/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1090/// used in the comparison.
1091/// \param __b
1092/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093/// used in the comparison.
1094/// \returns An integer containing the comparison results. If either of the
1095/// two lower 32-bit values is NaN, 0 is returned.
1096static __inline__ int __DEFAULT_FN_ATTRS
1097_mm_comigt_ss(__m128 __a, __m128 __b)
1098{
1099 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1100}
1101
1102/// Compares two 32-bit float values in the low-order bits of both
1103/// operands to determine if the first operand is greater than or equal to
1104/// the second operand and returns the result of the comparison.
1105///
1106/// If either of the two lower 32-bit values is NaN, 0 is returned.
1107///
1108/// \headerfile <x86intrin.h>
1109///
1110/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1111///
1112/// \param __a
1113/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1114/// used in the comparison.
1115/// \param __b
1116/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1117/// used in the comparison.
1118/// \returns An integer containing the comparison results. If either of the two
1119/// lower 32-bit values is NaN, 0 is returned.
1120static __inline__ int __DEFAULT_FN_ATTRS
1121_mm_comige_ss(__m128 __a, __m128 __b)
1122{
1123 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1124}
1125
1126/// Compares two 32-bit float values in the low-order bits of both
1127/// operands to determine if the first operand is not equal to the second
1128/// operand and returns the result of the comparison.
1129///
1130/// If either of the two lower 32-bit values is NaN, 1 is returned.
1131///
1132/// \headerfile <x86intrin.h>
1133///
1134/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1135///
1136/// \param __a
1137/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1138/// used in the comparison.
1139/// \param __b
1140/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1141/// used in the comparison.
1142/// \returns An integer containing the comparison results. If either of the
1143/// two lower 32-bit values is NaN, 1 is returned.
1144static __inline__ int __DEFAULT_FN_ATTRS
1145_mm_comineq_ss(__m128 __a, __m128 __b)
1146{
1147 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1148}
1149
1150/// Performs an unordered comparison of two 32-bit float values using
1151/// the low-order bits of both operands to determine equality and returns
1152/// the result of the comparison.
1153///
1154/// If either of the two lower 32-bit values is NaN, 0 is returned.
1155///
1156/// \headerfile <x86intrin.h>
1157///
1158/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1159///
1160/// \param __a
1161/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1162/// used in the comparison.
1163/// \param __b
1164/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1165/// used in the comparison.
1166/// \returns An integer containing the comparison results. If either of the two
1167/// lower 32-bit values is NaN, 0 is returned.
1168static __inline__ int __DEFAULT_FN_ATTRS
1169_mm_ucomieq_ss(__m128 __a, __m128 __b)
1170{
1171 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1172}
1173
1174/// Performs an unordered comparison of two 32-bit float values using
1175/// the low-order bits of both operands to determine if the first operand is
1176/// less than the second operand and returns the result of the comparison.
1177///
1178/// If either of the two lower 32-bit values is NaN, 0 is returned.
1179///
1180/// \headerfile <x86intrin.h>
1181///
1182/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1183///
1184/// \param __a
1185/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1186/// used in the comparison.
1187/// \param __b
1188/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1189/// used in the comparison.
1190/// \returns An integer containing the comparison results. If either of the two
1191/// lower 32-bit values is NaN, 0 is returned.
1192static __inline__ int __DEFAULT_FN_ATTRS
1193_mm_ucomilt_ss(__m128 __a, __m128 __b)
1194{
1195 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1196}
1197
1198/// Performs an unordered comparison of two 32-bit float values using
1199/// the low-order bits of both operands to determine if the first operand is
1200/// less than or equal to the second operand and returns the result of the
1201/// comparison.
1202///
1203/// If either of the two lower 32-bit values is NaN, 0 is returned.
1204///
1205/// \headerfile <x86intrin.h>
1206///
1207/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1208///
1209/// \param __a
1210/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1211/// used in the comparison.
1212/// \param __b
1213/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214/// used in the comparison.
1215/// \returns An integer containing the comparison results. If either of the two
1216/// lower 32-bit values is NaN, 0 is returned.
1217static __inline__ int __DEFAULT_FN_ATTRS
1218_mm_ucomile_ss(__m128 __a, __m128 __b)
1219{
1220 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1221}
1222
1223/// Performs an unordered comparison of two 32-bit float values using
1224/// the low-order bits of both operands to determine if the first operand is
1225/// greater than the second operand and returns the result of the
1226/// comparison.
1227///
1228/// If either of the two lower 32-bit values is NaN, 0 is returned.
1229///
1230/// \headerfile <x86intrin.h>
1231///
1232/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1233///
1234/// \param __a
1235/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1236/// used in the comparison.
1237/// \param __b
1238/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1239/// used in the comparison.
1240/// \returns An integer containing the comparison results. If either of the two
1241/// lower 32-bit values is NaN, 0 is returned.
1242static __inline__ int __DEFAULT_FN_ATTRS
1243_mm_ucomigt_ss(__m128 __a, __m128 __b)
1244{
1245 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1246}
1247
1248/// Performs an unordered comparison of two 32-bit float values using
1249/// the low-order bits of both operands to determine if the first operand is
1250/// greater than or equal to the second operand and returns the result of
1251/// the comparison.
1252///
1253/// If either of the two lower 32-bit values is NaN, 0 is returned.
1254///
1255/// \headerfile <x86intrin.h>
1256///
1257/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1258///
1259/// \param __a
1260/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261/// used in the comparison.
1262/// \param __b
1263/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1264/// used in the comparison.
1265/// \returns An integer containing the comparison results. If either of the two
1266/// lower 32-bit values is NaN, 0 is returned.
1267static __inline__ int __DEFAULT_FN_ATTRS
1268_mm_ucomige_ss(__m128 __a, __m128 __b)
1269{
1270 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1271}
1272
1273/// Performs an unordered comparison of two 32-bit float values using
1274/// the low-order bits of both operands to determine inequality and returns
1275/// the result of the comparison.
1276///
1277/// If either of the two lower 32-bit values is NaN, 1 is returned.
1278///
1279/// \headerfile <x86intrin.h>
1280///
1281/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1282///
1283/// \param __a
1284/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285/// used in the comparison.
1286/// \param __b
1287/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1288/// used in the comparison.
1289/// \returns An integer containing the comparison results. If either of the two
1290/// lower 32-bit values is NaN, 1 is returned.
1291static __inline__ int __DEFAULT_FN_ATTRS
1292_mm_ucomineq_ss(__m128 __a, __m128 __b)
1293{
1294 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1295}
1296
1297/// Converts a float value contained in the lower 32 bits of a vector of
1298/// [4 x float] into a 32-bit integer.
1299///
1300/// \headerfile <x86intrin.h>
1301///
1302/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1303/// instructions.
1304///
1305/// \param __a
1306/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1307/// used in the conversion.
1308/// \returns A 32-bit integer containing the converted value.
1309static __inline__ int __DEFAULT_FN_ATTRS
1310_mm_cvtss_si32(__m128 __a)
1311{
1312 return __builtin_ia32_cvtss2si((__v4sf)__a);
1313}
1314
1315/// Converts a float value contained in the lower 32 bits of a vector of
1316/// [4 x float] into a 32-bit integer.
1317///
1318/// \headerfile <x86intrin.h>
1319///
1320/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1321/// instructions.
1322///
1323/// \param __a
1324/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1325/// used in the conversion.
1326/// \returns A 32-bit integer containing the converted value.
1327static __inline__ int __DEFAULT_FN_ATTRS
1328_mm_cvt_ss2si(__m128 __a)
1329{
1330 return _mm_cvtss_si32(__a);
1331}
1332
1333#ifdef __x86_64__
1334
1335/// Converts a float value contained in the lower 32 bits of a vector of
1336/// [4 x float] into a 64-bit integer.
1337///
1338/// \headerfile <x86intrin.h>
1339///
1340/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1341/// instructions.
1342///
1343/// \param __a
1344/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1345/// used in the conversion.
1346/// \returns A 64-bit integer containing the converted value.
1347static __inline__ long long __DEFAULT_FN_ATTRS
1348_mm_cvtss_si64(__m128 __a)
1349{
1350 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1351}
1352
1353#endif
1354
1355/// Converts two low-order float values in a 128-bit vector of
1356/// [4 x float] into a 64-bit vector of [2 x i32].
1357///
1358/// \headerfile <x86intrin.h>
1359///
1360/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1361///
1362/// \param __a
1363/// A 128-bit vector of [4 x float].
1364/// \returns A 64-bit integer vector containing the converted values.
1365static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1366_mm_cvtps_pi32(__m128 __a)
1367{
1368 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1369}
1370
1371/// Converts two low-order float values in a 128-bit vector of
1372/// [4 x float] into a 64-bit vector of [2 x i32].
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1377///
1378/// \param __a
1379/// A 128-bit vector of [4 x float].
1380/// \returns A 64-bit integer vector containing the converted values.
1381static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1382_mm_cvt_ps2pi(__m128 __a)
1383{
1384 return _mm_cvtps_pi32(__a);
1385}
1386
1387/// Converts a float value contained in the lower 32 bits of a vector of
1388/// [4 x float] into a 32-bit integer, truncating the result when it is
1389/// inexact.
1390///
1391/// \headerfile <x86intrin.h>
1392///
1393/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1394/// instructions.
1395///
1396/// \param __a
1397/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1398/// used in the conversion.
1399/// \returns A 32-bit integer containing the converted value.
1400static __inline__ int __DEFAULT_FN_ATTRS
1401_mm_cvttss_si32(__m128 __a)
1402{
1403 return __builtin_ia32_cvttss2si((__v4sf)__a);
1404}
1405
1406/// Converts a float value contained in the lower 32 bits of a vector of
1407/// [4 x float] into a 32-bit integer, truncating the result when it is
1408/// inexact.
1409///
1410/// \headerfile <x86intrin.h>
1411///
1412/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1413/// instructions.
1414///
1415/// \param __a
1416/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1417/// used in the conversion.
1418/// \returns A 32-bit integer containing the converted value.
1419static __inline__ int __DEFAULT_FN_ATTRS
1420_mm_cvtt_ss2si(__m128 __a)
1421{
1422 return _mm_cvttss_si32(__a);
1423}
1424
1425#ifdef __x86_64__
1426/// Converts a float value contained in the lower 32 bits of a vector of
1427/// [4 x float] into a 64-bit integer, truncating the result when it is
1428/// inexact.
1429///
1430/// \headerfile <x86intrin.h>
1431///
1432/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1433/// instructions.
1434///
1435/// \param __a
1436/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1437/// used in the conversion.
1438/// \returns A 64-bit integer containing the converted value.
1439static __inline__ long long __DEFAULT_FN_ATTRS
1440_mm_cvttss_si64(__m128 __a)
1441{
1442 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1443}
1444#endif
1445
1446/// Converts two low-order float values in a 128-bit vector of
1447/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1448/// when it is inexact.
1449///
1450/// \headerfile <x86intrin.h>
1451///
1452/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1453/// instructions.
1454///
1455/// \param __a
1456/// A 128-bit vector of [4 x float].
1457/// \returns A 64-bit integer vector containing the converted values.
1458static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1459_mm_cvttps_pi32(__m128 __a)
1460{
1461 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1462}
1463
1464/// Converts two low-order float values in a 128-bit vector of [4 x
1465/// float] into a 64-bit vector of [2 x i32], truncating the result when it
1466/// is inexact.
1467///
1468/// \headerfile <x86intrin.h>
1469///
1470/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1471///
1472/// \param __a
1473/// A 128-bit vector of [4 x float].
1474/// \returns A 64-bit integer vector containing the converted values.
1475static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1476_mm_cvtt_ps2pi(__m128 __a)
1477{
1478 return _mm_cvttps_pi32(__a);
1479}
1480
1481/// Converts a 32-bit signed integer value into a floating point value
1482/// and writes it to the lower 32 bits of the destination. The remaining
1483/// higher order elements of the destination vector are copied from the
1484/// corresponding elements in the first operand.
1485///
1486/// \headerfile <x86intrin.h>
1487///
1488/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1489///
1490/// \param __a
1491/// A 128-bit vector of [4 x float].
1492/// \param __b
1493/// A 32-bit signed integer operand containing the value to be converted.
1494/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1495/// converted value of the second operand. The upper 96 bits are copied from
1496/// the upper 96 bits of the first operand.
1497static __inline__ __m128 __DEFAULT_FN_ATTRS
1498_mm_cvtsi32_ss(__m128 __a, int __b)
1499{
1500 __a[0] = __b;
1501 return __a;
1502}
1503
1504/// Converts a 32-bit signed integer value into a floating point value
1505/// and writes it to the lower 32 bits of the destination. The remaining
1506/// higher order elements of the destination are copied from the
1507/// corresponding elements in the first operand.
1508///
1509/// \headerfile <x86intrin.h>
1510///
1511/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1512///
1513/// \param __a
1514/// A 128-bit vector of [4 x float].
1515/// \param __b
1516/// A 32-bit signed integer operand containing the value to be converted.
1517/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1518/// converted value of the second operand. The upper 96 bits are copied from
1519/// the upper 96 bits of the first operand.
1520static __inline__ __m128 __DEFAULT_FN_ATTRS
1521_mm_cvt_si2ss(__m128 __a, int __b)
1522{
1523 return _mm_cvtsi32_ss(__a, __b);
1524}
1525
1526#ifdef __x86_64__
1527
1528/// Converts a 64-bit signed integer value into a floating point value
1529/// and writes it to the lower 32 bits of the destination. The remaining
1530/// higher order elements of the destination are copied from the
1531/// corresponding elements in the first operand.
1532///
1533/// \headerfile <x86intrin.h>
1534///
1535/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1536///
1537/// \param __a
1538/// A 128-bit vector of [4 x float].
1539/// \param __b
1540/// A 64-bit signed integer operand containing the value to be converted.
1541/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1542/// converted value of the second operand. The upper 96 bits are copied from
1543/// the upper 96 bits of the first operand.
1544static __inline__ __m128 __DEFAULT_FN_ATTRS
1545_mm_cvtsi64_ss(__m128 __a, long long __b)
1546{
1547 __a[0] = __b;
1548 return __a;
1549}
1550
1551#endif
1552
1553/// Converts two elements of a 64-bit vector of [2 x i32] into two
1554/// floating point values and writes them to the lower 64-bits of the
1555/// destination. The remaining higher order elements of the destination are
1556/// copied from the corresponding elements in the first operand.
1557///
1558/// \headerfile <x86intrin.h>
1559///
1560/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1561///
1562/// \param __a
1563/// A 128-bit vector of [4 x float].
1564/// \param __b
1565/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1566/// and written to the corresponding low-order elements in the destination.
1567/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1568/// converted value of the second operand. The upper 64 bits are copied from
1569/// the upper 64 bits of the first operand.
1570static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1571_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1572{
1573 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1574}
1575
1576/// Converts two elements of a 64-bit vector of [2 x i32] into two
1577/// floating point values and writes them to the lower 64-bits of the
1578/// destination. The remaining higher order elements of the destination are
1579/// copied from the corresponding elements in the first operand.
1580///
1581/// \headerfile <x86intrin.h>
1582///
1583/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1584///
1585/// \param __a
1586/// A 128-bit vector of [4 x float].
1587/// \param __b
1588/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1589/// and written to the corresponding low-order elements in the destination.
1590/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1591/// converted value from the second operand. The upper 64 bits are copied
1592/// from the upper 64 bits of the first operand.
1593static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1594_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1595{
1596 return _mm_cvtpi32_ps(__a, __b);
1597}
1598
1599/// Extracts a float value contained in the lower 32 bits of a vector of
1600/// [4 x float].
1601///
1602/// \headerfile <x86intrin.h>
1603///
1604/// This intrinsic has no corresponding instruction.
1605///
1606/// \param __a
1607/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1608/// used in the extraction.
1609/// \returns A 32-bit float containing the extracted value.
1610static __inline__ float __DEFAULT_FN_ATTRS
1611_mm_cvtss_f32(__m128 __a)
1612{
1613 return __a[0];
1614}
1615
1616/// Loads two packed float values from the address \a __p into the
1617/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1618/// are copied from the low-order bits of the first operand.
1619///
1620/// \headerfile <x86intrin.h>
1621///
1622/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1623///
1624/// \param __a
1625/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1626/// of the destination.
1627/// \param __p
1628/// A pointer to two packed float values. Bits [63:0] are written to bits
1629/// [127:64] of the destination.
1630/// \returns A 128-bit vector of [4 x float] containing the moved values.
1631static __inline__ __m128 __DEFAULT_FN_ATTRS
1632_mm_loadh_pi(__m128 __a, const __m64 *__p)
1633{
1634 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1635 struct __mm_loadh_pi_struct {
1636 __mm_loadh_pi_v2f32 __u;
1637 } __attribute__((__packed__, __may_alias__));
1638 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1639 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1640 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1641}
1642
1643/// Loads two packed float values from the address \a __p into the
1644/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1645/// are copied from the high-order bits of the first operand.
1646///
1647/// \headerfile <x86intrin.h>
1648///
1649/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1650///
1651/// \param __a
1652/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1653/// [127:64] of the destination.
1654/// \param __p
1655/// A pointer to two packed float values. Bits [63:0] are written to bits
1656/// [63:0] of the destination.
1657/// \returns A 128-bit vector of [4 x float] containing the moved values.
1658static __inline__ __m128 __DEFAULT_FN_ATTRS
1659_mm_loadl_pi(__m128 __a, const __m64 *__p)
1660{
1661 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1662 struct __mm_loadl_pi_struct {
1663 __mm_loadl_pi_v2f32 __u;
1664 } __attribute__((__packed__, __may_alias__));
1665 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1666 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1667 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1668}
1669
1670/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1671/// 32 bits of the vector are initialized with the single-precision
1672/// floating-point value loaded from a specified memory location. The upper
1673/// 96 bits are set to zero.
1674///
1675/// \headerfile <x86intrin.h>
1676///
1677/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1678///
1679/// \param __p
1680/// A pointer to a 32-bit memory location containing a single-precision
1681/// floating-point value.
1682/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1683/// lower 32 bits contain the value loaded from the memory location. The
1684/// upper 96 bits are set to zero.
1685static __inline__ __m128 __DEFAULT_FN_ATTRS
1686_mm_load_ss(const float *__p)
1687{
1688 struct __mm_load_ss_struct {
1689 float __u;
1690 } __attribute__((__packed__, __may_alias__));
1691 float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1692 return __extension__ (__m128){ __u, 0, 0, 0 };
1693}
1694
1695/// Loads a 32-bit float value and duplicates it to all four vector
1696/// elements of a 128-bit vector of [4 x float].
1697///
1698/// \headerfile <x86intrin.h>
1699///
1700/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1701/// instruction.
1702///
1703/// \param __p
1704/// A pointer to a float value to be loaded and duplicated.
1705/// \returns A 128-bit vector of [4 x float] containing the loaded and
1706/// duplicated values.
1707static __inline__ __m128 __DEFAULT_FN_ATTRS
1708_mm_load1_ps(const float *__p)
1709{
1710 struct __mm_load1_ps_struct {
1711 float __u;
1712 } __attribute__((__packed__, __may_alias__));
1713 float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1714 return __extension__ (__m128){ __u, __u, __u, __u };
1715}
1716
/* Alternate spelling of _mm_load1_ps(); expands to a call to it. */
#define _mm_load_ps1(p) (_mm_load1_ps(p))
1718
1719/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1720/// memory location.
1721///
1722/// \headerfile <x86intrin.h>
1723///
1724/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1725///
1726/// \param __p
1727/// A pointer to a 128-bit memory location. The address of the memory
1728/// location has to be 128-bit aligned.
1729/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1730static __inline__ __m128 __DEFAULT_FN_ATTRS
1731_mm_load_ps(const float *__p)
1732{
1733 return *(const __m128*)__p;
1734}
1735
1736/// Loads a 128-bit floating-point vector of [4 x float] from an
1737/// unaligned memory location.
1738///
1739/// \headerfile <x86intrin.h>
1740///
1741/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1742///
1743/// \param __p
1744/// A pointer to a 128-bit memory location. The address of the memory
1745/// location does not have to be aligned.
1746/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1747static __inline__ __m128 __DEFAULT_FN_ATTRS
1748_mm_loadu_ps(const float *__p)
1749{
1750 struct __loadu_ps {
1751 __m128_u __v;
1752 } __attribute__((__packed__, __may_alias__));
1753 return ((const struct __loadu_ps*)__p)->__v;
1754}
1755
1756/// Loads four packed float values, in reverse order, from an aligned
1757/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1758///
1759/// \headerfile <x86intrin.h>
1760///
1761/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1762/// instruction.
1763///
1764/// \param __p
1765/// A pointer to a 128-bit memory location. The address of the memory
1766/// location has to be 128-bit aligned.
1767/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1768/// in reverse order.
1769static __inline__ __m128 __DEFAULT_FN_ATTRS
1770_mm_loadr_ps(const float *__p)
1771{
1772 __m128 __a = _mm_load_ps(__p);
1773 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1774}
1775
1776/// Create a 128-bit vector of [4 x float] with undefined values.
1777///
1778/// \headerfile <x86intrin.h>
1779///
1780/// This intrinsic has no corresponding instruction.
1781///
1782/// \returns A 128-bit vector of [4 x float] containing undefined values.
1783static __inline__ __m128 __DEFAULT_FN_ATTRS
1784_mm_undefined_ps(void)
1785{
1786 return (__m128)__builtin_ia32_undef128();
1787}
1788
1789/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1790/// 32 bits of the vector are initialized with the specified single-precision
1791/// floating-point value. The upper 96 bits are set to zero.
1792///
1793/// \headerfile <x86intrin.h>
1794///
1795/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1796///
1797/// \param __w
1798/// A single-precision floating-point value used to initialize the lower 32
1799/// bits of the result.
1800/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1801/// lower 32 bits contain the value provided in the source operand. The
1802/// upper 96 bits are set to zero.
1803static __inline__ __m128 __DEFAULT_FN_ATTRS
1804_mm_set_ss(float __w)
1805{
1806 return __extension__ (__m128){ __w, 0, 0, 0 };
1807}
1808
1809/// Constructs a 128-bit floating-point vector of [4 x float], with each
1810/// of the four single-precision floating-point vector elements set to the
1811/// specified single-precision floating-point value.
1812///
1813/// \headerfile <x86intrin.h>
1814///
1815/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1816///
1817/// \param __w
1818/// A single-precision floating-point value used to initialize each vector
1819/// element of the result.
1820/// \returns An initialized 128-bit floating-point vector of [4 x float].
1821static __inline__ __m128 __DEFAULT_FN_ATTRS
1822_mm_set1_ps(float __w)
1823{
1824 return __extension__ (__m128){ __w, __w, __w, __w };
1825}
1826
1827/* Microsoft specific. */
1828/// Constructs a 128-bit floating-point vector of [4 x float], with each
1829/// of the four single-precision floating-point vector elements set to the
1830/// specified single-precision floating-point value.
1831///
1832/// \headerfile <x86intrin.h>
1833///
1834/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1835///
1836/// \param __w
1837/// A single-precision floating-point value used to initialize each vector
1838/// element of the result.
1839/// \returns An initialized 128-bit floating-point vector of [4 x float].
1840static __inline__ __m128 __DEFAULT_FN_ATTRS
1841_mm_set_ps1(float __w)
1842{
1843 return _mm_set1_ps(__w);
1844}
1845
1846/// Constructs a 128-bit floating-point vector of [4 x float]
1847/// initialized with the specified single-precision floating-point values.
1848///
1849/// \headerfile <x86intrin.h>
1850///
1851/// This intrinsic is a utility function and does not correspond to a specific
1852/// instruction.
1853///
1854/// \param __z
1855/// A single-precision floating-point value used to initialize bits [127:96]
1856/// of the result.
1857/// \param __y
1858/// A single-precision floating-point value used to initialize bits [95:64]
1859/// of the result.
1860/// \param __x
1861/// A single-precision floating-point value used to initialize bits [63:32]
1862/// of the result.
1863/// \param __w
1864/// A single-precision floating-point value used to initialize bits [31:0]
1865/// of the result.
1866/// \returns An initialized 128-bit floating-point vector of [4 x float].
1867static __inline__ __m128 __DEFAULT_FN_ATTRS
1868_mm_set_ps(float __z, float __y, float __x, float __w)
1869{
1870 return __extension__ (__m128){ __w, __x, __y, __z };
1871}
1872
1873/// Constructs a 128-bit floating-point vector of [4 x float],
1874/// initialized in reverse order with the specified 32-bit single-precision
1875/// float-point values.
1876///
1877/// \headerfile <x86intrin.h>
1878///
1879/// This intrinsic is a utility function and does not correspond to a specific
1880/// instruction.
1881///
1882/// \param __z
1883/// A single-precision floating-point value used to initialize bits [31:0]
1884/// of the result.
1885/// \param __y
1886/// A single-precision floating-point value used to initialize bits [63:32]
1887/// of the result.
1888/// \param __x
1889/// A single-precision floating-point value used to initialize bits [95:64]
1890/// of the result.
1891/// \param __w
1892/// A single-precision floating-point value used to initialize bits [127:96]
1893/// of the result.
1894/// \returns An initialized 128-bit floating-point vector of [4 x float].
1895static __inline__ __m128 __DEFAULT_FN_ATTRS
1896_mm_setr_ps(float __z, float __y, float __x, float __w)
1897{
1898 return __extension__ (__m128){ __z, __y, __x, __w };
1899}
1900
1901/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1902/// to zero.
1903///
1904/// \headerfile <x86intrin.h>
1905///
1906/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1907///
1908/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1909/// all elements set to zero.
1910static __inline__ __m128 __DEFAULT_FN_ATTRS
1911_mm_setzero_ps(void)
1912{
1913 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
1914}
1915
1916/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1917/// memory location.
1918///
1919/// \headerfile <x86intrin.h>
1920///
1921/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1922///
1923/// \param __p
1924/// A pointer to a 64-bit memory location.
1925/// \param __a
1926/// A 128-bit vector of [4 x float] containing the values to be stored.
1927static __inline__ void __DEFAULT_FN_ATTRS
1928_mm_storeh_pi(__m64 *__p, __m128 __a)
1929{
1930 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1931 struct __mm_storeh_pi_struct {
1932 __mm_storeh_pi_v2f32 __u;
1933 } __attribute__((__packed__, __may_alias__));
1934 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1935}
1936
1937/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1938/// memory location.
1939///
1940/// \headerfile <x86intrin.h>
1941///
1942/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1943///
1944/// \param __p
1945/// A pointer to a memory location that will receive the float values.
1946/// \param __a
1947/// A 128-bit vector of [4 x float] containing the values to be stored.
1948static __inline__ void __DEFAULT_FN_ATTRS
1949_mm_storel_pi(__m64 *__p, __m128 __a)
1950{
1951 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1952 struct __mm_storeh_pi_struct {
1953 __mm_storeh_pi_v2f32 __u;
1954 } __attribute__((__packed__, __may_alias__));
1955 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1956}
1957
1958/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1959/// memory location.
1960///
1961/// \headerfile <x86intrin.h>
1962///
1963/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1964///
1965/// \param __p
1966/// A pointer to a 32-bit memory location.
1967/// \param __a
1968/// A 128-bit vector of [4 x float] containing the value to be stored.
1969static __inline__ void __DEFAULT_FN_ATTRS
1970_mm_store_ss(float *__p, __m128 __a)
1971{
1972 struct __mm_store_ss_struct {
1973 float __u;
1974 } __attribute__((__packed__, __may_alias__));
1975 ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1976}
1977
1978/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1979/// location.
1980///
1981/// \headerfile <x86intrin.h>
1982///
1983/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1984///
1985/// \param __p
1986/// A pointer to a 128-bit memory location. The address of the memory
1987/// location does not have to be aligned.
1988/// \param __a
1989/// A 128-bit vector of [4 x float] containing the values to be stored.
1990static __inline__ void __DEFAULT_FN_ATTRS
1991_mm_storeu_ps(float *__p, __m128 __a)
1992{
1993 struct __storeu_ps {
1994 __m128_u __v;
1995 } __attribute__((__packed__, __may_alias__));
1996 ((struct __storeu_ps*)__p)->__v = __a;
1997}
1998
1999/// Stores a 128-bit vector of [4 x float] into an aligned memory
2000/// location.
2001///
2002/// \headerfile <x86intrin.h>
2003///
2004/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2005///
2006/// \param __p
2007/// A pointer to a 128-bit memory location. The address of the memory
2008/// location has to be 16-byte aligned.
2009/// \param __a
2010/// A 128-bit vector of [4 x float] containing the values to be stored.
2011static __inline__ void __DEFAULT_FN_ATTRS
2012_mm_store_ps(float *__p, __m128 __a)
2013{
2014 *(__m128*)__p = __a;
2015}
2016
2017/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2018/// four contiguous elements in an aligned memory location.
2019///
2020/// \headerfile <x86intrin.h>
2021///
2022/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2023/// instruction.
2024///
2025/// \param __p
2026/// A pointer to a 128-bit memory location.
2027/// \param __a
2028/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2029/// of the four contiguous elements pointed by \a __p.
2030static __inline__ void __DEFAULT_FN_ATTRS
2031_mm_store1_ps(float *__p, __m128 __a)
2032{
2033 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2034 _mm_store_ps(__p, __a);
2035}
2036
2037/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2038/// four contiguous elements in an aligned memory location.
2039///
2040/// \headerfile <x86intrin.h>
2041///
2042/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2043/// instruction.
2044///
2045/// \param __p
2046/// A pointer to a 128-bit memory location.
2047/// \param __a
2048/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2049/// of the four contiguous elements pointed by \a __p.
2050static __inline__ void __DEFAULT_FN_ATTRS
2051_mm_store_ps1(float *__p, __m128 __a)
2052{
2053 _mm_store1_ps(__p, __a);
2054}
2055
2056/// Stores float values from a 128-bit vector of [4 x float] to an
2057/// aligned memory location in reverse order.
2058///
2059/// \headerfile <x86intrin.h>
2060///
2061/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2062/// instruction.
2063///
2064/// \param __p
2065/// A pointer to a 128-bit memory location. The address of the memory
2066/// location has to be 128-bit aligned.
2067/// \param __a
2068/// A 128-bit vector of [4 x float] containing the values to be stored.
2069static __inline__ void __DEFAULT_FN_ATTRS
2070_mm_storer_ps(float *__p, __m128 __a)
2071{
2072 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2073 _mm_store_ps(__p, __a);
2074}
2075
/* Hint selectors for _mm_prefetch(). As the _mm_prefetch() macro in this
 * header decodes them, bit 2 is passed as the read/write argument of
 * __builtin_prefetch and bits [1:0] as its locality argument. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2082
#ifndef _MSC_VER
/* FIXME: This has to be a macro because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Requests that the cache line containing the given address be brought
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c>, <c> PREFETCHT0 </c>,
///    <c> PREFETCHT1 </c> or <c> PREFETCHT2 </c> instruction, depending on
///    \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant selecting the kind of prefetch: \n
///    _MM_HINT_NTA: Use the non-temporal access (NTA) hint; the PREFETCHNTA
///    instruction will be generated. \n
///    _MM_HINT_T0: Use the T0 hint; the PREFETCHT0 instruction will be
///    generated. \n
///    _MM_HINT_T1: Use the T1 hint; the PREFETCHT1 instruction will be
///    generated. \n
///    _MM_HINT_T2: Use the T2 hint; the PREFETCHT2 instruction will be
///    generated. \n
///    Bit 2 of \a sel is forwarded as the read/write argument of
///    __builtin_prefetch and bits [1:0] as its locality argument.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), (((sel) >> 2) & 1), ((sel) & 0x3)))
#endif
2114
2115/// Stores a 64-bit integer in the specified aligned memory location. To
2116/// minimize caching, the data is flagged as non-temporal (unlikely to be
2117/// used again soon).
2118///
2119/// \headerfile <x86intrin.h>
2120///
2121/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2122///
2123/// \param __p
2124/// A pointer to an aligned memory location used to store the register value.
2125/// \param __a
2126/// A 64-bit integer containing the value to be stored.
2127static __inline__ void __DEFAULT_FN_ATTRS_MMX
2128_mm_stream_pi(void *__p, __m64 __a)
2129{
2130 __builtin_ia32_movntq((__m64 *)__p, __a);
2131}
2132
2133/// Moves packed float values from a 128-bit vector of [4 x float] to a
2134/// 128-bit aligned memory location. To minimize caching, the data is flagged
2135/// as non-temporal (unlikely to be used again soon).
2136///
2137/// \headerfile <x86intrin.h>
2138///
2139/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2140///
2141/// \param __p
2142/// A pointer to a 128-bit aligned memory location that will receive the
2143/// single-precision floating-point values.
2144/// \param __a
2145/// A 128-bit vector of [4 x float] containing the values to be moved.
2146static __inline__ void __DEFAULT_FN_ATTRS
2147_mm_stream_ps(void *__p, __m128 __a)
2148{
2149 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2150}
2151
#ifdef __cplusplus
extern "C" {
#endif

/// Store fence: enforces strong ordering (serialization) between the store
///    instructions that precede it and those that follow it, so that all
///    earlier stores are completed before any later store executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2170
/// Extracts one 16-bit element from a 64-bit vector of [4 x i16], selected
///    by an immediate integer operand, and returns it.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns The selected 16-bit element, returned as an int.
/* Both arguments are parenthesized before the casts so that compound
   argument expressions (e.g. a conditional) are cast as a whole. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2193
/// Copies a 64-bit vector of [4 x i16] to the destination and replaces one
///    of its 16-bit elements, selected by the immediate operand \a n, with
///    the lower 16 bits of an integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16 bits of this operand are written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d: \n
///    0: Bits [15:0] are replaced. \n
///    1: Bits [31:16] are replaced. \n
///    2: Bits [47:32] are replaced. \n
///    3: Bits [63:48] are replaced. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* All arguments are parenthesized before the casts so that compound
   argument expressions (e.g. a conditional) are cast as a whole. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2224
2225/// Compares each of the corresponding packed 16-bit integer values of
2226/// the 64-bit integer vectors, and writes the greater value to the
2227/// corresponding bits in the destination.
2228///
2229/// \headerfile <x86intrin.h>
2230///
2231/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2232///
2233/// \param __a
2234/// A 64-bit integer vector containing one of the source operands.
2235/// \param __b
2236/// A 64-bit integer vector containing one of the source operands.
2237/// \returns A 64-bit integer vector containing the comparison results.
2238static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2239_mm_max_pi16(__m64 __a, __m64 __b)
2240{
2241 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2242}
2243
2244/// Compares each of the corresponding packed 8-bit unsigned integer
2245/// values of the 64-bit integer vectors, and writes the greater value to the
2246/// corresponding bits in the destination.
2247///
2248/// \headerfile <x86intrin.h>
2249///
2250/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2251///
2252/// \param __a
2253/// A 64-bit integer vector containing one of the source operands.
2254/// \param __b
2255/// A 64-bit integer vector containing one of the source operands.
2256/// \returns A 64-bit integer vector containing the comparison results.
2257static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2258_mm_max_pu8(__m64 __a, __m64 __b)
2259{
2260 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2261}
2262
2263/// Compares each of the corresponding packed 16-bit integer values of
2264/// the 64-bit integer vectors, and writes the lesser value to the
2265/// corresponding bits in the destination.
2266///
2267/// \headerfile <x86intrin.h>
2268///
2269/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2270///
2271/// \param __a
2272/// A 64-bit integer vector containing one of the source operands.
2273/// \param __b
2274/// A 64-bit integer vector containing one of the source operands.
2275/// \returns A 64-bit integer vector containing the comparison results.
2276static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2277_mm_min_pi16(__m64 __a, __m64 __b)
2278{
2279 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2280}
2281
2282/// Compares each of the corresponding packed 8-bit unsigned integer
2283/// values of the 64-bit integer vectors, and writes the lesser value to the
2284/// corresponding bits in the destination.
2285///
2286/// \headerfile <x86intrin.h>
2287///
2288/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2289///
2290/// \param __a
2291/// A 64-bit integer vector containing one of the source operands.
2292/// \param __b
2293/// A 64-bit integer vector containing one of the source operands.
2294/// \returns A 64-bit integer vector containing the comparison results.
2295static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2296_mm_min_pu8(__m64 __a, __m64 __b)
2297{
2298 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2299}
2300
2301/// Takes the most significant bit from each 8-bit element in a 64-bit
2302/// integer vector to create an 8-bit mask value. Zero-extends the value to
2303/// 32-bit integer and writes it to the destination.
2304///
2305/// \headerfile <x86intrin.h>
2306///
2307/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2308///
2309/// \param __a
2310/// A 64-bit integer vector containing the values with bits to be extracted.
2311/// \returns The most significant bit from each 8-bit element in \a __a,
2312/// written to bits [7:0].
2313static __inline__ int __DEFAULT_FN_ATTRS_MMX
2314_mm_movemask_pi8(__m64 __a)
2315{
2316 return __builtin_ia32_pmovmskb((__v8qi)__a);
2317}
2318
2319/// Multiplies packed 16-bit unsigned integer values and writes the
2320/// high-order 16 bits of each 32-bit product to the corresponding bits in
2321/// the destination.
2322///
2323/// \headerfile <x86intrin.h>
2324///
2325/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2326///
2327/// \param __a
2328/// A 64-bit integer vector containing one of the source operands.
2329/// \param __b
2330/// A 64-bit integer vector containing one of the source operands.
2331/// \returns A 64-bit integer vector containing the products of both operands.
2332static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2333_mm_mulhi_pu16(__m64 __a, __m64 __b)
2334{
2335 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2336}
2337
/// Rearranges the four 16-bit integers of a 64-bit integer vector according
///    to an immediate selector and returns the rearranged vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value made of four 2-bit fields. Each field selects
///    the element of \a a that is copied to one destination element: \n
///    Bits [1:0] select the source for bits [15:0] of the destination. \n
///    Bits [3:2] select the source for bits [31:16] of the destination. \n
///    Bits [5:4] select the source for bits [47:32] of the destination. \n
///    Bits [7:6] select the source for bits [63:48] of the destination. \n
///    Field values: \n
///    00: bits [15:0] of \a a. \n
///    01: bits [31:16] of \a a. \n
///    10: bits [47:32] of \a a. \n
///    11: bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n)                                                \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2374
2375/// Conditionally copies the values from each 8-bit element in the first
2376/// 64-bit integer vector operand to the specified memory location, as
2377/// specified by the most significant bit in the corresponding element in the
2378/// second 64-bit integer vector operand.
2379///
2380/// To minimize caching, the data is flagged as non-temporal
2381/// (unlikely to be used again soon).
2382///
2383/// \headerfile <x86intrin.h>
2384///
2385/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2386///
2387/// \param __d
2388/// A 64-bit integer vector containing the values with elements to be copied.
2389/// \param __n
2390/// A 64-bit integer vector operand. The most significant bit from each 8-bit
2391/// element determines whether the corresponding element in operand \a __d
2392/// is copied. If the most significant bit of a given element is 1, the
2393/// corresponding element in operand \a __d is copied.
2394/// \param __p
2395/// A pointer to a 64-bit memory location that will receive the conditionally
2396/// copied integer values. The address of the memory location does not have
2397/// to be aligned.
2398static __inline__ void __DEFAULT_FN_ATTRS_MMX
2399_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2400{
2401 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2402}
2403
2404/// Computes the rounded averages of the packed unsigned 8-bit integer
2405/// values and writes the averages to the corresponding bits in the
2406/// destination.
2407///
2408/// \headerfile <x86intrin.h>
2409///
2410/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2411///
2412/// \param __a
2413/// A 64-bit integer vector containing one of the source operands.
2414/// \param __b
2415/// A 64-bit integer vector containing one of the source operands.
2416/// \returns A 64-bit integer vector containing the averages of both operands.
2417static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2418_mm_avg_pu8(__m64 __a, __m64 __b)
2419{
2420 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2421}
2422
2423/// Computes the rounded averages of the packed unsigned 16-bit integer
2424/// values and writes the averages to the corresponding bits in the
2425/// destination.
2426///
2427/// \headerfile <x86intrin.h>
2428///
2429/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2430///
2431/// \param __a
2432/// A 64-bit integer vector containing one of the source operands.
2433/// \param __b
2434/// A 64-bit integer vector containing one of the source operands.
2435/// \returns A 64-bit integer vector containing the averages of both operands.
2436static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2437_mm_avg_pu16(__m64 __a, __m64 __b)
2438{
2439 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2440}
2441
2442/// Subtracts the corresponding 8-bit unsigned integer values of the two
2443/// 64-bit vector operands and computes the absolute value for each of the
2444/// difference. Then sum of the 8 absolute differences is written to the
2445/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2446///
2447/// \headerfile <x86intrin.h>
2448///
2449/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2450///
2451/// \param __a
2452/// A 64-bit integer vector containing one of the source operands.
2453/// \param __b
2454/// A 64-bit integer vector containing one of the source operands.
2455/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2456/// sets of absolute differences between both operands. The upper bits are
2457/// cleared.
2458static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2459_mm_sad_pu8(__m64 __a, __m64 __b)
2460{
2461 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2462}
2463
2464#if defined(__cplusplus)
2465extern "C" {
2466#endif
2467
/// Reads the MXCSR register and returns its contents as a 32-bit unsigned
///    integer value.
///
///    Several groups of macros are meant to be used with the returned value:
///    <ul>
///    <li>
///      Exception state flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. The convenience wrapper for this group is
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      Exception mask bits: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. The convenience wrapper for this group is
///      _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. The convenience wrapper for this group is
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. The
///      convenience wrapper for this group is _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. The convenience wrapper for this group is
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, this expression tests whether an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    This expression yields the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2519
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    Only the declaration appears here; no body is provided at this point in
///    the header. When compiled as C++ the declaration has C linkage because
///    it sits inside the surrounding <c> extern "C" </c> block.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up:
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///    The rounding field has to be cleared before the new mode is OR-ed in:
///    OR-ing _MM_ROUND_UP into a value that already contains _MM_ROUND_DOWN
///    yields _MM_ROUND_TOWARD_ZERO rather than _MM_ROUND_UP. The wrapper
///    _MM_SET_ROUNDING_MODE(_MM_ROUND_UP) performs the same clear-then-set
///    sequence.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);
2573
2574#if defined(__cplusplus)
2575} // extern "C"
2576#endif
2577
/// Builds a 128-bit vector of [4 x float] by picking four float values from
///    the two 128-bit operands, as directed by an immediate selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate 8-bit value that selects the elements copied from \a a and
///    \a b. \n
///    Bits [3:0] select the values taken from operand \a a. \n
///    Bits [7:4] select the values taken from operand \a b. \n
///    Each 2-bit field controls one 32-bit element of the destination: \n
///    Bits [1:0] choose the value written to bits [31:0] of the
///    destination. \n
///    Bits [3:2] choose the value written to bits [63:32] of the
///    destination. \n
///    Bits [5:4] choose the value written to bits [95:64] of the
///    destination. \n
///    Bits [7:6] choose the value written to bits [127:96] of the
///    destination. \n
///    Meaning of each 2-bit field value: \n
///    00: Bits [31:0] of the selected operand are copied. \n
///    01: Bits [63:32] of the selected operand are copied. \n
///    10: Bits [95:64] of the selected operand are copied. \n
///    11: Bits [127:96] of the selected operand are copied. \n
///    Note: The \c _MM_SHUFFLE macro can be used to build the mask.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> produces an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps( \
      (__v4sf)(__m128)(a), \
      (__v4sf)(__m128)(b), \
      (int)(mask)))
2620
2621/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2622/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2623///
2624/// \headerfile <x86intrin.h>
2625///
2626/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2627///
2628/// \param __a
2629/// A 128-bit vector of [4 x float]. \n
2630/// Bits [95:64] are written to bits [31:0] of the destination. \n
2631/// Bits [127:96] are written to bits [95:64] of the destination.
2632/// \param __b
2633/// A 128-bit vector of [4 x float].
2634/// Bits [95:64] are written to bits [63:32] of the destination. \n
2635/// Bits [127:96] are written to bits [127:96] of the destination.
2636/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2637static __inline__ __m128 __DEFAULT_FN_ATTRS
2638_mm_unpackhi_ps(__m128 __a, __m128 __b)
2639{
2640 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2641}
2642
2643/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2644/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2645///
2646/// \headerfile <x86intrin.h>
2647///
2648/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2649///
2650/// \param __a
2651/// A 128-bit vector of [4 x float]. \n
2652/// Bits [31:0] are written to bits [31:0] of the destination. \n
2653/// Bits [63:32] are written to bits [95:64] of the destination.
2654/// \param __b
2655/// A 128-bit vector of [4 x float]. \n
2656/// Bits [31:0] are written to bits [63:32] of the destination. \n
2657/// Bits [63:32] are written to bits [127:96] of the destination.
2658/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2659static __inline__ __m128 __DEFAULT_FN_ATTRS
2660_mm_unpacklo_ps(__m128 __a, __m128 __b)
2661{
2662 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2663}
2664
2665/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2666/// 32 bits are set to the lower 32 bits of the second parameter. The upper
2667/// 96 bits are set to the upper 96 bits of the first parameter.
2668///
2669/// \headerfile <x86intrin.h>
2670///
2671/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2672/// instruction.
2673///
2674/// \param __a
2675/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2676/// written to the upper 96 bits of the result.
2677/// \param __b
2678/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2679/// written to the lower 32 bits of the result.
2680/// \returns A 128-bit floating-point vector of [4 x float].
2681static __inline__ __m128 __DEFAULT_FN_ATTRS
2682_mm_move_ss(__m128 __a, __m128 __b)
2683{
2684 __a[0] = __b[0];
2685 return __a;
2686}
2687
2688/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2689/// 64 bits are set to the upper 64 bits of the second parameter. The upper
2690/// 64 bits are set to the upper 64 bits of the first parameter.
2691///
2692/// \headerfile <x86intrin.h>
2693///
2694/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2695///
2696/// \param __a
2697/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2698/// written to the upper 64 bits of the result.
2699/// \param __b
2700/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2701/// written to the lower 64 bits of the result.
2702/// \returns A 128-bit floating-point vector of [4 x float].
2703static __inline__ __m128 __DEFAULT_FN_ATTRS
2704_mm_movehl_ps(__m128 __a, __m128 __b)
2705{
2706 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2707}
2708
2709/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2710/// 64 bits are set to the lower 64 bits of the first parameter. The upper
2711/// 64 bits are set to the lower 64 bits of the second parameter.
2712///
2713/// \headerfile <x86intrin.h>
2714///
2715/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2716///
2717/// \param __a
2718/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2719/// written to the lower 64 bits of the result.
2720/// \param __b
2721/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2722/// written to the upper 64 bits of the result.
2723/// \returns A 128-bit floating-point vector of [4 x float].
2724static __inline__ __m128 __DEFAULT_FN_ATTRS
2725_mm_movelh_ps(__m128 __a, __m128 __b)
2726{
2727 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2728}
2729
2730/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2731/// float].
2732///
2733/// \headerfile <x86intrin.h>
2734///
2735/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2736///
2737/// \param __a
2738/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2739/// from the corresponding elements in this operand.
2740/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2741/// values from the operand.
2742static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2743_mm_cvtpi16_ps(__m64 __a)
2744{
2745 __m64 __b, __c;
2746 __m128 __r;
2747
2748 __b = _mm_setzero_si64();
2749 __b = _mm_cmpgt_pi16(__b, __a);
2750 __c = _mm_unpackhi_pi16(__a, __b);
2751 __r = _mm_setzero_ps();
2752 __r = _mm_cvtpi32_ps(__r, __c);
2753 __r = _mm_movelh_ps(__r, __r);
2754 __c = _mm_unpacklo_pi16(__a, __b);
2755 __r = _mm_cvtpi32_ps(__r, __c);
2756
2757 return __r;
2758}
2759
2760/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2761/// 128-bit vector of [4 x float].
2762///
2763/// \headerfile <x86intrin.h>
2764///
2765/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2766///
2767/// \param __a
2768/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2769/// destination are copied from the corresponding elements in this operand.
2770/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2771/// values from the operand.
2772static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2773_mm_cvtpu16_ps(__m64 __a)
2774{
2775 __m64 __b, __c;
2776 __m128 __r;
2777
2778 __b = _mm_setzero_si64();
2779 __c = _mm_unpackhi_pi16(__a, __b);
2780 __r = _mm_setzero_ps();
2781 __r = _mm_cvtpi32_ps(__r, __c);
2782 __r = _mm_movelh_ps(__r, __r);
2783 __c = _mm_unpacklo_pi16(__a, __b);
2784 __r = _mm_cvtpi32_ps(__r, __c);
2785
2786 return __r;
2787}
2788
2789/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2790/// into a 128-bit vector of [4 x float].
2791///
2792/// \headerfile <x86intrin.h>
2793///
2794/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2795///
2796/// \param __a
2797/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2798/// from the corresponding lower 4 elements in this operand.
2799/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2800/// values from the operand.
2801static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2802_mm_cvtpi8_ps(__m64 __a)
2803{
2804 __m64 __b;
2805
2806 __b = _mm_setzero_si64();
2807 __b = _mm_cmpgt_pi8(__b, __a);
2808 __b = _mm_unpacklo_pi8(__a, __b);
2809
2810 return _mm_cvtpi16_ps(__b);
2811}
2812
2813/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2814/// vector of [8 x u8] into a 128-bit vector of [4 x float].
2815///
2816/// \headerfile <x86intrin.h>
2817///
2818/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2819///
2820/// \param __a
2821/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2822/// destination are copied from the corresponding lower 4 elements in this
2823/// operand.
2824/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2825/// values from the source operand.
2826static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2827_mm_cvtpu8_ps(__m64 __a)
2828{
2829 __m64 __b;
2830
2831 __b = _mm_setzero_si64();
2832 __b = _mm_unpacklo_pi8(__a, __b);
2833
2834 return _mm_cvtpi16_ps(__b);
2835}
2836
2837/// Converts the two 32-bit signed integer values from each 64-bit vector
2838/// operand of [2 x i32] into a 128-bit vector of [4 x float].
2839///
2840/// \headerfile <x86intrin.h>
2841///
2842/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2843///
2844/// \param __a
2845/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2846/// copied from the elements in this operand.
2847/// \param __b
2848/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2849/// copied from the elements in this operand.
2850/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2851/// copied and converted values from the first operand. The upper 64 bits
2852/// contain the copied and converted values from the second operand.
2853static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2854_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2855{
2856 __m128 __c;
2857
2858 __c = _mm_setzero_ps();
2859 __c = _mm_cvtpi32_ps(__c, __b);
2860 __c = _mm_movelh_ps(__c, __c);
2861
2862 return _mm_cvtpi32_ps(__c, __a);
2863}
2864
2865/// Converts each single-precision floating-point element of a 128-bit
2866/// floating-point vector of [4 x float] into a 16-bit signed integer, and
2867/// packs the results into a 64-bit integer vector of [4 x i16].
2868///
2869/// If the floating-point element is NaN or infinity, or if the
2870/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2871/// it is converted to 0x8000. Otherwise if the floating-point element is
2872/// greater than 0x7FFF, it is converted to 0x7FFF.
2873///
2874/// \headerfile <x86intrin.h>
2875///
2876/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2877///
2878/// \param __a
2879/// A 128-bit floating-point vector of [4 x float].
2880/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2881/// values.
2882static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2883_mm_cvtps_pi16(__m128 __a)
2884{
2885 __m64 __b, __c;
2886
2887 __b = _mm_cvtps_pi32(__a);
2888 __a = _mm_movehl_ps(__a, __a);
2889 __c = _mm_cvtps_pi32(__a);
2890
2891 return _mm_packs_pi32(__b, __c);
2892}
2893
2894/// Converts each single-precision floating-point element of a 128-bit
2895/// floating-point vector of [4 x float] into an 8-bit signed integer, and
2896/// packs the results into the lower 32 bits of a 64-bit integer vector of
2897/// [8 x i8]. The upper 32 bits of the vector are set to 0.
2898///
2899/// If the floating-point element is NaN or infinity, or if the
2900/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2901/// is converted to 0x80. Otherwise if the floating-point element is greater
2902/// than 0x7F, it is converted to 0x7F.
2903///
2904/// \headerfile <x86intrin.h>
2905///
2906/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2907///
2908/// \param __a
2909/// 128-bit floating-point vector of [4 x float].
2910/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2911/// converted values and the uppper 32 bits are set to zero.
2912static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2913_mm_cvtps_pi8(__m128 __a)
2914{
2915 __m64 __b, __c;
2916
2917 __b = _mm_cvtps_pi16(__a);
2918 __c = _mm_setzero_si64();
2919
2920 return _mm_packs_pi16(__b, __c);
2921}
2922
2923/// Extracts the sign bits from each single-precision floating-point
2924/// element of a 128-bit floating-point vector of [4 x float] and returns the
2925/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2926/// to zero.
2927///
2928/// \headerfile <x86intrin.h>
2929///
2930/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2931///
2932/// \param __a
2933/// A 128-bit floating-point vector of [4 x float].
2934/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2935/// single-precision floating-point element of the parameter. Bits [31:4] are
2936/// set to zero.
2937static __inline__ int __DEFAULT_FN_ATTRS
2938_mm_movemask_ps(__m128 __a)
2939{
2940 return __builtin_ia32_movmskps((__v4sf)__a);
2941}
2942
2943
/* Declaration attribute requesting 16-byte alignment. The attribute is spelled
 * with the reserved __aligned__ form, as the vector typedefs in this header
 * do, so that a user macro named `aligned` cannot change its meaning. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
2945
/* Packs four selectors into one immediate: z lands in bits [7:6], y in bits
 * [5:4], x in bits [3:2] and w in bits [1:0]. The arguments are combined as
 * given; they are not masked to two bits. */
#define _MM_SHUFFLE(z, y, x, w) \
  (((z) << 6) | \
   ((y) << 4) | \
   ((x) << 2) | \
   (w))
2947
/* Exception state flags: one bit each in bits [5:0]. */
#define _MM_EXCEPT_INVALID    (1U << 0)
#define _MM_EXCEPT_DENORM     (1U << 1)
#define _MM_EXCEPT_DIV_ZERO   (1U << 2)
#define _MM_EXCEPT_OVERFLOW   (1U << 3)
#define _MM_EXCEPT_UNDERFLOW  (1U << 4)
#define _MM_EXCEPT_INEXACT    (1U << 5)
#define _MM_EXCEPT_MASK       (0x3fU << 0)

/* Exception mask flags: one bit each in bits [12:7]. */
#define _MM_MASK_INVALID      (1U << 7)
#define _MM_MASK_DENORM       (1U << 8)
#define _MM_MASK_DIV_ZERO     (1U << 9)
#define _MM_MASK_OVERFLOW     (1U << 10)
#define _MM_MASK_UNDERFLOW    (1U << 11)
#define _MM_MASK_INEXACT      (1U << 12)
#define _MM_MASK_MASK         (0x3fU << 7)

/* Rounding mode: a two-bit field in bits [14:13]. */
#define _MM_ROUND_NEAREST     (0U << 13)
#define _MM_ROUND_DOWN        (1U << 13)
#define _MM_ROUND_UP          (2U << 13)
#define _MM_ROUND_TOWARD_ZERO (3U << 13)
#define _MM_ROUND_MASK        (3U << 13)

/* Flush-to-zero mode: bit 15. */
#define _MM_FLUSH_ZERO_MASK   (1U << 15)
#define _MM_FLUSH_ZERO_ON     (1U << 15)
#define _MM_FLUSH_ZERO_OFF    (0U << 15)
2973
/* Field readers: the value returned by _mm_getcsr() reduced to one field. */
#define _MM_GET_EXCEPTION_MASK() \
  (_MM_MASK_MASK & _mm_getcsr())
#define _MM_GET_EXCEPTION_STATE() \
  (_MM_EXCEPT_MASK & _mm_getcsr())
#define _MM_GET_FLUSH_ZERO_MODE() \
  (_MM_FLUSH_ZERO_MASK & _mm_getcsr())
#define _MM_GET_ROUNDING_MODE() \
  (_MM_ROUND_MASK & _mm_getcsr())

/* Field writers: read the register, clear one field, OR in x and write the
 * result back with _mm_setcsr(). x is OR-ed in as given; it is not masked to
 * the field being replaced. */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((~_MM_MASK_MASK & _mm_getcsr()) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((~_MM_EXCEPT_MASK & _mm_getcsr()) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((~_MM_FLUSH_ZERO_MASK & _mm_getcsr()) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((~_MM_ROUND_MASK & _mm_getcsr()) | (x)))
2983
/* Transposes, in place, the 4x4 matrix of floats whose rows are the four
 * __m128 lvalues row0..row3.
 *
 * Each row argument is both read and assigned, so it must be a modifiable
 * lvalue, and it is evaluated more than once, so it should have no side
 * effects.
 *
 * The temporaries use reserved (double-underscore) names. With ordinary names
 * such as tmp0, a caller passing its own variable called tmp0 as a row would
 * have it shadowed by the macro's local, and the transpose would silently
 * operate on the wrong object. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2996
/* Aliases for compatibility.
 *
 * Each _m_* name below is an object-like macro that expands to the
 * corresponding _mm_* intrinsic name, so both spellings refer to the same
 * function or macro. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* NOTE(review): this maps only the bare token _m_ to _mm_. An object-like
 * macro does not rewrite identifier prefixes, so it does not alias any other
 * _m_* name. Confirm whether it is intentional. */
#define _m_ _mm_
3012
/* The attribute helper macros were defined for the functions in this file;
 * undefine them so the names are no longer defined for code that follows. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
3015
3016/* Ugly hack for backwards-compatibility (compatible with gcc) */
3017#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3018#include <emmintrin.h>
3019#endif
3020
3021#endif /* __XMMINTRIN_H */
3022

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/xmmintrin.h