smmintrin.h source code [clang/lib/Headers/smmintrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9
10	#ifndef __SMMINTRIN_H
11	#define __SMMINTRIN_H
12
13	#if !defined(__i386__) && !defined(__x86_64__)
14	#error "This header is only meant to be used on x86 and x64 architecture"
15	#endif
16
17	#include <tmmintrin.h>
18
/* Attribute set shared by every inline function defined in this header:
 * force inlining, omit debug info, build the body for the
 * "sse4.1,no-evex512" target, and declare a 128-bit minimum vector width. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
23
/* SSE4 rounding-control immediates, used as the last argument of the
 * _mm_round_* macros. */

/* Bits [1:0]: explicit rounding direction.
 * Bit [2]: take the direction from the current MXCSR setting instead. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Bit [3]: precision-exception control (0 = normal PE exception,
 * 1 = PE field is not updated). */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made combinations of an exception mode and a rounding direction.
 * The operands are combined with a plain bitwise OR. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
40
/// Applies a ceiling operation to a 128-bit vector of [4 x float]: every
///    element is rounded up to an integer value and the results are returned
///    in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    The 128-bit vector of [4 x float] whose elements are rounded up.
/// \returns A 128-bit vector of [4 x float] holding the rounded-up elements.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
57
/// Applies a ceiling operation to a 128-bit vector of [2 x double]: every
///    element is rounded up to an integer value and the results are returned
///    in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    The 128-bit vector of [2 x double] whose elements are rounded up.
/// \returns A 128-bit vector of [2 x double] holding the rounded-up elements.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
74
/// Builds a 128-bit vector of [4 x float] whose lowest element is the lowest
///    element of the second operand rounded up to an integer value, and whose
///    three upper elements are copied unchanged from the corresponding
///    elements of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. Bits [127:32] supply the matching bits
///    of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value in bits [31:0] is rounded up
///    to the nearest integer and written to the same bits of the result.
/// \returns A 128-bit vector of [4 x float] made of the copied upper elements
///    and the rounded lowest element.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
99
/// Builds a 128-bit vector of [2 x double] whose lower element is the lower
///    element of the second operand rounded up to an integer value, and whose
///    upper element is copied unchanged from the upper element of the first
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. Bits [127:64] supply the matching
///    bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value in bits [63:0] is rounded up
///    to the nearest integer and written to the same bits of the result.
/// \returns A 128-bit vector of [2 x double] made of the copied upper element
///    and the rounded lower element.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
124
/// Applies a floor operation to a 128-bit vector of [4 x float]: every
///    element is rounded down to an integer value and the results are
///    returned in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    The 128-bit vector of [4 x float] whose elements are rounded down.
/// \returns A 128-bit vector of [4 x float] holding the rounded-down elements.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
141
/// Applies a floor operation to a 128-bit vector of [2 x double]: every
///    element is rounded down to an integer value and the results are
///    returned in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    The 128-bit vector of [2 x double] whose elements are rounded down.
/// \returns A 128-bit vector of [2 x double] holding the rounded-down
///    elements.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
158
/// Builds a 128-bit vector of [4 x float] whose lowest element is the lowest
///    element of the second operand rounded down to an integer value, and
///    whose three upper elements are copied unchanged from the corresponding
///    elements of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. Bits [127:32] supply the matching bits
///    of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value in bits [31:0] is rounded
///    down to the nearest integer and written to the same bits of the result.
/// \returns A 128-bit vector of [4 x float] made of the copied upper elements
///    and the rounded lowest element.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
183
/// Builds a 128-bit vector of [2 x double] whose lower element is the lower
///    element of the second operand rounded down to an integer value, and
///    whose upper element is copied unchanged from the upper element of the
///    first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. Bits [127:64] supply the matching
///    bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value in bits [63:0] is rounded
///    down to the nearest integer and written to the same bits of the result.
/// \returns A 128-bit vector of [2 x double] made of the copied upper element
///    and the rounded lower element.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
208
/// Rounds every element of a 128-bit vector of [4 x float] to an integer
///    value, using the rounding control given by the second argument, and
///    returns the results in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    The 128-bit vector of [4 x float] to round.
/// \param M
///    An integer value selecting the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] holding the rounded elements.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
241
/// Builds a 128-bit vector of [4 x float] whose lowest element is the lowest
///    element of the second operand rounded to an integer value, using the
///    rounding control given by the third argument, and whose three upper
///    elements are copied unchanged from the corresponding elements of the
///    first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. Bits [127:32] supply the matching bits
///    of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value in bits [31:0] is rounded to
///    the nearest integer using the specified rounding control and written to
///    the same bits of the result.
/// \param M
///    An integer value selecting the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] made of the copied upper elements
///    and the rounded lowest element.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))
283
/// Rounds every element of a 128-bit vector of [2 x double] to an integer
///    value, using the rounding control given by the second argument, and
///    returns the results in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    The 128-bit vector of [2 x double] to round.
/// \param M
///    An integer value selecting the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] holding the rounded elements.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
316
/// Builds a 128-bit vector of [2 x double] whose lower element is the lower
///    element of the second operand rounded to an integer value, using the
///    rounding control given by the third argument, and whose upper element
///    is copied unchanged from the upper element of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. Bits [127:64] supply the matching
///    bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value in bits [63:0] is rounded to
///    the nearest integer using the specified rounding control and written to
///    the same bits of the result.
/// \param M
///    An integer value selecting the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] made of the copied upper element
///    and the rounded lower element.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))
358
/* SSE4 Packed Blending Intrinsics. */
/// Selects each element of a 128-bit vector of [2 x double] from either the
///    first or the second operand, as directed by the third operand, the
///    control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand whose mask bits [1:0] control the copy.
///    Each mask bit position matches the index of one result element. A mask
///    bit of 0 copies the corresponding 64-bit element of \a V1 to that
///    position in the result; a mask bit of 1 copies the corresponding 64-bit
///    element of \a V2 instead.
/// \returns A 128-bit vector of [2 x double] containing the selected values.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))
387
/// Selects each element of a 128-bit vector of [4 x float] from either the
///    first or the second operand, as directed by the third operand, the
///    control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand whose mask bits [3:0] control the copy.
///    Each mask bit position matches the index of one result element. A mask
///    bit of 0 copies the corresponding 32-bit element of \a V1 to that
///    position in the result; a mask bit of 1 copies the corresponding 32-bit
///    element of \a V2 instead.
/// \returns A 128-bit vector of [4 x float] containing the selected values.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))
415
416	/// Returns a 128-bit vector of [2 x double] where the values are
417	/// selected from either the first or second operand as specified by the
418	/// third operand, the control mask.
419	///
420	/// \headerfile <x86intrin.h>
421	///
422	/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
423	///
424	/// \param __V1
425	/// A 128-bit vector of [2 x double].
426	/// \param __V2
427	/// A 128-bit vector of [2 x double].
428	/// \param __M
429	/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
430	/// values are to be copied. The position of the mask bit corresponds to the
431	/// most significant bit of a copied value. When a mask bit is 0, the
432	/// corresponding 64-bit element in operand \a __V1 is copied to the same
433	/// position in the result. When a mask bit is 1, the corresponding 64-bit
434	/// element in operand \a __V2 is copied to the same position in the result.
435	/// \returns A 128-bit vector of [2 x double] containing the copied values.
436	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
437	__m128d __V2,
438	__m128d __M) {
439	return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
440	(__v2df)__M);
441	}
442
443	/// Returns a 128-bit vector of [4 x float] where the values are
444	/// selected from either the first or second operand as specified by the
445	/// third operand, the control mask.
446	///
447	/// \headerfile <x86intrin.h>
448	///
449	/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
450	///
451	/// \param __V1
452	/// A 128-bit vector of [4 x float].
453	/// \param __V2
454	/// A 128-bit vector of [4 x float].
455	/// \param __M
456	/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
457	/// how the values are to be copied. The position of the mask bit corresponds
458	/// to the most significant bit of a copied value. When a mask bit is 0, the
459	/// corresponding 32-bit element in operand \a __V1 is copied to the same
460	/// position in the result. When a mask bit is 1, the corresponding 32-bit
461	/// element in operand \a __V2 is copied to the same position in the result.
462	/// \returns A 128-bit vector of [4 x float] containing the copied values.
463	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
464	__m128 __V2,
465	__m128 __M) {
466	return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
467	(__v4sf)__M);
468	}
469
470	/// Returns a 128-bit vector of [16 x i8] where the values are selected
471	/// from either of the first or second operand as specified by the third
472	/// operand, the control mask.
473	///
474	/// \headerfile <x86intrin.h>
475	///
476	/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
477	///
478	/// \param __V1
479	/// A 128-bit vector of [16 x i8].
480	/// \param __V2
481	/// A 128-bit vector of [16 x i8].
482	/// \param __M
483	/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
484	/// how the values are to be copied. The position of the mask bit corresponds
485	/// to the most significant bit of a copied value. When a mask bit is 0, the
486	/// corresponding 8-bit element in operand \a __V1 is copied to the same
487	/// position in the result. When a mask bit is 1, the corresponding 8-bit
488	/// element in operand \a __V2 is copied to the same position in the result.
489	/// \returns A 128-bit vector of [16 x i8] containing the copied values.
490	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
491	__m128i __V2,
492	__m128i __M) {
493	return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
494	(__v16qi)__M);
495	}
496
/// Selects each element of a 128-bit vector of [8 x i16] from either the
///    first or the second operand, as directed by the third operand, the
///    control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand whose mask bits [7:0] control the copy.
///    Each mask bit position matches the index of one result element. A mask
///    bit of 0 copies the corresponding 16-bit element of \a V1 to that
///    position in the result; a mask bit of 1 copies the corresponding 16-bit
///    element of \a V2 instead.
/// \returns A 128-bit vector of [8 x i16] containing the selected values.
#define _mm_blend_epi16(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
                                      (__v8hi)(__m128i)(V2), (int)(M)))
524
525	/* SSE4 Dword Multiply Instructions. */
526	/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
527	/// and returns the lower 32 bits of the each product in a 128-bit vector of
528	/// [4 x i32].
529	///
530	/// \headerfile <x86intrin.h>
531	///
532	/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
533	///
534	/// \param __V1
535	/// A 128-bit integer vector.
536	/// \param __V2
537	/// A 128-bit integer vector.
538	/// \returns A 128-bit integer vector containing the products of both operands.
539	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
540	__m128i __V2) {
541	return (__m128i)((__v4su)__V1 * (__v4su)__V2);
542	}
543
544	/// Multiplies corresponding even-indexed elements of two 128-bit
545	/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
546	/// containing the products.
547	///
548	/// \headerfile <x86intrin.h>
549	///
550	/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
551	///
552	/// \param __V1
553	/// A 128-bit vector of [4 x i32].
554	/// \param __V2
555	/// A 128-bit vector of [4 x i32].
556	/// \returns A 128-bit vector of [2 x i64] containing the products of both
557	/// operands.
558	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
559	__m128i __V2) {
560	return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
561	}
562
/* SSE4 Floating Point Dot Product Instructions. */
/// Computes the dot product of two 128-bit vectors of [4 x float] and
///    returns it in the elements of a 128-bit result vector of [4 x float].
///
///    The immediate integer operand selects which input elements take part
///    in the dot product and which result elements receive it.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param Y
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand. Mask bits [7:4] choose the input
///    elements, with bit [4] standing for the lowest element and bit [7] for
///    the highest element of each [4 x float] vector. When a bit is set, the
///    corresponding elements of the two input vectors feed the dot product;
///    otherwise that input is treated as zero. Bits [3:0] choose the result
///    elements that receive a copy of the final dot product, with bit [0]
///    standing for the lowest element and bit [3] for the highest element of
///    the [4 x float] result. When a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
598
/// Computes the dot product of two 128-bit vectors of [2 x double] and
///    returns it in the elements of a 128-bit result vector of [2 x double].
///
///    The immediate integer operand selects which input elements take part
///    in the dot product and which result elements receive it.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param Y
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand. Mask bits [5:4] choose the input
///    elements, with bit [4] standing for the lowest element and bit [5] for
///    the highest element of each [2 x double] vector. When a bit is set, the
///    corresponding elements of the two input vectors feed the dot product;
///    otherwise that input is treated as zero. Bits [1:0] choose the result
///    elements that receive a copy of the final dot product, with bit [0]
///    standing for the lowest element and bit [1] for the highest element of
///    the [2 x double] result. When a bit is set, the dot product is returned
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
#define _mm_dp_pd(X, Y, M)                                                     \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
                                (M)))
633
634	/* SSE4 Streaming Load Hint Instruction. */
635	/// Loads integer values from a 128-bit aligned memory location to a
636	/// 128-bit integer vector.
637	///
638	/// \headerfile <x86intrin.h>
639	///
640	/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
641	///
642	/// \param __V
643	/// A pointer to a 128-bit aligned memory location that contains the integer
644	/// values.
645	/// \returns A 128-bit integer vector containing the data stored at the
646	/// specified memory location.
647	static __inline__ __m128i __DEFAULT_FN_ATTRS
648	_mm_stream_load_si128(const void *__V) {
649	return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
650	}
651
652	/* SSE4 Packed Integer Min/Max Instructions. */
653	/// Compares the corresponding elements of two 128-bit vectors of
654	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
655	/// of the two values.
656	///
657	/// \headerfile <x86intrin.h>
658	///
659	/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
660	///
661	/// \param __V1
662	/// A 128-bit vector of [16 x i8].
663	/// \param __V2
664	/// A 128-bit vector of [16 x i8]
665	/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
666	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
667	__m128i __V2) {
668	return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
669	}
670
671	/// Compares the corresponding elements of two 128-bit vectors of
672	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
673	/// greater value of the two.
674	///
675	/// \headerfile <x86intrin.h>
676	///
677	/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
678	///
679	/// \param __V1
680	/// A 128-bit vector of [16 x i8].
681	/// \param __V2
682	/// A 128-bit vector of [16 x i8].
683	/// \returns A 128-bit vector of [16 x i8] containing the greater values.
684	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
685	__m128i __V2) {
686	return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
687	}
688
689	/// Compares the corresponding elements of two 128-bit vectors of
690	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
691	/// value of the two.
692	///
693	/// \headerfile <x86intrin.h>
694	///
695	/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
696	///
697	/// \param __V1
698	/// A 128-bit vector of [8 x u16].
699	/// \param __V2
700	/// A 128-bit vector of [8 x u16].
701	/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
702	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
703	__m128i __V2) {
704	return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
705	}
706
707	/// Compares the corresponding elements of two 128-bit vectors of
708	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
709	/// greater value of the two.
710	///
711	/// \headerfile <x86intrin.h>
712	///
713	/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
714	///
715	/// \param __V1
716	/// A 128-bit vector of [8 x u16].
717	/// \param __V2
718	/// A 128-bit vector of [8 x u16].
719	/// \returns A 128-bit vector of [8 x u16] containing the greater values.
720	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
721	__m128i __V2) {
722	return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
723	}
724
725	/// Compares the corresponding elements of two 128-bit vectors of
726	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
727	/// value of the two.
728	///
729	/// \headerfile <x86intrin.h>
730	///
731	/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
732	///
733	/// \param __V1
734	/// A 128-bit vector of [4 x i32].
735	/// \param __V2
736	/// A 128-bit vector of [4 x i32].
737	/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
738	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
739	__m128i __V2) {
740	return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
741	}
742
743	/// Compares the corresponding elements of two 128-bit vectors of
744	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
745	/// greater value of the two.
746	///
747	/// \headerfile <x86intrin.h>
748	///
749	/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
750	///
751	/// \param __V1
752	/// A 128-bit vector of [4 x i32].
753	/// \param __V2
754	/// A 128-bit vector of [4 x i32].
755	/// \returns A 128-bit vector of [4 x i32] containing the greater values.
756	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
757	__m128i __V2) {
758	return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
759	}
760
761	/// Compares the corresponding elements of two 128-bit vectors of
762	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
763	/// value of the two.
764	///
765	/// \headerfile <x86intrin.h>
766	///
767	/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
768	///
769	/// \param __V1
770	/// A 128-bit vector of [4 x u32].
771	/// \param __V2
772	/// A 128-bit vector of [4 x u32].
773	/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
774	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
775	__m128i __V2) {
776	return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
777	}
778
779	/// Compares the corresponding elements of two 128-bit vectors of
780	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
781	/// greater value of the two.
782	///
783	/// \headerfile <x86intrin.h>
784	///
785	/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
786	///
787	/// \param __V1
788	/// A 128-bit vector of [4 x u32].
789	/// \param __V2
790	/// A 128-bit vector of [4 x u32].
791	/// \returns A 128-bit vector of [4 x u32] containing the greater values.
792	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
793	__m128i __V2) {
794	return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
795	}
796
/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
/// Copies the first argument \a X, replaces one of its elements with an
///    element taken from the second argument \a Y, and then clears the
///    result elements requested by the zero mask. The source element, the
///    destination element and the zero mask are all encoded in the third
///    argument \a N. The resulting 128-bit vector of [4 x float] is returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
///
/// \param X
///    A 128-bit vector source operand of [4 x float]. Every bit of this
///    operand is copied to the result, except for the element overwritten
///    from \a Y and the elements cleared by bits [3:0] of \a N.
/// \param Y
///    A 128-bit vector source operand of [4 x float]. A single
///    single-precision floating-point element of this operand, chosen by
///    the immediate \a N, is copied to the result.
/// \param N
///    An immediate that selects the element read from \a Y, the result
///    element it is written to, and the result elements to clear: \n
///    Bits [7:6] select the element read from operand \a Y: \n
///      00: Bits [31:0] of operand \a Y. \n
///      01: Bits [63:32] of operand \a Y. \n
///      10: Bits [95:64] of operand \a Y. \n
///      11: Bits [127:96] of operand \a Y. \n
///    Bits [5:4] select the result element that receives the value read
///    from \a Y: \n
///      00: Result bits [31:0]. \n
///      01: Result bits [63:32]. \n
///      10: Result bits [95:64]. \n
///      11: Result bits [127:96]. \n
///    Bits [3:0]: Each bit that is set clears the corresponding result
///    element.
/// \returns A 128-bit vector of [4 x float] containing the copied
///    single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) (__builtin_ia32_insertps128((X), (Y), (N)))
839
/// Returns the bit pattern of one element of a 128-bit vector of
///    [4 x float] as a 32-bit integer. The element is chosen by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] choose which bits of the argument \a X
///    are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X. \n
///    01: Bits [63:32] of parameter \a X. \n
///    10: Bits [95:64] of parameter \a X. \n
///    11: Bits [127:96] of parameter \a X.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
///    The float is reinterpreted bit-for-bit, not converted numerically.
#define _mm_extract_ps(X, N) \
  (__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), \
                                                       (int)(N))))
865
/* Miscellaneous insert and extract macros.  */
/* Store into D the single-precision element of the [4 x float] vector X that
   is selected by the index N. D must be an assignable expression; the
   do/while wrapper lets the macro be used as a single statement. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  do { \
    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  } while (0)
872
/* Build an immediate suitable for _mm_insert_ps by OR-ing together two
   element indexes and a zero mask:
     X - index placed in bits [7:6] of the result,
     Y - index placed in bits [5:4] of the result,
     Z - zeroing bits placed in bits [3:0] of the result.
   The arguments are not range-checked or masked; callers are expected to pass
   X and Y in 0..3 and Z in 0..15. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
876
/* Produce a vector whose first element is element N of X. The value is
   inserted into an all-zero vector at destination index 0, and the zero mask
   0x0e additionally requests clearing of the other three result elements. */
#define _MM_PICK_OUT_PS(X, N) \
  (_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)))
880
/* Insert int into packed integer array at index.  */
/// Builds a 128-bit vector of [16 x i8] from a copy of the 128-bit integer
///    vector parameter \a X in which one element has been replaced by the
///    lower 8 bits of the integer parameter \a I. The element to replace is
///    chosen by the immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. It is copied to the result, and
///    then one of the sixteen result elements is overwritten with the lower
///    8 bits of \a I.
/// \param I
///    An integer. Only its lower 8 bits are written to the result, at the
///    position chosen by \a N.
/// \param N
///    An immediate value. Bits [3:0] hold the index k (0000 through 1111) of
///    the 8-bit result element that is overwritten; index k designates
///    result bits [8*k+7 : 8*k]. For example: \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v16qi( \
      (__v16qi)(__m128i)(X), (int)(I), (int)(N)))
925
/// Builds a 128-bit vector of [4 x i32] from a copy of the 128-bit integer
///    vector parameter \a X in which one element has been replaced by the
///    32-bit integer parameter \a I. The element to replace is chosen by the
///    immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. It is copied to the result, and
///    then one of the four result elements is overwritten with \a I.
/// \param I
///    A 32-bit integer that is written to the result at the position chosen
///    by \a N.
/// \param N
///    An immediate value. Bits [1:0] choose the result element that is
///    overwritten with \a I: \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v4si( \
      (__v4si)(__m128i)(X), (int)(I), (int)(N)))
957
#ifdef __x86_64__
/// Builds a 128-bit vector of [2 x i64] from a copy of the 128-bit integer
///    vector parameter \a X in which one element has been replaced by the
///    64-bit integer parameter \a I. The element to replace is chosen by the
///    immediate value parameter \a N. This macro is only defined when
///    compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. It is copied to the result, and
///    then one of the two result elements is overwritten with \a I.
/// \param I
///    A 64-bit integer that is written to the result at the position chosen
///    by \a N.
/// \param N
///    An immediate value. Bit [0] chooses the result element that is
///    overwritten with \a I: \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  ((__m128i)__builtin_ia32_vec_set_v2di( \
      (__v2di)(__m128i)(X), (long long)(I), (int)(N)))
#endif /* __x86_64__ */
989
/* Extract int from packed integer array at index. The element is returned
 * zero extended, so the result is effectively unsigned.
 */
/// Returns one 8-bit element of the 128-bit integer vector of [16 x i8],
///    zero-extended to an int. The element is chosen by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] hold the index k (0000 through 1111) of
///    the 8-bit element of \a X that is extracted; index k designates bits
///    [8*k+7 : 8*k] of \a X. For example: \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of parameter \a X are extracted. \n
///    1000: Bits [71:64] of parameter \a X are extracted. \n
///    1111: Bits [127:120] of parameter \a X are extracted.
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
#define _mm_extract_epi8(X, N) \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi( \
      (__v16qi)(__m128i)(X), (int)(N)))
1031
/// Returns one 32-bit element of the 128-bit integer vector of [4 x i32].
///    The element is chosen by the immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] choose which 32-bit element of the
///    argument \a X is extracted and copied to the result: \n
///    00: Bits [31:0] of the parameter \a X are extracted. \n
///    01: Bits [63:32] of the parameter \a X are extracted. \n
///    10: Bits [95:64] of the parameter \a X are extracted. \n
///    11: Bits [127:96] of the parameter \a X are extracted.
/// \returns An integer holding the 32 bits selected from the 128-bit integer
///    vector parameter.
#define _mm_extract_epi32(X, N) \
  ((int)__builtin_ia32_vec_ext_v4si( \
      (__v4si)(__m128i)(X), (int)(N)))
1056
/// Returns one 64-bit element of the 128-bit integer vector of [2 x i64].
///    The element is chosen by the immediate value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] chooses which 64-bit element of the
///    argument \a X is returned: \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
  ((long long)__builtin_ia32_vec_ext_v2di( \
      (__v2di)(__m128i)(X), (int)(N)))
1079
1080	/* SSE4 128-bit Packed Integer Comparisons. */
1081	/// Tests whether the specified bits in a 128-bit integer vector are all
1082	/// zeros.
1083	///
1084	/// \headerfile <x86intrin.h>
1085	///
1086	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1087	///
1088	/// \param __M
1089	/// A 128-bit integer vector containing the bits to be tested.
1090	/// \param __V
1091	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1092	/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1093	static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
1094	__m128i __V) {
1095	return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1096	}
1097
1098	/// Tests whether the specified bits in a 128-bit integer vector are all
1099	/// ones.
1100	///
1101	/// \headerfile <x86intrin.h>
1102	///
1103	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1104	///
1105	/// \param __M
1106	/// A 128-bit integer vector containing the bits to be tested.
1107	/// \param __V
1108	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1109	/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1110	static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
1111	__m128i __V) {
1112	return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1113	}
1114
1115	/// Tests whether the specified bits in a 128-bit integer vector are
1116	/// neither all zeros nor all ones.
1117	///
1118	/// \headerfile <x86intrin.h>
1119	///
1120	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1121	///
1122	/// \param __M
1123	/// A 128-bit integer vector containing the bits to be tested.
1124	/// \param __V
1125	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
1126	/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1127	/// FALSE otherwise.
1128	static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
1129	__m128i __V) {
1130	return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1131	}
1132
/// Checks whether every bit of a 128-bit integer vector is set to 1.
///
/// Implemented as _mm_testc_si128 with a selection mask in which all 128
/// bits are set.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector containing the bits to be tested.
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
///    otherwise.
#define _mm_test_all_ones(V) (_mm_testc_si128((V), _mm_set1_epi32(-1)))
1149
/// Checks whether the bits of a 128-bit integer vector that are selected by
///    a mask are a mixture of zeros and ones, that is, neither all zeros nor
///    all ones.
///
/// This is an alias that forwards both arguments to _mm_testnzc_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) (_mm_testnzc_si128((M), (V)))
1168
/// Checks whether the bits of a 128-bit integer vector that are selected by
///    a mask are all zeros.
///
/// This is an alias that forwards both arguments to _mm_testz_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V) (_mm_testz_si128((M), (V)))
1186
1187	/* SSE4 64-bit Packed Integer Comparisons. */
1188	/// Compares each of the corresponding 64-bit values of the 128-bit
1189	/// integer vectors for equality.
1190	///
1191	/// \headerfile <x86intrin.h>
1192	///
1193	/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1194	///
1195	/// \param __V1
1196	/// A 128-bit integer vector.
1197	/// \param __V2
1198	/// A 128-bit integer vector.
1199	/// \returns A 128-bit integer vector containing the comparison results.
1200	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
1201	__m128i __V2) {
1202	return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1203	}
1204
1205	/* SSE4 Packed Integer Sign-Extension. */
1206	/// Sign-extends each of the lower eight 8-bit integer elements of a
1207	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1208	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1209	/// are unused.
1210	///
1211	/// \headerfile <x86intrin.h>
1212	///
1213	/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1214	///
1215	/// \param __V
1216	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1217	/// sign-extended to 16-bit values.
1218	/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1219	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
1220	/* This function always performs a signed extension, but __v16qi is a char
1221	which may be signed or unsigned, so use __v16qs. */
1222	return (__m128i) __builtin_convertvector(
1223	__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
1224	7),
1225	__v8hi);
1226	}
1227
1228	/// Sign-extends each of the lower four 8-bit integer elements of a
1229	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1230	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1231	/// vector are unused.
1232	///
1233	/// \headerfile <x86intrin.h>
1234	///
1235	/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1236	///
1237	/// \param __V
1238	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1239	/// sign-extended to 32-bit values.
1240	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1241	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
1242	/* This function always performs a signed extension, but __v16qi is a char
1243	which may be signed or unsigned, so use __v16qs. */
1244	return (__m128i) __builtin_convertvector(
1245	__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1246	}
1247
1248	/// Sign-extends each of the lower two 8-bit integer elements of a
1249	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1250	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1251	/// vector are unused.
1252	///
1253	/// \headerfile <x86intrin.h>
1254	///
1255	/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1256	///
1257	/// \param __V
1258	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1259	/// sign-extended to 64-bit values.
1260	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1261	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
1262	/* This function always performs a signed extension, but __v16qi is a char
1263	which may be signed or unsigned, so use __v16qs. */
1264	return (__m128i) __builtin_convertvector(
1265	__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1266	}
1267
1268	/// Sign-extends each of the lower four 16-bit integer elements of a
1269	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1270	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1271	/// vector are unused.
1272	///
1273	/// \headerfile <x86intrin.h>
1274	///
1275	/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1276	///
1277	/// \param __V
1278	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1279	/// sign-extended to 32-bit values.
1280	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1281	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
1282	return (__m128i) __builtin_convertvector(
1283	__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1284	}
1285
1286	/// Sign-extends each of the lower two 16-bit integer elements of a
1287	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1288	/// a 128-bit vector of [2 x i64]. The upper six elements of the input
1289	/// vector are unused.
1290	///
1291	/// \headerfile <x86intrin.h>
1292	///
1293	/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1294	///
1295	/// \param __V
1296	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1297	/// sign-extended to 64-bit values.
1298	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1299	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
1300	return (__m128i) __builtin_convertvector(
1301	__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1302	}
1303
1304	/// Sign-extends each of the lower two 32-bit integer elements of a
1305	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1306	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1307	/// are unused.
1308	///
1309	/// \headerfile <x86intrin.h>
1310	///
1311	/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1312	///
1313	/// \param __V
1314	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1315	/// sign-extended to 64-bit values.
1316	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1317	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
1318	return (__m128i) __builtin_convertvector(
1319	__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1320	}
1321
1322	/* SSE4 Packed Integer Zero-Extension. */
1323	/// Zero-extends each of the lower eight 8-bit integer elements of a
1324	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
1325	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
1326	/// are unused.
1327	///
1328	/// \headerfile <x86intrin.h>
1329	///
1330	/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1331	///
1332	/// \param __V
1333	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1334	/// zero-extended to 16-bit values.
1335	/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1336	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
1337	return (__m128i) __builtin_convertvector(
1338	__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
1339	7),
1340	__v8hi);
1341	}
1342
1343	/// Zero-extends each of the lower four 8-bit integer elements of a
1344	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
1345	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
1346	/// vector are unused.
1347	///
1348	/// \headerfile <x86intrin.h>
1349	///
1350	/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1351	///
1352	/// \param __V
1353	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1354	/// zero-extended to 32-bit values.
1355	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1356	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
1357	return (__m128i) __builtin_convertvector(
1358	__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1359	}
1360
1361	/// Zero-extends each of the lower two 8-bit integer elements of a
1362	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1363	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1364	/// vector are unused.
1365	///
1366	/// \headerfile <x86intrin.h>
1367	///
1368	/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1369	///
1370	/// \param __V
1371	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1372	/// zero-extended to 64-bit values.
1373	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1374	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
1375	return (__m128i) __builtin_convertvector(
1376	__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1377	}
1378
1379	/// Zero-extends each of the lower four 16-bit integer elements of a
1380	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1381	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
1382	/// vector are unused.
1383	///
1384	/// \headerfile <x86intrin.h>
1385	///
1386	/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1387	///
1388	/// \param __V
1389	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1390	/// zero-extended to 32-bit values.
1391	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1392	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
1393	return (__m128i) __builtin_convertvector(
1394	__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1395	}
1396
1397	/// Zero-extends each of the lower two 16-bit integer elements of a
1398	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1399	/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1400	/// are unused.
1401	///
1402	/// \headerfile <x86intrin.h>
1403	///
1404	/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1405	///
1406	/// \param __V
1407	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1408	/// zero-extended to 64-bit values.
1409	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1410	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
1411	return (__m128i) __builtin_convertvector(
1412	__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1413	}
1414
1415	/// Zero-extends each of the lower two 32-bit integer elements of a
1416	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1417	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1418	/// are unused.
1419	///
1420	/// \headerfile <x86intrin.h>
1421	///
1422	/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1423	///
1424	/// \param __V
1425	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1426	/// zero-extended to 64-bit values.
1427	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1428	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
1429	return (__m128i) __builtin_convertvector(
1430	__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1431	}
1432
1433	/* SSE4 Pack with Unsigned Saturation. */
1434	/// Converts 32-bit signed integers from both 128-bit integer vector
1435	/// operands into 16-bit unsigned integers, and returns the packed result.
1436	/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1437	/// 0x0000 are saturated to 0x0000.
1438	///
1439	/// \headerfile <x86intrin.h>
1440	///
1441	/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1442	///
1443	/// \param __V1
1444	/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1445	/// signed integer and is converted to a 16-bit unsigned integer with
1446	/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1447	/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1448	/// are written to the lower 64 bits of the result.
1449	/// \param __V2
1450	/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1451	/// signed integer and is converted to a 16-bit unsigned integer with
1452	/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1453	/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1454	/// are written to the higher 64 bits of the result.
1455	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1456	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
1457	__m128i __V2) {
1458	return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1459	}
1460
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes eight sums of absolute differences between groups of four
///    unsigned 8-bit integers taken from the two operands. The starting
///    offsets of the groups within each operand are chosen by bit fields of
///    the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand that controls how the absolute differences
///    are calculated, according to the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  ((__m128i)__builtin_ia32_mpsadbw128( \
      (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (M)))
1500
1501	/// Finds the minimum unsigned 16-bit element in the input 128-bit
1502	/// vector of [8 x u16] and returns it along with its index.
1503	///
1504	/// \headerfile <x86intrin.h>
1505	///
1506	/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1507	/// instruction.
1508	///
1509	/// \param __V
1510	/// A 128-bit vector of [8 x u16].
1511	/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1512	/// in parameter \a __V, bits [18:16] contain the index of the minimum value,
1513	/// and the remaining bits are set to 0.
1514	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
1515	return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
1516	}
1517
1518	/* The SSE4.2 definitions are handled here. */
1519
1520	/* These definitions are normally in nmmintrin.h, but gcc puts them in here
1521	so we'll do the same. From here on __DEFAULT_FN_ATTRS targets "sse4.2". NOTE(review): unlike the SSE4.1 definition at the top of this file, it omits no-evex512 and __min_vector_width__(128); confirm this is intended. */
1522
1523	#undef __DEFAULT_FN_ATTRS
1524	#define __DEFAULT_FN_ATTRS \
1525	__attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1526
1527	/* These specify the type of data that we're comparing (immediate bits [1:0]). */
1528	#define _SIDD_UBYTE_OPS 0x00
1529	#define _SIDD_UWORD_OPS 0x01
1530	#define _SIDD_SBYTE_OPS 0x02
1531	#define _SIDD_SWORD_OPS 0x03
1532
1533	/* These specify the type of comparison operation (immediate bits [3:2]). */
1534	#define _SIDD_CMP_EQUAL_ANY 0x00
1535	#define _SIDD_CMP_RANGES 0x04
1536	#define _SIDD_CMP_EQUAL_EACH 0x08
1537	#define _SIDD_CMP_EQUAL_ORDERED 0x0c
1538
1539	/* These macros specify the polarity of the operation (immediate bits [5:4]). */
1540	#define _SIDD_POSITIVE_POLARITY 0x00
1541	#define _SIDD_NEGATIVE_POLARITY 0x10
1542	#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
1543	#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
1544
1545	/* These macros are used in _mm_cmpXstri() to specify which set bit's index is returned (immediate bit [6]). */
1546	#define _SIDD_LEAST_SIGNIFICANT 0x00
1547	#define _SIDD_MOST_SIGNIFICANT 0x40
1548
1549	/* These macros are used in _mm_cmpXstrm() to specify the format of the returned mask (immediate bit [6]). */
1550	#define _SIDD_BIT_MASK 0x00
1551	#define _SIDD_UNIT_MASK 0x40
1552
1553	/* SSE4.2 Packed Comparison Intrinsics. */
1554	/// Uses the immediate operand \a M to perform a comparison of string
1555	/// data with implicitly defined lengths that is contained in source operands
1556	/// \a A and \a B. Returns a 128-bit integer vector representing the result
1557	/// mask of the comparison.
1558	///
1559	/// \headerfile <x86intrin.h>
1560	///
1561	/// \code
1562	/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1563	/// \endcode
1564	///
1565	/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1566	/// instruction.
1567	///
1568	/// \param A
1569	/// A 128-bit integer vector containing one of the source operands to be
1570	/// compared.
1571	/// \param B
1572	/// A 128-bit integer vector containing one of the source operands to be
1573	/// compared.
1574	/// \param M
1575	/// An 8-bit immediate operand specifying whether the characters are bytes or
1576	/// words, the type of comparison to perform, and the format of the return
1577	/// value. \n
1578	/// Bits [1:0]: Determine source data format. \n
1579	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1580	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1581	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1582	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1583	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1584	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1585	/// equality with all the characters in \a A. \n
1586	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1587	/// \a A. The comparison basis is greater than or equal for even-indexed
1588	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1589	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1590	/// characters in \a A and \a B for equality. \n
1591	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1592	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1593	/// mask of the comparison results. \n
1594	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1595	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1596	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1597	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1598	/// an index less than or equal to the size of \a A or \a B. \n
1599	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1600	/// bytes. \n
1601	/// 0 (_SIDD_BIT_MASK): The result is zero-extended to 16 bytes. \n
1602	/// 1 (_SIDD_UNIT_MASK): The result is expanded to 16 bytes (this expansion is
1603	/// performed by repeating each bit 8 or 16 times).
1604	/// \returns Returns a 128-bit integer vector representing the result mask of
1605	/// the comparison.
1606	#define _mm_cmpistrm(A, B, M) \
1607	((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
1608	(__v16qi)(__m128i)(B), (int)(M)))
1609
1610	/// Uses the immediate operand \a M to perform a comparison of string
1611	/// data with implicitly defined lengths that is contained in source operands
1612	/// \a A and \a B. Returns an integer representing the result index of the
1613	/// comparison.
1614	///
1615	/// \headerfile <x86intrin.h>
1616	///
1617	/// \code
1618	/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1619	/// \endcode
1620	///
1621	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1622	/// instruction.
1623	///
1624	/// \param A
1625	/// A 128-bit integer vector containing one of the source operands to be
1626	/// compared.
1627	/// \param B
1628	/// A 128-bit integer vector containing one of the source operands to be
1629	/// compared.
1630	/// \param M
1631	/// An 8-bit immediate operand specifying whether the characters are bytes or
1632	/// words, the type of comparison to perform, and the format of the return
1633	/// value. \n
1634	/// Bits [1:0]: Determine source data format. \n
1635	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1636	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1637	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1638	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1639	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1640	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1641	/// equality with all the characters in \a A. \n
1642	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1643	/// \a A. The comparison basis is greater than or equal for even-indexed
1644	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1645	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1646	/// characters in \a A and \a B for equality. \n
1647	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1648	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1649	/// mask of the comparison results. \n
1650	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1651	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1652	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1653	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1654	/// an index less than or equal to the size of \a A or \a B. \n
1655	/// Bit [6]: Determines whether the index of the lowest set bit or the
1656	/// highest set bit is returned. \n
1657	/// 0 (_SIDD_LEAST_SIGNIFICANT): The index of the least significant set bit. \n
1658	/// 1 (_SIDD_MOST_SIGNIFICANT): The index of the most significant set bit. \n
1659	/// \returns Returns an integer representing the result index of the comparison.
1660	#define _mm_cmpistri(A, B, M) \
1661	((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
1662	(__v16qi)(__m128i)(B), (int)(M)))
1663
1664	/// Uses the immediate operand \a M to perform a comparison of string
1665	/// data with explicitly defined lengths that is contained in source operands
1666	/// \a A and \a B. Returns a 128-bit integer vector representing the result
1667	/// mask of the comparison.
1668	///
1669	/// \headerfile <x86intrin.h>
1670	///
1671	/// \code
1672	/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1673	/// \endcode
1674	///
1675	/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1676	/// instruction.
1677	///
1678	/// \param A
1679	/// A 128-bit integer vector containing one of the source operands to be
1680	/// compared.
1681	/// \param LA
1682	/// An integer that specifies the length of the string in \a A.
1683	/// \param B
1684	/// A 128-bit integer vector containing one of the source operands to be
1685	/// compared.
1686	/// \param LB
1687	/// An integer that specifies the length of the string in \a B.
1688	/// \param M
1689	/// An 8-bit immediate operand specifying whether the characters are bytes or
1690	/// words, the type of comparison to perform, and the format of the return
1691	/// value. \n
1692	/// Bits [1:0]: Determine source data format. \n
1693	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1694	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1695	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1696	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1697	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1698	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1699	/// equality with all the characters in \a A. \n
1700	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1701	/// \a A. The comparison basis is greater than or equal for even-indexed
1702	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1703	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1704	/// characters in \a A and \a B for equality. \n
1705	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1706	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1707	/// mask of the comparison results. \n
1708	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1709	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1710	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1711	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1712	/// an index less than or equal to the size of \a A or \a B. \n
1713	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
1714	/// bytes. \n
1715	/// 0 (_SIDD_BIT_MASK): The result is zero-extended to 16 bytes. \n
1716	/// 1 (_SIDD_UNIT_MASK): The result is expanded to 16 bytes (this expansion is
1717	/// performed by repeating each bit 8 or 16 times). \n
1718	/// \returns Returns a 128-bit integer vector representing the result mask of
1719	/// the comparison.
1720	#define _mm_cmpestrm(A, LA, B, LB, M) \
1721	((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
1722	(__v16qi)(__m128i)(B), (int)(LB), \
1723	(int)(M)))
1724
1725	/// Uses the immediate operand \a M to perform a comparison of string
1726	/// data with explicitly defined lengths that is contained in source operands
1727	/// \a A and \a B. Returns an integer representing the result index of the
1728	/// comparison.
1729	///
1730	/// \headerfile <x86intrin.h>
1731	///
1732	/// \code
1733	/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1734	/// \endcode
1735	///
1736	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1737	/// instruction.
1738	///
1739	/// \param A
1740	/// A 128-bit integer vector containing one of the source operands to be
1741	/// compared.
1742	/// \param LA
1743	/// An integer that specifies the length of the string in \a A.
1744	/// \param B
1745	/// A 128-bit integer vector containing one of the source operands to be
1746	/// compared.
1747	/// \param LB
1748	/// An integer that specifies the length of the string in \a B.
1749	/// \param M
1750	/// An 8-bit immediate operand specifying whether the characters are bytes or
1751	/// words, the type of comparison to perform, and the format of the return
1752	/// value. \n
1753	/// Bits [1:0]: Determine source data format. \n
1754	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1755	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1756	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1757	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1758	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1759	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1760	/// equality with all the characters in \a A. \n
1761	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1762	/// \a A. The comparison basis is greater than or equal for even-indexed
1763	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1764	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1765	/// characters in \a A and \a B for equality. \n
1766	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1767	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1768	/// mask of the comparison results. \n
1769	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1770	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1771	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1772	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1773	/// an index less than or equal to the size of \a A or \a B. \n
1774	/// Bit [6]: Determines whether the index of the lowest set bit or the
1775	/// highest set bit is returned. \n
1776	/// 0 (_SIDD_LEAST_SIGNIFICANT): The index of the least significant set bit. \n
1777	/// 1 (_SIDD_MOST_SIGNIFICANT): The index of the most significant set bit. \n
1778	/// \returns Returns an integer representing the result index of the comparison.
1779	#define _mm_cmpestri(A, LA, B, LB, M) \
1780	((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
1781	(__v16qi)(__m128i)(B), (int)(LB), \
1782	(int)(M)))
1783
1784	/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
1785	/// Uses the immediate operand \a M to perform a comparison of string
1786	/// data with implicitly defined lengths that is contained in source operands
1787	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1788	/// string in \a B is the maximum; otherwise, returns 0.
1789	///
1790	/// \headerfile <x86intrin.h>
1791	///
1792	/// \code
1793	/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1794	/// \endcode
1795	///
1796	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1797	/// instruction.
1798	///
1799	/// \param A
1800	/// A 128-bit integer vector containing one of the source operands to be
1801	/// compared.
1802	/// \param B
1803	/// A 128-bit integer vector containing one of the source operands to be
1804	/// compared.
1805	/// \param M
1806	/// An 8-bit immediate operand specifying whether the characters are bytes or
1807	/// words and the type of comparison to perform. \n
1808	/// Bits [1:0]: Determine source data format. \n
1809	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1810	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1811	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1812	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1813	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1814	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1815	/// equality with all the characters in \a A. \n
1816	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1817	/// \a A. The comparison basis is greater than or equal for even-indexed
1818	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1819	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1820	/// characters in \a A and \a B for equality. \n
1821	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1822	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1823	/// mask of the comparison results. \n
1824	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1825	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1826	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1827	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1828	/// an index less than or equal to the size of \a A or \a B. \n
1829	/// \returns Returns 1 if the bit mask is zero and the length of the string in
1830	/// \a B is the maximum; otherwise, returns 0.
1831	#define _mm_cmpistra(A, B, M) \
1832	((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
1833	(__v16qi)(__m128i)(B), (int)(M)))
1834
1835	/// Uses the immediate operand \a M to perform a comparison of string
1836	/// data with implicitly defined lengths that is contained in source operands
1837	/// \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
1838	/// 0.
1839	///
1840	/// \headerfile <x86intrin.h>
1841	///
1842	/// \code
1843	/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1844	/// \endcode
1845	///
1846	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1847	/// instruction.
1848	///
1849	/// \param A
1850	/// A 128-bit integer vector containing one of the source operands to be
1851	/// compared.
1852	/// \param B
1853	/// A 128-bit integer vector containing one of the source operands to be
1854	/// compared.
1855	/// \param M
1856	/// An 8-bit immediate operand specifying whether the characters are bytes or
1857	/// words and the type of comparison to perform. \n
1858	/// Bits [1:0]: Determine source data format. \n
1859	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1860	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1861	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1862	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1863	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1864	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1865	/// equality with all the characters in \a A. \n
1866	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1867	/// \a A. The comparison basis is greater than or equal for even-indexed
1868	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1869	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1870	/// characters in \a A and \a B for equality. \n
1871	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1872	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1873	/// mask of the comparison results. \n
1874	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1875	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1876	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1877	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1878	/// an index less than or equal to the size of \a A or \a B.
1879	/// \returns Returns 1 if the bit mask is non-zero; otherwise, returns 0.
1880	#define _mm_cmpistrc(A, B, M) \
1881	((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
1882	(__v16qi)(__m128i)(B), (int)(M)))
1883
1884	/// Uses the immediate operand \a M to perform a comparison of string
1885	/// data with implicitly defined lengths that is contained in source operands
1886	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
1887	///
1888	/// \headerfile <x86intrin.h>
1889	///
1890	/// \code
1891	/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1892	/// \endcode
1893	///
1894	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1895	/// instruction.
1896	///
1897	/// \param A
1898	/// A 128-bit integer vector containing one of the source operands to be
1899	/// compared.
1900	/// \param B
1901	/// A 128-bit integer vector containing one of the source operands to be
1902	/// compared.
1903	/// \param M
1904	/// An 8-bit immediate operand specifying whether the characters are bytes or
1905	/// words and the type of comparison to perform. \n
1906	/// Bits [1:0]: Determine source data format. \n
1907	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1908	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1909	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1910	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1911	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1912	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1913	/// equality with all the characters in \a A. \n
1914	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1915	/// \a A. The comparison basis is greater than or equal for even-indexed
1916	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1917	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1918	/// characters in \a A and \a B for equality. \n
1919	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1920	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1921	/// mask of the comparison results. \n
1922	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1923	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1924	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1925	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1926	/// an index less than or equal to the size of \a A or \a B. \n
1927	/// \returns Returns bit 0 of the resulting bit mask.
1928	#define _mm_cmpistro(A, B, M) \
1929	((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
1930	(__v16qi)(__m128i)(B), (int)(M)))
1931
1932	/// Uses the immediate operand \a M to perform a comparison of string
1933	/// data with implicitly defined lengths that is contained in source operands
1934	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
1935	/// the maximum; otherwise, returns 0.
1936	///
1937	/// \headerfile <x86intrin.h>
1938	///
1939	/// \code
1940	/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1941	/// \endcode
1942	///
1943	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1944	/// instruction.
1945	///
1946	/// \param A
1947	/// A 128-bit integer vector containing one of the source operands to be
1948	/// compared.
1949	/// \param B
1950	/// A 128-bit integer vector containing one of the source operands to be
1951	/// compared.
1952	/// \param M
1953	/// An 8-bit immediate operand specifying whether the characters are bytes or
1954	/// words and the type of comparison to perform. \n
1955	/// Bits [1:0]: Determine source data format. \n
1956	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
1957	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
1958	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
1959	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
1960	/// Bits [3:2]: Determine comparison type and aggregation method. \n
1961	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
1962	/// equality with all the characters in \a A. \n
1963	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
1964	/// \a A. The comparison basis is greater than or equal for even-indexed
1965	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
1966	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
1967	/// characters in \a A and \a B for equality. \n
1968	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
1969	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
1970	/// mask of the comparison results. \n
1971	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
1972	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
1973	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
1974	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
1975	/// an index less than or equal to the size of \a A or \a B. \n
1976	/// \returns Returns 1 if the length of the string in \a A is less than the
1977	/// maximum; otherwise, returns 0.
1978	#define _mm_cmpistrs(A, B, M) \
1979	((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
1980	(__v16qi)(__m128i)(B), (int)(M)))
1981
1982	/// Uses the immediate operand \a M to perform a comparison of string
1983	/// data with implicitly defined lengths that is contained in source operands
1984	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
1985	/// the maximum; otherwise, returns 0.
1986	///
1987	/// \headerfile <x86intrin.h>
1988	///
1989	/// \code
1990	/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
1991	/// \endcode
1992	///
1993	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1994	/// instruction.
1995	///
1996	/// \param A
1997	/// A 128-bit integer vector containing one of the source operands to be
1998	/// compared.
1999	/// \param B
2000	/// A 128-bit integer vector containing one of the source operands to be
2001	/// compared.
2002	/// \param M
2003	/// An 8-bit immediate operand specifying whether the characters are bytes or
2004	/// words and the type of comparison to perform. \n
2005	/// Bits [1:0]: Determine source data format. \n
2006	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
2007	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
2008	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
2009	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
2010	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2011	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
2012	/// equality with all the characters in \a A. \n
2013	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
2014	/// \a A. The comparison basis is greater than or equal for even-indexed
2015	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
2016	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
2017	/// characters in \a A and \a B for equality. \n
2018	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
2019	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2020	/// mask of the comparison results. \n
2021	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
2022	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
2023	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
2024	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
2025	/// an index less than or equal to the size of \a A or \a B.
2026	/// \returns Returns 1 if the length of the string in \a B is less than the
2027	/// maximum; otherwise, returns 0.
2028	#define _mm_cmpistrz(A, B, M) \
2029	((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
2030	(__v16qi)(__m128i)(B), (int)(M)))
2031
2032	/// Uses the immediate operand \a M to perform a comparison of string
2033	/// data with explicitly defined lengths that is contained in source operands
2034	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2035	/// string in \a B is the maximum; otherwise, returns 0.
2036	///
2037	/// \headerfile <x86intrin.h>
2038	///
2039	/// \code
2040	/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2041	/// \endcode
2042	///
2043	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2044	/// instruction.
2045	///
2046	/// \param A
2047	/// A 128-bit integer vector containing one of the source operands to be
2048	/// compared.
2049	/// \param LA
2050	/// An integer that specifies the length of the string in \a A.
2051	/// \param B
2052	/// A 128-bit integer vector containing one of the source operands to be
2053	/// compared.
2054	/// \param LB
2055	/// An integer that specifies the length of the string in \a B.
2056	/// \param M
2057	/// An 8-bit immediate operand specifying whether the characters are bytes or
2058	/// words and the type of comparison to perform. \n
2059	/// Bits [1:0]: Determine source data format. \n
2060	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
2061	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
2062	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
2063	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
2064	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2065	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
2066	/// equality with all the characters in \a A. \n
2067	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
2068	/// \a A. The comparison basis is greater than or equal for even-indexed
2069	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
2070	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
2071	/// characters in \a A and \a B for equality. \n
2072	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
2073	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2074	/// mask of the comparison results. \n
2075	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
2076	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
2077	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
2078	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
2079	/// an index less than or equal to the size of \a A or \a B.
2080	/// \returns Returns 1 if the bit mask is zero and the length of the string in
2081	/// \a B is the maximum; otherwise, returns 0.
2082	#define _mm_cmpestra(A, LA, B, LB, M) \
2083	((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
2084	(__v16qi)(__m128i)(B), (int)(LB), \
2085	(int)(M)))
2086
2087	/// Uses the immediate operand \a M to perform a comparison of string
2088	/// data with explicitly defined lengths that is contained in source operands
2089	/// \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
2090	/// returns 0.
2091	///
2092	/// \headerfile <x86intrin.h>
2093	///
2094	/// \code
2095	/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2096	/// \endcode
2097	///
2098	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2099	/// instruction.
2100	///
2101	/// \param A
2102	/// A 128-bit integer vector containing one of the source operands to be
2103	/// compared.
2104	/// \param LA
2105	/// An integer that specifies the length of the string in \a A.
2106	/// \param B
2107	/// A 128-bit integer vector containing one of the source operands to be
2108	/// compared.
2109	/// \param LB
2110	/// An integer that specifies the length of the string in \a B.
2111	/// \param M
2112	/// An 8-bit immediate operand specifying whether the characters are bytes or
2113	/// words and the type of comparison to perform. \n
2114	/// Bits [1:0]: Determine source data format. \n
2115	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
2116	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
2117	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
2118	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
2119	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2120	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
2121	/// equality with all the characters in \a A. \n
2122	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
2123	/// \a A. The comparison basis is greater than or equal for even-indexed
2124	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
2125	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
2126	/// characters in \a A and \a B for equality. \n
2127	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
2128	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2129	/// mask of the comparison results. \n
2130	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
2131	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
2132	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
2133	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
2134	/// an index less than or equal to the size of \a A or \a B. \n
2135	/// \returns Returns 1 if the resulting mask is non-zero; otherwise, returns 0.
2136	#define _mm_cmpestrc(A, LA, B, LB, M) \
2137	((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
2138	(__v16qi)(__m128i)(B), (int)(LB), \
2139	(int)(M)))
2140
2141	/// Uses the immediate operand \a M to perform a comparison of string
2142	/// data with explicitly defined lengths that is contained in source operands
2143	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
2144	///
2145	/// \headerfile <x86intrin.h>
2146	///
2147	/// \code
2148	/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2149	/// \endcode
2150	///
2151	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2152	/// instruction.
2153	///
2154	/// \param A
2155	/// A 128-bit integer vector containing one of the source operands to be
2156	/// compared.
2157	/// \param LA
2158	/// An integer that specifies the length of the string in \a A.
2159	/// \param B
2160	/// A 128-bit integer vector containing one of the source operands to be
2161	/// compared.
2162	/// \param LB
2163	/// An integer that specifies the length of the string in \a B.
2164	/// \param M
2165	/// An 8-bit immediate operand specifying whether the characters are bytes or
2166	/// words and the type of comparison to perform. \n
2167	/// Bits [1:0]: Determine source data format. \n
2168	/// 00: 16 unsigned bytes (_SIDD_UBYTE_OPS) \n
2169	/// 01: 8 unsigned words (_SIDD_UWORD_OPS) \n
2170	/// 10: 16 signed bytes (_SIDD_SBYTE_OPS) \n
2171	/// 11: 8 signed words (_SIDD_SWORD_OPS) \n
2172	/// Bits [3:2]: Determine comparison type and aggregation method. \n
2173	/// 00: Subset (_SIDD_CMP_EQUAL_ANY): Each character in \a B is compared for
2174	/// equality with all the characters in \a A. \n
2175	/// 01: Ranges (_SIDD_CMP_RANGES): Each character in \a B is compared to
2176	/// \a A. The comparison basis is greater than or equal for even-indexed
2177	/// elements in \a A, and less than or equal for odd-indexed elements in \a A. \n
2178	/// 10: Match (_SIDD_CMP_EQUAL_EACH): Compare each pair of corresponding
2179	/// characters in \a A and \a B for equality. \n
2180	/// 11: Substring (_SIDD_CMP_EQUAL_ORDERED): Search \a B for substring matches of \a A. \n
2181	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
2182	/// mask of the comparison results. \n
2183	/// 00: No effect (_SIDD_POSITIVE_POLARITY). \n
2184	/// 01: Negate the bit mask (_SIDD_NEGATIVE_POLARITY). \n
2185	/// 10: No effect (_SIDD_MASKED_POSITIVE_POLARITY). \n
2186	/// 11 (_SIDD_MASKED_NEGATIVE_POLARITY): Negate the bit mask only for bits with
2187	/// an index less than or equal to the size of \a A or \a B.
2188	/// \returns Returns bit 0 of the resulting bit mask.
2189	#define _mm_cmpestro(A, LA, B, LB, M) \
2190	((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
2191	(__v16qi)(__m128i)(B), (int)(LB), \
2192	(int)(M)))
2193
/// Performs a comparison of string data with explicitly defined lengths,
///    held in source operands \a A and \a B, as directed by the immediate
///    operand \a M. Returns 1 if the length of the string in \a A is less
///    than the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to be
///    compared.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to be
///    compared.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are
///    bytes or words and which type of comparison is performed. \n
///    Bits [1:0]: Select the source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Select the comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Each pair of corresponding characters in \a A and \a B is
///          compared for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Select whether a one's complement is applied to the bit
///          mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA),                                        \
      (__v16qi)(__m128i)(B), (int)(LB),                                        \
      (int)(M)))
2248
/// Performs a comparison of string data with explicitly defined lengths,
///    held in source operands \a A and \a B, as directed by the immediate
///    operand \a M. Returns 1 if the length of the string in \a B is less
///    than the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to be
///    compared.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to be
///    compared.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects whether the characters are
///    bytes or words and which type of comparison is performed. \n
///    Bits [1:0]: Select the source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Select the comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Each pair of corresponding characters in \a A and \a B is
///          compared for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Select whether a one's complement is applied to the bit
///          mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestriz128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA),                                        \
      (__v16qi)(__m128i)(B), (int)(LB),                                        \
      (int)(M)))
2302
2303	/* SSE4.2 Compare Packed Data -- Greater Than. */
2304	/// Compares each of the corresponding 64-bit values of the 128-bit
2305	/// integer vectors to determine if the values in the first operand are
2306	/// greater than those in the second operand.
2307	///
2308	/// \headerfile <x86intrin.h>
2309	///
2310	/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2311	///
2312	/// \param __V1
2313	/// A 128-bit integer vector.
2314	/// \param __V2
2315	/// A 128-bit integer vector.
2316	/// \returns A 128-bit integer vector containing the comparison results.
2317	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
2318	__m128i __V2) {
2319	return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2320	}
2321
2322	#undef __DEFAULT_FN_ATTRS
2323
2324	#include <popcntintrin.h>
2325
2326	#include <crc32intrin.h>
2327
2328	#endif /* __SMMINTRIN_H */
2329

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/smmintrin.h