avx512vlvnniintrin.h source code [clang/lib/Headers/avx512vlvnniintrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
2	*
3	*
4	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5	* See https://llvm.org/LICENSE.txt for license information.
6	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7	*
8	*===-----------------------------------------------------------------------===
9	*/
10	#ifndef __IMMINTRIN_H
11	#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
12	#endif
13
14	#ifndef __AVX512VLVNNIINTRIN_H
15	#define __AVX512VLVNNIINTRIN_H
16
/* Default attribute sets applied to every inline function defined below.
 * Both force inlining, suppress debug info, and enable the AVX512VL and
 * AVX512VNNI target features (with EVEX512 disabled); they differ only in the
 * minimum vector width they advertise: 128 bits or 256 bits. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
                 __min_vector_width__(256)))
26
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    A 256-bit vector holding the eight 32-bit accumulators.
/// \param A
///    A 256-bit vector whose bytes are treated as unsigned 8-bit integers.
/// \param B
///    A 256-bit vector whose bytes are treated as signed 8-bit integers.
/// \returns A 256-bit integer vector (DST above).
#define _mm256_dpbusd_epi32(S, A, B)                                           \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
46
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    A 256-bit vector holding the eight 32-bit accumulators.
/// \param A
///    A 256-bit vector whose bytes are treated as unsigned 8-bit integers.
/// \param B
///    A 256-bit vector whose bytes are treated as signed 8-bit integers.
/// \returns A 256-bit integer vector (DST above).
#define _mm256_dpbusds_epi32(S, A, B)                                          \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
66
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    A 256-bit vector holding the eight 32-bit accumulators.
/// \param A
///    A 256-bit vector whose words are treated as signed 16-bit integers.
/// \param B
///    A 256-bit vector whose words are treated as signed 16-bit integers.
/// \returns A 256-bit integer vector (DST above).
#define _mm256_dpwssd_epi32(S, A, B)                                           \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
84
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    A 256-bit vector holding the eight 32-bit accumulators.
/// \param A
///    A 256-bit vector whose words are treated as signed 16-bit integers.
/// \param B
///    A 256-bit vector whose words are treated as signed 16-bit integers.
/// \returns A 256-bit integer vector (DST above).
#define _mm256_dpwssds_epi32(S, A, B)                                          \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
102
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    A 128-bit vector holding the four 32-bit accumulators.
/// \param A
///    A 128-bit vector whose bytes are treated as unsigned 8-bit integers.
/// \param B
///    A 128-bit vector whose bytes are treated as signed 8-bit integers.
/// \returns A 128-bit integer vector (DST above).
#define _mm_dpbusd_epi32(S, A, B)                                              \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
122
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    A 128-bit vector holding the four 32-bit accumulators.
/// \param A
///    A 128-bit vector whose bytes are treated as unsigned 8-bit integers.
/// \param B
///    A 128-bit vector whose bytes are treated as signed 8-bit integers.
/// \returns A 128-bit integer vector (DST above).
#define _mm_dpbusds_epi32(S, A, B)                                             \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
142
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    A 128-bit vector holding the four 32-bit accumulators.
/// \param A
///    A 128-bit vector whose words are treated as signed 16-bit integers.
/// \param B
///    A 128-bit vector whose words are treated as signed 16-bit integers.
/// \returns A 128-bit integer vector (DST above).
#define _mm_dpwssd_epi32(S, A, B)                                              \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
160
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding signed 16-bit integers in \a B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding 32-bit
/// integer in \a S using signed saturation, and store the packed 32-bit
/// results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    A 128-bit vector holding the four 32-bit accumulators.
/// \param A
///    A 128-bit vector whose words are treated as signed 16-bit integers.
/// \param B
///    A 128-bit vector whose words are treated as signed 16-bit integers.
/// \returns A 128-bit integer vector (DST above).
#define _mm_dpwssds_epi32(S, A, B)                                             \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
178
179	static __inline__ __m256i __DEFAULT_FN_ATTRS256
180	_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
181	{
182	return (__m256i)__builtin_ia32_selectd_256(__U,
183	(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
184	(__v8si)__S);
185	}
186
187	static __inline__ __m256i __DEFAULT_FN_ATTRS256
188	_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
189	{
190	return (__m256i)__builtin_ia32_selectd_256(__U,
191	(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
192	(__v8si)_mm256_setzero_si256());
193	}
194
195	static __inline__ __m256i __DEFAULT_FN_ATTRS256
196	_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
197	{
198	return (__m256i)__builtin_ia32_selectd_256(__U,
199	(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
200	(__v8si)__S);
201	}
202
203	static __inline__ __m256i __DEFAULT_FN_ATTRS256
204	_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
205	{
206	return (__m256i)__builtin_ia32_selectd_256(__U,
207	(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
208	(__v8si)_mm256_setzero_si256());
209	}
210
211	static __inline__ __m256i __DEFAULT_FN_ATTRS256
212	_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
213	{
214	return (__m256i)__builtin_ia32_selectd_256(__U,
215	(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
216	(__v8si)__S);
217	}
218
219	static __inline__ __m256i __DEFAULT_FN_ATTRS256
220	_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
221	{
222	return (__m256i)__builtin_ia32_selectd_256(__U,
223	(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
224	(__v8si)_mm256_setzero_si256());
225	}
226
227	static __inline__ __m256i __DEFAULT_FN_ATTRS256
228	_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
229	{
230	return (__m256i)__builtin_ia32_selectd_256(__U,
231	(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
232	(__v8si)__S);
233	}
234
235	static __inline__ __m256i __DEFAULT_FN_ATTRS256
236	_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
237	{
238	return (__m256i)__builtin_ia32_selectd_256(__U,
239	(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
240	(__v8si)_mm256_setzero_si256());
241	}
242
243	static __inline__ __m128i __DEFAULT_FN_ATTRS128
244	_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
245	{
246	return (__m128i)__builtin_ia32_selectd_128(__U,
247	(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
248	(__v4si)__S);
249	}
250
251	static __inline__ __m128i __DEFAULT_FN_ATTRS128
252	_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
253	{
254	return (__m128i)__builtin_ia32_selectd_128(__U,
255	(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
256	(__v4si)_mm_setzero_si128());
257	}
258
259	static __inline__ __m128i __DEFAULT_FN_ATTRS128
260	_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
261	{
262	return (__m128i)__builtin_ia32_selectd_128(__U,
263	(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
264	(__v4si)__S);
265	}
266
267	static __inline__ __m128i __DEFAULT_FN_ATTRS128
268	_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
269	{
270	return (__m128i)__builtin_ia32_selectd_128(__U,
271	(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
272	(__v4si)_mm_setzero_si128());
273	}
274
275	static __inline__ __m128i __DEFAULT_FN_ATTRS128
276	_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
277	{
278	return (__m128i)__builtin_ia32_selectd_128(__U,
279	(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
280	(__v4si)__S);
281	}
282
283	static __inline__ __m128i __DEFAULT_FN_ATTRS128
284	_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
285	{
286	return (__m128i)__builtin_ia32_selectd_128(__U,
287	(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
288	(__v4si)_mm_setzero_si128());
289	}
290
291	static __inline__ __m128i __DEFAULT_FN_ATTRS128
292	_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
293	{
294	return (__m128i)__builtin_ia32_selectd_128(__U,
295	(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
296	(__v4si)__S);
297	}
298
299	static __inline__ __m128i __DEFAULT_FN_ATTRS128
300	_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
301	{
302	return (__m128i)__builtin_ia32_selectd_128(__U,
303	(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
304	(__v4si)_mm_setzero_si128());
305	}
306
/* Remove the file-local attribute macros so they do not remain defined after
 * this header. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
309
310	#endif
311

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avx512vlvnniintrin.h