avxvnniint8intrin.h source code [clang/lib/Headers/avxvnniint8intrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9	#ifndef __IMMINTRIN_H
10	#error \
11	"Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12	#endif
13
14	#ifndef __AVXVNNIINT8INTRIN_H
15	#define __AVXVNNIINT8INTRIN_H
16
/* Attribute sets applied to every intrinsic defined in this file, one for the
 * functions working on 128-bit vectors and one for those working on 256-bit
 * vectors. Both request forced inlining, the nodebug attribute and the
 * "avxvnniint8" target feature; they differ only in the minimum vector width
 * they declare. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(256)))
24
25	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27	/// signed 16-bit results. Sum these 4 results with the corresponding
28	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29	///
30	/// \headerfile <x86intrin.h>
31	///
32	/// \code
33	/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34	/// \endcode
35	///
36	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
37	///
38	/// \param __A
39	/// A 128-bit vector of [16 x char].
40	/// \param __B
41	/// A 128-bit vector of [16 x char].
42	/// \returns
43	/// A 128-bit vector of [4 x int].
44	///
45	/// \code{.operation}
46	/// FOR j := 0 to 3
47	/// tmp1.word := SignExtend16(__A.byte[4j]) SignExtend16(__B.byte[4*j])
48	/// tmp2.word := SignExtend16(__A.byte[4j+1]) SignExtend16(__B.byte[4*j+1])
49	/// tmp3.word := SignExtend16(__A.byte[4j+2]) SignExtend16(__B.byte[4*j+2])
50	/// tmp4.word := SignExtend16(__A.byte[4j+3]) SignExtend16(__B.byte[4*j+3])
51	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52	/// ENDFOR
53	/// dst[MAX:128] := 0
54	/// \endcode
55	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
56	__m128i __A,
57	__m128i __B) {
58	return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
59	(__v4si)__B);
60	}
61
62	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64	/// signed 16-bit results. Sum these 4 results with the corresponding
65	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66	///
67	/// \headerfile <x86intrin.h>
68	///
69	/// \code
70	/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71	/// \endcode
72	///
73	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
74	///
75	/// \param __A
76	/// A 256-bit vector of [32 x char].
77	/// \param __B
78	/// A 256-bit vector of [32 x char].
79	/// \returns
80	/// A 256-bit vector of [8 x int].
81	///
82	/// \code{.operation}
83	/// FOR j := 0 to 7
84	/// tmp1.word := SignExtend16(__A.byte[4j]) SignExtend16(__B.byte[4*j])
85	/// tmp2.word := SignExtend16(__A.byte[4j+1]) SignExtend16(__B.byte[4*j+1])
86	/// tmp3.word := SignExtend16(__A.byte[4j+2]) SignExtend16(__B.byte[4*j+2])
87	/// tmp4.word := SignExtend16(__A.byte[4j+3]) SignExtend16(__B.byte[4*j+3])
88	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89	/// ENDFOR
90	/// dst[MAX:256] := 0
91	/// \endcode
92	static __inline__ __m256i __DEFAULT_FN_ATTRS256
93	_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94	return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95	(__v8si)__B);
96	}
97
98	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100	/// signed 16-bit results. Sum these 4 results with the corresponding
101	/// 32-bit integer in \a __W with signed saturation, and store the packed
102	/// 32-bit results in \a dst.
103	///
104	/// \headerfile <x86intrin.h>
105	///
106	/// \code
107	/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
108	/// \endcode
109	///
110	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
111	///
112	/// \param __A
113	/// A 128-bit vector of [16 x char].
114	/// \param __B
115	/// A 128-bit vector of [16 x char].
116	/// \returns
117	/// A 128-bit vector of [4 x int].
118	///
119	/// \code{.operation}
120	/// FOR j := 0 to 3
121	/// tmp1.word := SignExtend16(__A.byte[4j]) SignExtend16(__B.byte[4*j])
122	/// tmp2.word := SignExtend16(__A.byte[4j+1]) SignExtend16(__B.byte[4*j+1])
123	/// tmp3.word := SignExtend16(__A.byte[4j+2]) SignExtend16(__B.byte[4*j+2])
124	/// tmp4.word := SignExtend16(__A.byte[4j+3]) SignExtend16(__B.byte[4*j+3])
125	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126	/// ENDFOR
127	/// dst[MAX:128] := 0
128	/// \endcode
129	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
130	__m128i __A,
131	__m128i __B) {
132	return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
133	(__v4si)__B);
134	}
135
136	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138	/// signed 16-bit results. Sum these 4 results with the corresponding
139	/// 32-bit integer in \a __W with signed saturation, and store the packed
140	/// 32-bit results in \a dst.
141	///
142	/// \headerfile <x86intrin.h>
143	///
144	/// \code
145	/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146	/// \endcode
147	///
148	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
149	///
150	/// \param __A
151	/// A 256-bit vector of [32 x char].
152	/// \param __B
153	/// A 256-bit vector of [32 x char].
154	/// \returns
155	/// A 256-bit vector of [8 x int].
156	///
157	/// \code{.operation}
158	/// FOR j := 0 to 7
159	/// tmp1.word := SignExtend16(__A.byte[4j]) SignExtend16(__B.byte[4*j])
160	/// tmp2.word := SignExtend16(__A.byte[4j+1]) SignExtend16(__B.byte[4*j+1])
161	/// tmp3.word := SignExtend16(__A.byte[4j+2]) SignExtend16(__B.byte[4*j+2])
162	/// tmp4.word := SignExtend16(__A.byte[4j+3]) SignExtend16(__B.byte[4*j+3])
163	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164	/// ENDFOR
165	/// dst[MAX:256] := 0
166	/// \endcode
167	static __inline__ __m256i __DEFAULT_FN_ATTRS256
168	_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169	return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170	(__v8si)__B);
171	}
172
173	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175	/// signed 16-bit results. Sum these 4 results with the corresponding
176	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177	///
178	/// \headerfile <x86intrin.h>
179	///
180	/// \code
181	/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182	/// \endcode
183	///
184	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
185	///
186	/// \param __A
187	/// A 128-bit vector of [16 x char].
188	/// \param __B
189	/// A 128-bit vector of [16 x unsigned char].
190	/// \returns
191	/// A 128-bit vector of [4 x int].
192	///
193	/// \code{.operation}
194	/// FOR j := 0 to 3
195	/// tmp1.word := Signed(SignExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j]))
196	/// tmp2.word := Signed(SignExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1]))
197	/// tmp3.word := Signed(SignExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2]))
198	/// tmp4.word := Signed(SignExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3]))
199	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200	/// ENDFOR
201	/// dst[MAX:128] := 0
202	/// \endcode
203	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
204	__m128i __A,
205	__m128i __B) {
206	return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
207	(__v4si)__B);
208	}
209
210	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212	/// signed 16-bit results. Sum these 4 results with the corresponding
213	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214	///
215	/// \headerfile <x86intrin.h>
216	///
217	/// \code
218	/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219	/// \endcode
220	///
221	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
222	///
223	/// \param __A
224	/// A 256-bit vector of [32 x char].
225	/// \param __B
226	/// A 256-bit vector of [32 x unsigned char].
227	/// \returns
228	/// A 256-bit vector of [8 x int].
229	///
230	/// \code{.operation}
231	/// FOR j := 0 to 7
232	/// tmp1.word := Signed(SignExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j]))
233	/// tmp2.word := Signed(SignExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1]))
234	/// tmp3.word := Signed(SignExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2]))
235	/// tmp4.word := Signed(SignExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3]))
236	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237	/// ENDFOR
238	/// dst[MAX:256] := 0
239	/// \endcode
240	static __inline__ __m256i __DEFAULT_FN_ATTRS256
241	_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242	return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243	(__v8si)__B);
244	}
245
246	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248	/// signed 16-bit results. Sum these 4 results with the corresponding
249	/// 32-bit integer in \a __W with signed saturation, and store the packed
250	/// 32-bit results in \a dst.
251	///
252	/// \headerfile <x86intrin.h>
253	///
254	/// \code
255	/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
256	/// \endcode
257	///
258	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
259	///
260	/// \param __A
261	/// A 128-bit vector of [16 x char].
262	/// \param __B
263	/// A 128-bit vector of [16 x unsigned char].
264	/// \returns
265	/// A 128-bit vector of [4 x int].
266	///
267	/// \code{.operation}
268	/// FOR j := 0 to 3
269	/// tmp1.word := Signed(SignExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j]))
270	/// tmp2.word := Signed(SignExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1]))
271	/// tmp3.word := Signed(SignExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2]))
272	/// tmp4.word := Signed(SignExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3]))
273	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274	/// ENDFOR
275	/// dst[MAX:128] := 0
276	/// \endcode
277	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
278	__m128i __A,
279	__m128i __B) {
280	return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
281	(__v4si)__B);
282	}
283
284	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286	/// signed 16-bit results. Sum these 4 results with the corresponding
287	/// 32-bit integer in \a __W with signed saturation, and store the packed
288	/// 32-bit results in \a dst.
289	///
290	/// \headerfile <x86intrin.h>
291	///
292	/// \code
293	/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294	/// \endcode
295	///
296	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
297	///
298	/// \param __A
299	/// A 256-bit vector of [32 x char].
300	/// \param __B
301	/// A 256-bit vector of [32 x unsigned char].
302	/// \returns
303	/// A 256-bit vector of [8 x int].
304	///
305	/// \code{.operation}
306	/// FOR j := 0 to 7
307	/// tmp1.word := Signed(SignExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j]))
308	/// tmp2.word := Signed(SignExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1]))
309	/// tmp3.word := Signed(SignExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2]))
310	/// tmp4.word := Signed(SignExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3]))
311	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312	/// ENDFOR
313	/// dst[MAX:256] := 0
314	/// \endcode
315	static __inline__ __m256i __DEFAULT_FN_ATTRS256
316	_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317	return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318	(__v8si)__B);
319	}
320
321	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323	/// signed 16-bit results. Sum these 4 results with the corresponding
324	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325	///
326	/// \headerfile <x86intrin.h>
327	///
328	/// \code
329	/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330	/// \endcode
331	///
332	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
333	///
334	/// \param __A
335	/// A 128-bit vector of [16 x unsigned char].
336	/// \param __B
337	/// A 128-bit vector of [16 x unsigned char].
338	/// \returns
339	/// A 128-bit vector of [4 x int].
340	///
341	/// \code{.operation}
342	/// FOR j := 0 to 3
343	/// tmp1.word := ZeroExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j])
344	/// tmp2.word := ZeroExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1])
345	/// tmp3.word := ZeroExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2])
346	/// tmp4.word := ZeroExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3])
347	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348	/// ENDFOR
349	/// dst[MAX:128] := 0
350	/// \endcode
351	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
352	__m128i __A,
353	__m128i __B) {
354	return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
355	(__v4si)__B);
356	}
357
358	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360	/// signed 16-bit results. Sum these 4 results with the corresponding
361	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362	///
363	/// \headerfile <x86intrin.h>
364	///
365	/// \code
366	/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367	/// \endcode
368	///
369	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
370	///
371	/// \param __A
372	/// A 256-bit vector of [32 x unsigned char].
373	/// \param __B
374	/// A 256-bit vector of [32 x unsigned char].
375	/// \returns
376	/// A 256-bit vector of [8 x int].
377	///
378	/// \code{.operation}
379	/// FOR j := 0 to 7
380	/// tmp1.word := ZeroExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j])
381	/// tmp2.word := ZeroExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1])
382	/// tmp3.word := ZeroExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2])
383	/// tmp4.word := ZeroExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3])
384	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385	/// ENDFOR
386	/// dst[MAX:256] := 0
387	/// \endcode
388	static __inline__ __m256i __DEFAULT_FN_ATTRS256
389	_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390	return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391	(__v8si)__B);
392	}
393
394	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396	/// signed 16-bit results. Sum these 4 results with the corresponding
397	/// 32-bit integer in \a __W with signed saturation, and store the packed
398	/// 32-bit results in \a dst.
399	///
400	/// \headerfile <x86intrin.h>
401	///
402	/// \code
403	/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
404	/// \endcode
405	///
406	/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407	///
408	/// \param __A
409	/// A 128-bit vector of [16 x unsigned char].
410	/// \param __B
411	/// A 128-bit vector of [16 x unsigned char].
412	/// \returns
413	/// A 128-bit vector of [4 x int].
414	///
415	/// \code{.operation}
416	/// FOR j := 0 to 3
417	/// tmp1.word := ZeroExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j])
418	/// tmp2.word := ZeroExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1])
419	/// tmp3.word := ZeroExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2])
420	/// tmp4.word := ZeroExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3])
421	/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422	/// ENDFOR
423	/// dst[MAX:128] := 0
424	/// \endcode
425	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
426	__m128i __A,
427	__m128i __B) {
428	return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
429	(__v4si)__B);
430	}
431
432	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
433	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434	/// signed 16-bit results. Sum these 4 results with the corresponding
435	/// 32-bit integer in \a __W with signed saturation, and store the packed
436	/// 32-bit results in \a dst.
437	///
438	/// \headerfile <x86intrin.h>
439	///
440	/// \code
441	/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442	/// \endcode
443	///
444	/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445	///
446	/// \param __A
447	/// A 256-bit vector of [32 x unsigned char].
448	/// \param __B
449	/// A 256-bit vector of [32 x unsigned char].
450	/// \returns
451	/// A 256-bit vector of [8 x int].
452	///
453	/// \code{.operation}
454	/// FOR j := 0 to 7
455	/// tmp1.word := ZeroExtend16(__A.byte[4j]) ZeroExtend16(__B.byte[4*j])
456	/// tmp2.word := ZeroExtend16(__A.byte[4j+1]) ZeroExtend16(__B.byte[4*j+1])
457	/// tmp3.word := ZeroExtend16(__A.byte[4j+2]) ZeroExtend16(__B.byte[4*j+2])
458	/// tmp4.word := ZeroExtend16(__A.byte[4j+3]) ZeroExtend16(__B.byte[4*j+3])
459	/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460	/// ENDFOR
461	/// dst[MAX:256] := 0
462	/// \endcode
463	static __inline__ __m256i __DEFAULT_FN_ATTRS256
464	_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465	return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466	(__v8si)__B);
467	}
468	#undef __DEFAULT_FN_ATTRS128
469	#undef __DEFAULT_FN_ATTRS256
470
471	#endif // __AVXVNNIINT8INTRIN_H
472

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avxvnniint8intrin.h