avxvnniint16intrin.h source code [clang/lib/Headers/avxvnniint16intrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9
10	#ifndef __IMMINTRIN_H
11	#error \
12	"Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13	#endif // __IMMINTRIN_H
14
15	#ifndef __AVXVNNIINT16INTRIN_H
16	#define __AVXVNNIINT16INTRIN_H
17
/* Attribute sets applied to every intrinsic defined in this header. The two
 * macros differ only in the __min_vector_width__ value (128 vs. 256); both
 * request always-inline, nodebug and the "avxvnniint16" target feature. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxvnniint16"), __min_vector_width__(256)))
25
26	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28	/// signed 16-bit results. Sum these 2 results with the corresponding
29	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
30	///
31	/// \headerfile <immintrin.h>
32	///
33	/// \code
34	/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35	/// \endcode
36	///
37	/// This intrinsic corresponds to the \c VPDPWSUD instruction.
38	///
39	/// \param __W
40	/// A 128-bit vector of [4 x int].
41	/// \param __A
42	/// A 128-bit vector of [8 x short].
43	/// \param __B
44	/// A 128-bit vector of [8 x unsigned short].
45	/// \returns
46	/// A 128-bit vector of [4 x int].
47	///
48	/// \code{.operation}
49	/// FOR j := 0 to 3
50	/// tmp1.dword := SignExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
51	/// tmp2.dword := SignExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
52	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53	/// ENDFOR
54	/// dst[MAX:128] := 0
55	/// \endcode
56	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57	__m128i __A,
58	__m128i __B) {
59	return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60	(__v4si)__B);
61	}
62
63	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65	/// signed 16-bit results. Sum these 2 results with the corresponding
66	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
67	///
68	/// \headerfile <immintrin.h>
69	///
70	/// \code
71	/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72	/// \endcode
73	///
74	/// This intrinsic corresponds to the \c VPDPWSUD instruction.
75	///
76	/// \param __W
77	/// A 256-bit vector of [8 x int].
78	/// \param __A
79	/// A 256-bit vector of [16 x short].
80	/// \param __B
81	/// A 256-bit vector of [16 x unsigned short].
82	/// \returns
83	/// A 256-bit vector of [8 x int].
84	///
85	/// \code{.operation}
86	/// FOR j := 0 to 7
87	/// tmp1.dword := SignExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
88	/// tmp2.dword := SignExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
89	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90	/// ENDFOR
91	/// dst[MAX:256] := 0
92	/// \endcode
93	static __inline__ __m256i __DEFAULT_FN_ATTRS256
94	_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95	return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96	(__v8si)__B);
97	}
98
99	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101	/// signed 16-bit results. Sum these 2 results with the corresponding
102	/// 32-bit integer in \a __W with signed saturation, and store the packed
103	/// 32-bit results in \a dst.
104	///
105	/// \headerfile <immintrin.h>
106	///
107	/// \code
108	/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109	/// \endcode
110	///
111	/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112	///
113	/// \param __W
114	/// A 128-bit vector of [4 x int].
115	/// \param __A
116	/// A 128-bit vector of [8 x short].
117	/// \param __B
118	/// A 128-bit vector of [8 x unsigned short].
119	/// \returns
120	/// A 128-bit vector of [4 x int].
121	///
122	/// \code{.operation}
123	/// FOR j := 0 to 3
124	/// tmp1.dword := SignExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
125	/// tmp2.dword := SignExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
126	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127	/// ENDFOR
128	/// dst[MAX:128] := 0
129	/// \endcode
130	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131	__m128i __A,
132	__m128i __B) {
133	return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134	(__v4si)__B);
135	}
136
137	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139	/// signed 16-bit results. Sum these 2 results with the corresponding
140	/// 32-bit integer in \a __W with signed saturation, and store the packed
141	/// 32-bit results in \a dst.
142	///
143	/// \headerfile <immintrin.h>
144	///
145	/// \code
146	/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147	/// \endcode
148	///
149	/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150	///
151	/// \param __W
152	/// A 256-bit vector of [8 x int].
153	/// \param __A
154	/// A 256-bit vector of [16 x short].
155	/// \param __B
156	/// A 256-bit vector of [16 x unsigned short].
157	/// \returns
158	/// A 256-bit vector of [8 x int].
159	///
160	/// \code{.operation}
161	/// FOR j := 0 to 7
162	/// tmp1.dword := SignExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
163	/// tmp2.dword := SignExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
164	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165	/// ENDFOR
166	/// dst[MAX:256] := 0
167	/// \endcode
168	static __inline__ __m256i __DEFAULT_FN_ATTRS256
169	_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170	return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171	(__v8si)__B);
172	}
173
174	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176	/// signed 16-bit results. Sum these 2 results with the corresponding
177	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
178	///
179	/// \headerfile <immintrin.h>
180	///
181	/// \code
182	/// __m128i _mm_dpbusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183	/// \endcode
184	///
185	/// This intrinsic corresponds to the \c VPDPWUSD instruction.
186	///
187	/// \param __W
188	/// A 128-bit vector of [4 x int].
189	/// \param __A
190	/// A 128-bit vector of [8 x unsigned short].
191	/// \param __B
192	/// A 128-bit vector of [8 x short].
193	/// \returns
194	/// A 128-bit vector of [4 x int].
195	///
196	/// \code{.operation}
197	/// FOR j := 0 to 3
198	/// tmp1.dword := ZeroExtend32(__A.word[2j]) SignExtend32(__B.word[2*j])
199	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) SignExtend32(__B.word[2*j+1])
200	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201	/// ENDFOR
202	/// dst[MAX:128] := 0
203	/// \endcode
204	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205	__m128i __A,
206	__m128i __B) {
207	return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208	(__v4si)__B);
209	}
210
211	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213	/// signed 16-bit results. Sum these 2 results with the corresponding
214	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
215	///
216	/// \headerfile <immintrin.h>
217	///
218	/// \code
219	/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220	/// \endcode
221	///
222	/// This intrinsic corresponds to the \c VPDPWUSD instruction.
223	///
224	/// \param __W
225	/// A 256-bit vector of [8 x int].
226	/// \param __A
227	/// A 256-bit vector of [16 x unsigned short].
228	/// \param __B
229	/// A 256-bit vector of [16 x short].
230	/// \returns
231	/// A 256-bit vector of [8 x int].
232	///
233	/// \code{.operation}
234	/// FOR j := 0 to 7
235	/// tmp1.dword := ZeroExtend32(__A.word[2j]) SignExtend32(__B.word[2*j])
236	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) SignExtend32(__B.word[2*j+1])
237	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238	/// ENDFOR
239	/// dst[MAX:256] := 0
240	/// \endcode
241	static __inline__ __m256i __DEFAULT_FN_ATTRS256
242	_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243	return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244	(__v8si)__B);
245	}
246
247	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249	/// signed 16-bit results. Sum these 2 results with the corresponding
250	/// 32-bit integer in \a __W with signed saturation, and store the packed
251	/// 32-bit results in \a dst.
252	///
253	/// \headerfile <immintrin.h>
254	///
255	/// \code
256	/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257	/// \endcode
258	///
259	/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
260	///
261	/// \param __W
262	/// A 128-bit vector of [4 x int].
263	/// \param __A
264	/// A 128-bit vector of [8 x unsigned short].
265	/// \param __B
266	/// A 128-bit vector of [8 x short].
267	/// \returns
268	/// A 128-bit vector of [4 x int].
269	///
270	/// \code{.operation}
271	/// FOR j := 0 to 3
272	/// tmp1.dword := ZeroExtend32(__A.word[2j]) SignExtend32(__B.word[2*j])
273	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) SignExtend32(__B.word[2*j+1])
274	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275	/// ENDFOR
276	/// dst[MAX:128] := 0
277	/// \endcode
278	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279	__m128i __A,
280	__m128i __B) {
281	return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282	(__v4si)__B);
283	}
284
285	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286	/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287	/// signed 16-bit results. Sum these 2 results with the corresponding
288	/// 32-bit integer in \a __W with signed saturation, and store the packed
289	/// 32-bit results in \a dst.
290	///
291	/// \headerfile <immintrin.h>
292	///
293	/// \code
294	/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
295	/// \endcode
296	///
297	/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
298	///
299	/// \param __W
300	/// A 256-bit vector of [8 x int].
301	/// \param __A
302	/// A 256-bit vector of [16 x unsigned short].
303	/// \param __B
304	/// A 256-bit vector of [16 x short].
305	/// \returns
306	/// A 256-bit vector of [8 x int].
307	///
308	/// \code{.operation}
309	/// FOR j := 0 to 7
310	/// tmp1.dword := ZeroExtend32(__A.word[2j]) SignExtend32(__B.word[2*j])
311	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) SignExtend32(__B.word[2*j+1])
312	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313	/// ENDFOR
314	/// dst[MAX:256] := 0
315	/// \endcode
316	static __inline__ __m256i __DEFAULT_FN_ATTRS256
317	_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318	return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319	(__v8si)__B);
320	}
321
322	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324	/// signed 16-bit results. Sum these 2 results with the corresponding
325	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
326	///
327	/// \headerfile <immintrin.h>
328	///
329	/// \code
330	/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331	/// \endcode
332	///
333	/// This intrinsic corresponds to the \c VPDPWUUD instruction.
334	///
335	/// \param __W
336	/// A 128-bit vector of [4 x unsigned int].
337	/// \param __A
338	/// A 128-bit vector of [8 x unsigned short].
339	/// \param __B
340	/// A 128-bit vector of [8 x unsigned short].
341	/// \returns
342	/// A 128-bit vector of [4 x unsigned int].
343	///
344	/// \code{.operation}
345	/// FOR j := 0 to 3
346	/// tmp1.dword := ZeroExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
347	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
348	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349	/// ENDFOR
350	/// dst[MAX:128] := 0
351	/// \endcode
352	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353	__m128i __A,
354	__m128i __B) {
355	return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356	(__v4si)__B);
357	}
358
359	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361	/// signed 16-bit results. Sum these 2 results with the corresponding
362	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
363	///
364	/// \headerfile <immintrin.h>
365	///
366	/// \code
367	/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368	/// \endcode
369	///
370	/// This intrinsic corresponds to the \c VPDPWUUD instruction.
371	///
372	/// \param __W
373	/// A 256-bit vector of [8 x unsigned int].
374	/// \param __A
375	/// A 256-bit vector of [16 x unsigned short].
376	/// \param __B
377	/// A 256-bit vector of [16 x unsigned short].
378	/// \returns
379	/// A 256-bit vector of [8 x unsigned int].
380	///
381	/// \code{.operation}
382	/// FOR j := 0 to 7
383	/// tmp1.dword := ZeroExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
384	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
385	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386	/// ENDFOR
387	/// dst[MAX:256] := 0
388	/// \endcode
389	static __inline__ __m256i __DEFAULT_FN_ATTRS256
390	_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391	return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392	(__v8si)__B);
393	}
394
395	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397	/// signed 16-bit results. Sum these 2 results with the corresponding
398	/// 32-bit integer in \a __W with signed saturation, and store the packed
399	/// 32-bit results in \a dst.
400	///
401	/// \headerfile <immintrin.h>
402	///
403	/// \code
404	/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405	/// \endcode
406	///
407	/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
408	///
409	/// \param __W
410	/// A 128-bit vector of [4 x unsigned int].
411	/// \param __A
412	/// A 128-bit vector of [8 x unsigned short].
413	/// \param __B
414	/// A 128-bit vector of [8 x unsigned short].
415	/// \returns
416	/// A 128-bit vector of [4 x unsigned int].
417	///
418	/// \code{.operation}
419	/// FOR j := 0 to 3
420	/// tmp1.dword := ZeroExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
421	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
422	/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423	/// ENDFOR
424	/// dst[MAX:128] := 0
425	/// \endcode
426	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427	__m128i __A,
428	__m128i __B) {
429	return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430	(__v4si)__B);
431	}
432
433	/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434	/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435	/// signed 16-bit results. Sum these 2 results with the corresponding
436	/// 32-bit integer in \a __W with signed saturation, and store the packed
437	/// 32-bit results in \a dst.
438	///
439	/// \headerfile <immintrin.h>
440	///
441	/// \code
442	/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443	/// \endcode
444	///
445	/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
446	///
447	/// \param __W
448	/// A 256-bit vector of [8 x unsigned int].
449	/// \param __A
450	/// A 256-bit vector of [16 x unsigned short].
451	/// \param __B
452	/// A 256-bit vector of [16 x unsigned short].
453	/// \returns
454	/// A 256-bit vector of [8 x unsigned int].
455	///
456	/// \code{.operation}
457	/// FOR j := 0 to 7
458	/// tmp1.dword := ZeroExtend32(__A.word[2j]) ZeroExtend32(__B.word[2*j])
459	/// tmp2.dword := ZeroExtend32(__A.word[2j+1]) ZeroExtend32(__B.word[2*j+1])
460	/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461	/// ENDFOR
462	/// dst[MAX:256] := 0
463	/// \endcode
464	static __inline__ __m256i __DEFAULT_FN_ATTRS256
465	_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466	return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467	(__v8si)__B);
468	}
469
470	#undef __DEFAULT_FN_ATTRS128
471	#undef __DEFAULT_FN_ATTRS256
472
473	#endif // __AVXVNNIINT16INTRIN_H
474

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avxvnniint16intrin.h