Warning: This file is not a C or C++ file. It does not have highlighting.

1/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error \
12 "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13#endif // __IMMINTRIN_H
14
15#ifndef __AVXVNNIINT16INTRIN_H
16#define __AVXVNNIINT16INTRIN_H
17
18/* Default attributes for the intrinsics in this file: always inlined, no debug info, compiled for the "avxvnniint16" target feature, with a minimum vector width of 128 or 256 bits depending on the macro. */
19#define __DEFAULT_FN_ATTRS128 \
20 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
21 __min_vector_width__(128)))
22#define __DEFAULT_FN_ATTRS256 \
23 __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
24 __min_vector_width__(256)))
25
26/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
27/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
28/// signed 32-bit results. Sum these 2 results with the corresponding
29/// 32-bit integer in \a __W, and return the packed 32-bit results (\a dst).
30///
31/// \headerfile <immintrin.h>
32///
33/// \code
34/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
35/// \endcode
36///
37/// This intrinsic corresponds to the \c VPDPWSUD instruction.
38///
39/// \param __W
40/// A 128-bit vector of [4 x int].
41/// \param __A
42/// A 128-bit vector of [8 x short].
43/// \param __B
44/// A 128-bit vector of [8 x unsigned short].
45/// \returns
46/// A 128-bit vector of [4 x int].
47///
48/// \code{.operation}
49/// FOR j := 0 to 3
50/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
51/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
52/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
53/// ENDFOR
54/// dst[MAX:128] := 0
55/// \endcode
56static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
57 __m128i __A,
58 __m128i __B) {
59 return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
60 (__v4si)__B);
61}
62
63/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
64/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
65/// signed 32-bit results. Sum these 2 results with the corresponding
66/// 32-bit integer in \a __W, and return the packed 32-bit results (\a dst).
67///
68/// \headerfile <immintrin.h>
69///
70/// \code
71/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
72/// \endcode
73///
74/// This intrinsic corresponds to the \c VPDPWSUD instruction.
75///
76/// \param __W
77/// A 256-bit vector of [8 x int].
78/// \param __A
79/// A 256-bit vector of [16 x short].
80/// \param __B
81/// A 256-bit vector of [16 x unsigned short].
82/// \returns
83/// A 256-bit vector of [8 x int].
84///
85/// \code{.operation}
86/// FOR j := 0 to 7
87/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
88/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
89/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
90/// ENDFOR
91/// dst[MAX:256] := 0
92/// \endcode
93static __inline__ __m256i __DEFAULT_FN_ATTRS256
94_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
95 return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
96 (__v8si)__B);
97}
98
99/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
100/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
101/// signed 32-bit results. Sum these 2 results with the corresponding
102/// 32-bit integer in \a __W with signed saturation, and return the packed
103/// 32-bit results (\a dst).
104///
105/// \headerfile <immintrin.h>
106///
107/// \code
108/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
109/// \endcode
110///
111/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
112///
113/// \param __W
114/// A 128-bit vector of [4 x int].
115/// \param __A
116/// A 128-bit vector of [8 x short].
117/// \param __B
118/// A 128-bit vector of [8 x unsigned short].
119/// \returns
120/// A 128-bit vector of [4 x int].
121///
122/// \code{.operation}
123/// FOR j := 0 to 3
124/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
125/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
126/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
127/// ENDFOR
128/// dst[MAX:128] := 0
129/// \endcode
130static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
131 __m128i __A,
132 __m128i __B) {
133 return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
134 (__v4si)__B);
135}
136
137/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
138/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
139/// signed 32-bit results. Sum these 2 results with the corresponding
140/// 32-bit integer in \a __W with signed saturation, and return the packed
141/// 32-bit results (\a dst).
142///
143/// \headerfile <immintrin.h>
144///
145/// \code
146/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
147/// \endcode
148///
149/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
150///
151/// \param __W
152/// A 256-bit vector of [8 x int].
153/// \param __A
154/// A 256-bit vector of [16 x short].
155/// \param __B
156/// A 256-bit vector of [16 x unsigned short].
157/// \returns
158/// A 256-bit vector of [8 x int].
159///
160/// \code{.operation}
161/// FOR j := 0 to 7
162/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
163/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
164/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
165/// ENDFOR
166/// dst[MAX:256] := 0
167/// \endcode
168static __inline__ __m256i __DEFAULT_FN_ATTRS256
169_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
170 return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
171 (__v8si)__B);
172}
173
174/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
175/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
176/// signed 32-bit results. Sum these 2 results with the corresponding
177/// 32-bit integer in \a __W, and return the packed 32-bit results (\a dst).
178///
179/// \headerfile <immintrin.h>
180///
181/// \code
182/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
183/// \endcode
184///
185/// This intrinsic corresponds to the \c VPDPWUSD instruction.
186///
187/// \param __W
188/// A 128-bit vector of [4 x int].
189/// \param __A
190/// A 128-bit vector of [8 x unsigned short].
191/// \param __B
192/// A 128-bit vector of [8 x short].
193/// \returns
194/// A 128-bit vector of [4 x int].
195///
196/// \code{.operation}
197/// FOR j := 0 to 3
198/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
199/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
200/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
201/// ENDFOR
202/// dst[MAX:128] := 0
203/// \endcode
204static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
205 __m128i __A,
206 __m128i __B) {
207 return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
208 (__v4si)__B);
209}
210
211/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
212/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
213/// signed 32-bit results. Sum these 2 results with the corresponding
214/// 32-bit integer in \a __W, and return the packed 32-bit results (\a dst).
215///
216/// \headerfile <immintrin.h>
217///
218/// \code
219/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
220/// \endcode
221///
222/// This intrinsic corresponds to the \c VPDPWUSD instruction.
223///
224/// \param __W
225/// A 256-bit vector of [8 x int].
226/// \param __A
227/// A 256-bit vector of [16 x unsigned short].
228/// \param __B
229/// A 256-bit vector of [16 x short].
230/// \returns
231/// A 256-bit vector of [8 x int].
232///
233/// \code{.operation}
234/// FOR j := 0 to 7
235/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
236/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
237/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
238/// ENDFOR
239/// dst[MAX:256] := 0
240/// \endcode
241static __inline__ __m256i __DEFAULT_FN_ATTRS256
242_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
243 return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
244 (__v8si)__B);
245}
246
247/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
248/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
249/// signed 32-bit results. Sum these 2 results with the corresponding
250/// 32-bit integer in \a __W with signed saturation, and return the packed
251/// 32-bit results (\a dst).
252///
253/// \headerfile <immintrin.h>
254///
255/// \code
256/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
257/// \endcode
258///
259/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
260///
261/// \param __W
262/// A 128-bit vector of [4 x int].
263/// \param __A
264/// A 128-bit vector of [8 x unsigned short].
265/// \param __B
266/// A 128-bit vector of [8 x short].
267/// \returns
268/// A 128-bit vector of [4 x int].
269///
270/// \code{.operation}
271/// FOR j := 0 to 3
272/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
273/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
274/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
275/// ENDFOR
276/// dst[MAX:128] := 0
277/// \endcode
278static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
279 __m128i __A,
280 __m128i __B) {
281 return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
282 (__v4si)__B);
283}
284
285/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
286/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
287/// signed 32-bit results. Sum these 2 results with the corresponding
288/// 32-bit integer in \a __W with signed saturation, and return the packed
289/// 32-bit results (\a dst).
290///
291/// \headerfile <immintrin.h>
292///
293/// \code
294/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
295/// \endcode
296///
297/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
298///
299/// \param __W
300/// A 256-bit vector of [8 x int].
301/// \param __A
302/// A 256-bit vector of [16 x unsigned short].
303/// \param __B
304/// A 256-bit vector of [16 x short].
305/// \returns
306/// A 256-bit vector of [8 x int].
307///
308/// \code{.operation}
309/// FOR j := 0 to 7
310/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
311/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
312/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
313/// ENDFOR
314/// dst[MAX:256] := 0
315/// \endcode
316static __inline__ __m256i __DEFAULT_FN_ATTRS256
317_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
318 return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
319 (__v8si)__B);
320}
321
322/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
323/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
324/// unsigned 32-bit results. Sum these 2 results with the corresponding
325/// 32-bit integer in \a __W, and return the packed 32-bit results (\a dst).
326///
327/// \headerfile <immintrin.h>
328///
329/// \code
330/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
331/// \endcode
332///
333/// This intrinsic corresponds to the \c VPDPWUUD instruction.
334///
335/// \param __W
336/// A 128-bit vector of [4 x unsigned int].
337/// \param __A
338/// A 128-bit vector of [8 x unsigned short].
339/// \param __B
340/// A 128-bit vector of [8 x unsigned short].
341/// \returns
342/// A 128-bit vector of [4 x unsigned int].
343///
344/// \code{.operation}
345/// FOR j := 0 to 3
346/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
347/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
348/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
349/// ENDFOR
350/// dst[MAX:128] := 0
351/// \endcode
352static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
353 __m128i __A,
354 __m128i __B) {
355 return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
356 (__v4si)__B);
357}
358
359/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
360/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
361/// unsigned 32-bit results. Sum these 2 results with the corresponding
362/// 32-bit integer in \a __W, and return the packed 32-bit results (\a dst).
363///
364/// \headerfile <immintrin.h>
365///
366/// \code
367/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
368/// \endcode
369///
370/// This intrinsic corresponds to the \c VPDPWUUD instruction.
371///
372/// \param __W
373/// A 256-bit vector of [8 x unsigned int].
374/// \param __A
375/// A 256-bit vector of [16 x unsigned short].
376/// \param __B
377/// A 256-bit vector of [16 x unsigned short].
378/// \returns
379/// A 256-bit vector of [8 x unsigned int].
380///
381/// \code{.operation}
382/// FOR j := 0 to 7
383/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
384/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
385/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
386/// ENDFOR
387/// dst[MAX:256] := 0
388/// \endcode
389static __inline__ __m256i __DEFAULT_FN_ATTRS256
390_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
391 return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
392 (__v8si)__B);
393}
394
395/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
396/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
397/// unsigned 32-bit results. Sum these 2 results with the corresponding
398/// 32-bit integer in \a __W with unsigned saturation, and return the packed
399/// 32-bit results (\a dst).
400///
401/// \headerfile <immintrin.h>
402///
403/// \code
404/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
405/// \endcode
406///
407/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
408///
409/// \param __W
410/// A 128-bit vector of [4 x unsigned int].
411/// \param __A
412/// A 128-bit vector of [8 x unsigned short].
413/// \param __B
414/// A 128-bit vector of [8 x unsigned short].
415/// \returns
416/// A 128-bit vector of [4 x unsigned int].
417///
418/// \code{.operation}
419/// FOR j := 0 to 3
420/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
421/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
422/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
423/// ENDFOR
424/// dst[MAX:128] := 0
425/// \endcode
426static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
427 __m128i __A,
428 __m128i __B) {
429 return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
430 (__v4si)__B);
431}
432
433/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
434/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
435/// unsigned 32-bit results. Sum these 2 results with the corresponding
436/// 32-bit integer in \a __W with unsigned saturation, and return the packed
437/// 32-bit results (\a dst).
438///
439/// \headerfile <immintrin.h>
440///
441/// \code
442/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
443/// \endcode
444///
445/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
446///
447/// \param __W
448/// A 256-bit vector of [8 x unsigned int].
449/// \param __A
450/// A 256-bit vector of [16 x unsigned short].
451/// \param __B
452/// A 256-bit vector of [16 x unsigned short].
453/// \returns
454/// A 256-bit vector of [8 x unsigned int].
455///
456/// \code{.operation}
457/// FOR j := 0 to 7
458/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
459/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
460/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
461/// ENDFOR
462/// dst[MAX:256] := 0
463/// \endcode
464static __inline__ __m256i __DEFAULT_FN_ATTRS256
465_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
466 return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
467 (__v8si)__B);
468}
469
470#undef __DEFAULT_FN_ATTRS128
471#undef __DEFAULT_FN_ATTRS256
472
473#endif // __AVXVNNIINT16INTRIN_H
474

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avxvnniint16intrin.h