Warning: This file is not a C or C++ file. It does not have highlighting.
1 | /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===-----------------------------------------------------------------------=== |
8 | */ |
9 | |
10 | #ifndef __IMMINTRIN_H |
11 | #error \ |
12 | "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead." |
13 | #endif // __IMMINTRIN_H |
14 | |
15 | #ifndef __AVXVNNIINT16INTRIN_H |
16 | #define __AVXVNNIINT16INTRIN_H |
17 | |
18 | /* Default attributes for the intrinsics in this file: always inlined, no debug info, "avxvnniint16" target; the 128/256 suffix selects the minimum vector width. */ |
19 | #define __DEFAULT_FN_ATTRS128 \ |
20 | __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ |
21 | __min_vector_width__(128))) |
22 | #define __DEFAULT_FN_ATTRS256 \ |
23 | __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ |
24 | __min_vector_width__(256))) |
25 | |
26 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with |
27 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
28 | /// signed 32-bit results. Sum these 2 results with the corresponding |
29 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
30 | /// |
31 | /// \headerfile <immintrin.h> |
32 | /// |
33 | /// \code |
34 | /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B) |
35 | /// \endcode |
36 | /// |
37 | /// This intrinsic corresponds to the \c VPDPWSUD instruction. |
38 | /// |
39 | /// \param __W |
40 | /// A 128-bit vector of [4 x int]. |
41 | /// \param __A |
42 | /// A 128-bit vector of [8 x short]. |
43 | /// \param __B |
44 | /// A 128-bit vector of [8 x unsigned short]. |
45 | /// \returns |
46 | /// A 128-bit vector of [4 x int]. |
47 | /// |
48 | /// \code{.operation} |
49 | /// FOR j := 0 to 3 |
50 | /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
51 | /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
52 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 |
53 | /// ENDFOR |
54 | /// dst[MAX:128] := 0 |
55 | /// \endcode |
56 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, |
57 | __m128i __A, |
58 | __m128i __B) { |
59 | return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A, |
60 | (__v4si)__B); |
61 | } |
62 | |
63 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with |
64 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
65 | /// signed 32-bit results. Sum these 2 results with the corresponding |
66 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
67 | /// |
68 | /// \headerfile <immintrin.h> |
69 | /// |
70 | /// \code |
71 | /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) |
72 | /// \endcode |
73 | /// |
74 | /// This intrinsic corresponds to the \c VPDPWSUD instruction. |
75 | /// |
76 | /// \param __W |
77 | /// A 256-bit vector of [8 x int]. |
78 | /// \param __A |
79 | /// A 256-bit vector of [16 x short]. |
80 | /// \param __B |
81 | /// A 256-bit vector of [16 x unsigned short]. |
82 | /// \returns |
83 | /// A 256-bit vector of [8 x int]. |
84 | /// |
85 | /// \code{.operation} |
86 | /// FOR j := 0 to 7 |
87 | /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
88 | /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
89 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 |
90 | /// ENDFOR |
91 | /// dst[MAX:256] := 0 |
92 | /// \endcode |
93 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
94 | _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { |
95 | return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A, |
96 | (__v8si)__B); |
97 | } |
98 | |
99 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with |
100 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
101 | /// signed 32-bit results. Sum these 2 results with the corresponding |
102 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
103 | /// 32-bit results in \a dst. |
104 | /// |
105 | /// \headerfile <immintrin.h> |
106 | /// |
107 | /// \code |
108 | /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) |
109 | /// \endcode |
110 | /// |
111 | /// This intrinsic corresponds to the \c VPDPWSUDS instruction. |
112 | /// |
113 | /// \param __W |
114 | /// A 128-bit vector of [4 x int]. |
115 | /// \param __A |
116 | /// A 128-bit vector of [8 x short]. |
117 | /// \param __B |
118 | /// A 128-bit vector of [8 x unsigned short]. |
119 | /// \returns |
120 | /// A 128-bit vector of [4 x int]. |
121 | /// |
122 | /// \code{.operation} |
123 | /// FOR j := 0 to 3 |
124 | /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
125 | /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
126 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) |
127 | /// ENDFOR |
128 | /// dst[MAX:128] := 0 |
129 | /// \endcode |
130 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, |
131 | __m128i __A, |
132 | __m128i __B) { |
133 | return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A, |
134 | (__v4si)__B); |
135 | } |
136 | |
137 | /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with |
138 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
139 | /// signed 32-bit results. Sum these 2 results with the corresponding |
140 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
141 | /// 32-bit results in \a dst. |
142 | /// |
143 | /// \headerfile <immintrin.h> |
144 | /// |
145 | /// \code |
146 | /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) |
147 | /// \endcode |
148 | /// |
149 | /// This intrinsic corresponds to the \c VPDPWSUDS instruction. |
150 | /// |
151 | /// \param __W |
152 | /// A 256-bit vector of [8 x int]. |
153 | /// \param __A |
154 | /// A 256-bit vector of [16 x short]. |
155 | /// \param __B |
156 | /// A 256-bit vector of [16 x unsigned short]. |
157 | /// \returns |
158 | /// A 256-bit vector of [8 x int]. |
159 | /// |
160 | /// \code{.operation} |
161 | /// FOR j := 0 to 7 |
162 | /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
163 | /// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
164 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) |
165 | /// ENDFOR |
166 | /// dst[MAX:256] := 0 |
167 | /// \endcode |
168 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
169 | _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { |
170 | return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A, |
171 | (__v8si)__B); |
172 | } |
173 | |
174 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
175 | /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate |
176 | /// signed 32-bit results. Sum these 2 results with the corresponding |
177 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
178 | /// |
179 | /// \headerfile <immintrin.h> |
180 | /// |
181 | /// \code |
182 | /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B) |
183 | /// \endcode |
184 | /// |
185 | /// This intrinsic corresponds to the \c VPDPWUSD instruction. |
186 | /// |
187 | /// \param __W |
188 | /// A 128-bit vector of [4 x int]. |
189 | /// \param __A |
190 | /// A 128-bit vector of [8 x unsigned short]. |
191 | /// \param __B |
192 | /// A 128-bit vector of [8 x short]. |
193 | /// \returns |
194 | /// A 128-bit vector of [4 x int]. |
195 | /// |
196 | /// \code{.operation} |
197 | /// FOR j := 0 to 3 |
198 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) |
199 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) |
200 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 |
201 | /// ENDFOR |
202 | /// dst[MAX:128] := 0 |
203 | /// \endcode |
204 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, |
205 | __m128i __A, |
206 | __m128i __B) { |
207 | return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A, |
208 | (__v4si)__B); |
209 | } |
210 | |
211 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
212 | /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate |
213 | /// signed 32-bit results. Sum these 2 results with the corresponding |
214 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
215 | /// |
216 | /// \headerfile <immintrin.h> |
217 | /// |
218 | /// \code |
219 | /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) |
220 | /// \endcode |
221 | /// |
222 | /// This intrinsic corresponds to the \c VPDPWUSD instruction. |
223 | /// |
224 | /// \param __W |
225 | /// A 256-bit vector of [8 x int]. |
226 | /// \param __A |
227 | /// A 256-bit vector of [16 x unsigned short]. |
228 | /// \param __B |
229 | /// A 256-bit vector of [16 x short]. |
230 | /// \returns |
231 | /// A 256-bit vector of [8 x int]. |
232 | /// |
233 | /// \code{.operation} |
234 | /// FOR j := 0 to 7 |
235 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) |
236 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) |
237 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 |
238 | /// ENDFOR |
239 | /// dst[MAX:256] := 0 |
240 | /// \endcode |
241 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
242 | _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { |
243 | return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A, |
244 | (__v8si)__B); |
245 | } |
246 | |
247 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
248 | /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate |
249 | /// signed 32-bit results. Sum these 2 results with the corresponding |
250 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
251 | /// 32-bit results in \a dst. |
252 | /// |
253 | /// \headerfile <immintrin.h> |
254 | /// |
255 | /// \code |
256 | /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) |
257 | /// \endcode |
258 | /// |
259 | /// This intrinsic corresponds to the \c VPDPWUSDS instruction. |
260 | /// |
261 | /// \param __W |
262 | /// A 128-bit vector of [4 x int]. |
263 | /// \param __A |
264 | /// A 128-bit vector of [8 x unsigned short]. |
265 | /// \param __B |
266 | /// A 128-bit vector of [8 x short]. |
267 | /// \returns |
268 | /// A 128-bit vector of [4 x int]. |
269 | /// |
270 | /// \code{.operation} |
271 | /// FOR j := 0 to 3 |
272 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) |
273 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) |
274 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) |
275 | /// ENDFOR |
276 | /// dst[MAX:128] := 0 |
277 | /// \endcode |
278 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, |
279 | __m128i __A, |
280 | __m128i __B) { |
281 | return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A, |
282 | (__v4si)__B); |
283 | } |
284 | |
285 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
286 | /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate |
287 | /// signed 32-bit results. Sum these 2 results with the corresponding |
288 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
289 | /// 32-bit results in \a dst. |
290 | /// |
291 | /// \headerfile <immintrin.h> |
292 | /// |
293 | /// \code |
294 | /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) |
295 | /// \endcode |
296 | /// |
297 | /// This intrinsic corresponds to the \c VPDPWUSDS instruction. |
298 | /// |
299 | /// \param __W |
300 | /// A 256-bit vector of [8 x int]. |
301 | /// \param __A |
302 | /// A 256-bit vector of [16 x unsigned short]. |
303 | /// \param __B |
304 | /// A 256-bit vector of [16 x short]. |
305 | /// \returns |
306 | /// A 256-bit vector of [8 x int]. |
307 | /// |
308 | /// \code{.operation} |
309 | /// FOR j := 0 to 7 |
310 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) |
311 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) |
312 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) |
313 | /// ENDFOR |
314 | /// dst[MAX:256] := 0 |
315 | /// \endcode |
316 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
317 | _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { |
318 | return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A, |
319 | (__v8si)__B); |
320 | } |
321 | |
322 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
323 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
324 | /// unsigned 32-bit results. Sum these 2 results with the corresponding |
325 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
326 | /// |
327 | /// \headerfile <immintrin.h> |
328 | /// |
329 | /// \code |
330 | /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B) |
331 | /// \endcode |
332 | /// |
333 | /// This intrinsic corresponds to the \c VPDPWUUD instruction. |
334 | /// |
335 | /// \param __W |
336 | /// A 128-bit vector of [4 x unsigned int]. |
337 | /// \param __A |
338 | /// A 128-bit vector of [8 x unsigned short]. |
339 | /// \param __B |
340 | /// A 128-bit vector of [8 x unsigned short]. |
341 | /// \returns |
342 | /// A 128-bit vector of [4 x unsigned int]. |
343 | /// |
344 | /// \code{.operation} |
345 | /// FOR j := 0 to 3 |
346 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
347 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
348 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 |
349 | /// ENDFOR |
350 | /// dst[MAX:128] := 0 |
351 | /// \endcode |
352 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, |
353 | __m128i __A, |
354 | __m128i __B) { |
355 | return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A, |
356 | (__v4si)__B); |
357 | } |
358 | |
359 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
360 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
361 | /// unsigned 32-bit results. Sum these 2 results with the corresponding |
362 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
363 | /// |
364 | /// \headerfile <immintrin.h> |
365 | /// |
366 | /// \code |
367 | /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) |
368 | /// \endcode |
369 | /// |
370 | /// This intrinsic corresponds to the \c VPDPWUUD instruction. |
371 | /// |
372 | /// \param __W |
373 | /// A 256-bit vector of [8 x unsigned int]. |
374 | /// \param __A |
375 | /// A 256-bit vector of [16 x unsigned short]. |
376 | /// \param __B |
377 | /// A 256-bit vector of [16 x unsigned short]. |
378 | /// \returns |
379 | /// A 256-bit vector of [8 x unsigned int]. |
380 | /// |
381 | /// \code{.operation} |
382 | /// FOR j := 0 to 7 |
383 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
384 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
385 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 |
386 | /// ENDFOR |
387 | /// dst[MAX:256] := 0 |
388 | /// \endcode |
389 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
390 | _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { |
391 | return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A, |
392 | (__v8si)__B); |
393 | } |
394 | |
395 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
396 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
397 | /// unsigned 32-bit results. Sum these 2 results with the corresponding |
398 | /// 32-bit integer in \a __W with unsigned saturation, and store the packed |
399 | /// 32-bit results in \a dst. |
400 | /// |
401 | /// \headerfile <immintrin.h> |
402 | /// |
403 | /// \code |
404 | /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B) |
405 | /// \endcode |
406 | /// |
407 | /// This intrinsic corresponds to the \c VPDPWUUDS instruction. |
408 | /// |
409 | /// \param __W |
410 | /// A 128-bit vector of [4 x unsigned int]. |
411 | /// \param __A |
412 | /// A 128-bit vector of [8 x unsigned short]. |
413 | /// \param __B |
414 | /// A 128-bit vector of [8 x unsigned short]. |
415 | /// \returns |
416 | /// A 128-bit vector of [4 x unsigned int]. |
417 | /// |
418 | /// \code{.operation} |
419 | /// FOR j := 0 to 3 |
420 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
421 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
422 | /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) |
423 | /// ENDFOR |
424 | /// dst[MAX:128] := 0 |
425 | /// \endcode |
426 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, |
427 | __m128i __A, |
428 | __m128i __B) { |
429 | return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A, |
430 | (__v4si)__B); |
431 | } |
432 | |
433 | /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with |
434 | /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate |
435 | /// unsigned 32-bit results. Sum these 2 results with the corresponding |
436 | /// 32-bit integer in \a __W with unsigned saturation, and store the packed |
437 | /// 32-bit results in \a dst. |
438 | /// |
439 | /// \headerfile <immintrin.h> |
440 | /// |
441 | /// \code |
442 | /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) |
443 | /// \endcode |
444 | /// |
445 | /// This intrinsic corresponds to the \c VPDPWUUDS instruction. |
446 | /// |
447 | /// \param __W |
448 | /// A 256-bit vector of [8 x unsigned int]. |
449 | /// \param __A |
450 | /// A 256-bit vector of [16 x unsigned short]. |
451 | /// \param __B |
452 | /// A 256-bit vector of [16 x unsigned short]. |
453 | /// \returns |
454 | /// A 256-bit vector of [8 x unsigned int]. |
455 | /// |
456 | /// \code{.operation} |
457 | /// FOR j := 0 to 7 |
458 | /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) |
459 | /// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) |
460 | /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) |
461 | /// ENDFOR |
462 | /// dst[MAX:256] := 0 |
463 | /// \endcode |
464 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
465 | _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { |
466 | return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A, |
467 | (__v8si)__B); |
468 | } |
469 | |
470 | #undef __DEFAULT_FN_ATTRS128 /* attribute helper is only needed inside this header */ |
471 | #undef __DEFAULT_FN_ATTRS256 /* attribute helper is only needed inside this header */ |
472 | |
473 | #endif // __AVXVNNIINT16INTRIN_H |
474 |
Warning: This file is not a C or C++ file. It does not have highlighting.