Warning: This file is not a C or C++ file. It does not have highlighting.
1 | /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===-----------------------------------------------------------------------=== |
8 | */ |
9 | #ifndef __IMMINTRIN_H |
10 | #error \ |
11 | "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead." |
12 | #endif |
13 | |
14 | #ifndef __AVXVNNIINT8INTRIN_H |
15 | #define __AVXVNNIINT8INTRIN_H |
16 | |
17 | /* Default attributes for the intrinsics in this file, one macro per vector width (256 and 128 bits): each expands to __always_inline__, __nodebug__, the avxvnniint8 target and the matching __min_vector_width__. Both macros are undefined again at the end of the file. */ |
18 | #define __DEFAULT_FN_ATTRS256 \ |
19 | __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ |
20 | __min_vector_width__(256))) |
21 | #define __DEFAULT_FN_ATTRS128 \ |
22 | __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ |
23 | __min_vector_width__(128))) |
24 | |
25 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
26 | /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate |
27 | /// signed 16-bit results. Sum these 4 results with the corresponding |
28 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
29 | /// |
30 | /// \headerfile <x86intrin.h> |
31 | /// |
32 | /// \code |
33 | /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B); |
34 | /// \endcode |
35 | /// |
36 | /// This intrinsic corresponds to the \c VPDPBSSD instruction. |
37 | /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators. |
38 | /// \param __A |
39 | /// A 128-bit vector of [16 x char]; bytes are sign-extended to 16 bits. |
40 | /// \param __B |
41 | /// A 128-bit vector of [16 x char]; bytes are sign-extended to 16 bits. |
42 | /// \returns |
43 | /// A 128-bit vector of [4 x int] holding the accumulated sums. |
44 | /// |
45 | /// \code{.operation} |
46 | /// FOR j := 0 to 3 |
47 | /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) |
48 | /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) |
49 | /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) |
50 | /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) |
51 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
52 | /// ENDFOR |
53 | /// dst[MAX:128] := 0 |
54 | /// \endcode |
55 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, |
56 | __m128i __A, |
57 | __m128i __B) { |
58 | return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A, |
59 | (__v4si)__B); |
60 | } |
61 | |
62 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
63 | /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate |
64 | /// signed 16-bit results. Sum these 4 results with the corresponding |
65 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
66 | /// |
67 | /// \headerfile <x86intrin.h> |
68 | /// |
69 | /// \code |
70 | /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B); |
71 | /// \endcode |
72 | /// |
73 | /// This intrinsic corresponds to the \c VPDPBSSD instruction. |
74 | /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators. |
75 | /// \param __A |
76 | /// A 256-bit vector of [32 x char]; bytes are sign-extended to 16 bits. |
77 | /// \param __B |
78 | /// A 256-bit vector of [32 x char]; bytes are sign-extended to 16 bits. |
79 | /// \returns |
80 | /// A 256-bit vector of [8 x int] holding the accumulated sums. |
81 | /// |
82 | /// \code{.operation} |
83 | /// FOR j := 0 to 7 |
84 | /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) |
85 | /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) |
86 | /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) |
87 | /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) |
88 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
89 | /// ENDFOR |
90 | /// dst[MAX:256] := 0 |
91 | /// \endcode |
92 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
93 | _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { |
94 | return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A, |
95 | (__v8si)__B); |
96 | } |
97 | |
98 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
99 | /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate |
100 | /// signed 16-bit results. Sum these 4 results with the corresponding |
101 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
102 | /// 32-bit results in \a dst. |
103 | /// |
104 | /// \headerfile <x86intrin.h> |
105 | /// |
106 | /// \code |
107 | /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B); |
108 | /// \endcode |
109 | /// |
110 | /// This intrinsic corresponds to the \c VPDPBSSDS instruction. |
111 | /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators. |
112 | /// \param __A |
113 | /// A 128-bit vector of [16 x char]; bytes are sign-extended to 16 bits. |
114 | /// \param __B |
115 | /// A 128-bit vector of [16 x char]; bytes are sign-extended to 16 bits. |
116 | /// \returns |
117 | /// A 128-bit vector of [4 x int] holding the signed-saturated sums. |
118 | /// |
119 | /// \code{.operation} |
120 | /// FOR j := 0 to 3 |
121 | /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) |
122 | /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) |
123 | /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) |
124 | /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) |
125 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
126 | /// ENDFOR |
127 | /// dst[MAX:128] := 0 |
128 | /// \endcode |
129 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, |
130 | __m128i __A, |
131 | __m128i __B) { |
132 | return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A, |
133 | (__v4si)__B); |
134 | } |
135 | |
136 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
137 | /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate |
138 | /// signed 16-bit results. Sum these 4 results with the corresponding |
139 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
140 | /// 32-bit results in \a dst. |
141 | /// |
142 | /// \headerfile <x86intrin.h> |
143 | /// |
144 | /// \code |
145 | /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B); |
146 | /// \endcode |
147 | /// |
148 | /// This intrinsic corresponds to the \c VPDPBSSDS instruction. |
149 | /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators. |
150 | /// \param __A |
151 | /// A 256-bit vector of [32 x char]; bytes are sign-extended to 16 bits. |
152 | /// \param __B |
153 | /// A 256-bit vector of [32 x char]; bytes are sign-extended to 16 bits. |
154 | /// \returns |
155 | /// A 256-bit vector of [8 x int] holding the signed-saturated sums. |
156 | /// |
157 | /// \code{.operation} |
158 | /// FOR j := 0 to 7 |
159 | /// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) |
160 | /// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) |
161 | /// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) |
162 | /// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) |
163 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
164 | /// ENDFOR |
165 | /// dst[MAX:256] := 0 |
166 | /// \endcode |
167 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
168 | _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { |
169 | return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A, |
170 | (__v8si)__B); |
171 | } |
172 | |
173 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
174 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
175 | /// signed 16-bit results. Sum these 4 results with the corresponding |
176 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
177 | /// |
178 | /// \headerfile <x86intrin.h> |
179 | /// |
180 | /// \code |
181 | /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B); |
182 | /// \endcode |
183 | /// |
184 | /// This intrinsic corresponds to the \c VPDPBSUD instruction. |
185 | /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators. |
186 | /// \param __A |
187 | /// A 128-bit vector of [16 x char]; bytes are sign-extended to 16 bits. |
188 | /// \param __B |
189 | /// A 128-bit vector of [16 x unsigned char]; bytes are zero-extended to 16 bits. |
190 | /// \returns |
191 | /// A 128-bit vector of [4 x int] holding the accumulated sums. |
192 | /// |
193 | /// \code{.operation} |
194 | /// FOR j := 0 to 3 |
195 | /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) |
196 | /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) |
197 | /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) |
198 | /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) |
199 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
200 | /// ENDFOR |
201 | /// dst[MAX:128] := 0 |
202 | /// \endcode |
203 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, |
204 | __m128i __A, |
205 | __m128i __B) { |
206 | return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A, |
207 | (__v4si)__B); |
208 | } |
209 | |
210 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
211 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
212 | /// signed 16-bit results. Sum these 4 results with the corresponding |
213 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
214 | /// |
215 | /// \headerfile <x86intrin.h> |
216 | /// |
217 | /// \code |
218 | /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B); |
219 | /// \endcode |
220 | /// |
221 | /// This intrinsic corresponds to the \c VPDPBSUD instruction. |
222 | /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators. |
223 | /// \param __A |
224 | /// A 256-bit vector of [32 x char]; bytes are sign-extended to 16 bits. |
225 | /// \param __B |
226 | /// A 256-bit vector of [32 x unsigned char]; bytes are zero-extended to 16 bits. |
227 | /// \returns |
228 | /// A 256-bit vector of [8 x int] holding the accumulated sums. |
229 | /// |
230 | /// \code{.operation} |
231 | /// FOR j := 0 to 7 |
232 | /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) |
233 | /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) |
234 | /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) |
235 | /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) |
236 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
237 | /// ENDFOR |
238 | /// dst[MAX:256] := 0 |
239 | /// \endcode |
240 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
241 | _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { |
242 | return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A, |
243 | (__v8si)__B); |
244 | } |
245 | |
246 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
247 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
248 | /// signed 16-bit results. Sum these 4 results with the corresponding |
249 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
250 | /// 32-bit results in \a dst. |
251 | /// |
252 | /// \headerfile <x86intrin.h> |
253 | /// |
254 | /// \code |
255 | /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B); |
256 | /// \endcode |
257 | /// |
258 | /// This intrinsic corresponds to the \c VPDPBSUDS instruction. |
259 | /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators. |
260 | /// \param __A |
261 | /// A 128-bit vector of [16 x char]; bytes are sign-extended to 16 bits. |
262 | /// \param __B |
263 | /// A 128-bit vector of [16 x unsigned char]; bytes are zero-extended to 16 bits. |
264 | /// \returns |
265 | /// A 128-bit vector of [4 x int] holding the signed-saturated sums. |
266 | /// |
267 | /// \code{.operation} |
268 | /// FOR j := 0 to 3 |
269 | /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) |
270 | /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) |
271 | /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) |
272 | /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) |
273 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
274 | /// ENDFOR |
275 | /// dst[MAX:128] := 0 |
276 | /// \endcode |
277 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, |
278 | __m128i __A, |
279 | __m128i __B) { |
280 | return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A, |
281 | (__v4si)__B); |
282 | } |
283 | |
284 | /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with |
285 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
286 | /// signed 16-bit results. Sum these 4 results with the corresponding |
287 | /// 32-bit integer in \a __W with signed saturation, and store the packed |
288 | /// 32-bit results in \a dst. |
289 | /// |
290 | /// \headerfile <x86intrin.h> |
291 | /// |
292 | /// \code |
293 | /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B); |
294 | /// \endcode |
295 | /// |
296 | /// This intrinsic corresponds to the \c VPDPBSUDS instruction. |
297 | /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators. |
298 | /// \param __A |
299 | /// A 256-bit vector of [32 x char]; bytes are sign-extended to 16 bits. |
300 | /// \param __B |
301 | /// A 256-bit vector of [32 x unsigned char]; bytes are zero-extended to 16 bits. |
302 | /// \returns |
303 | /// A 256-bit vector of [8 x int] holding the signed-saturated sums. |
304 | /// |
305 | /// \code{.operation} |
306 | /// FOR j := 0 to 7 |
307 | /// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) |
308 | /// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) |
309 | /// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) |
310 | /// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) |
311 | /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
312 | /// ENDFOR |
313 | /// dst[MAX:256] := 0 |
314 | /// \endcode |
315 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
316 | _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { |
317 | return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A, |
318 | (__v8si)__B); |
319 | } |
320 | |
321 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with |
322 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
323 | /// unsigned 16-bit results. Sum these 4 results with the corresponding |
324 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
325 | /// |
326 | /// \headerfile <x86intrin.h> |
327 | /// |
328 | /// \code |
329 | /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B); |
330 | /// \endcode |
331 | /// |
332 | /// This intrinsic corresponds to the \c VPDPBUUD instruction. |
333 | /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators. |
334 | /// \param __A |
335 | /// A 128-bit vector of [16 x unsigned char]; bytes are zero-extended to 16 bits. |
336 | /// \param __B |
337 | /// A 128-bit vector of [16 x unsigned char]; bytes are zero-extended to 16 bits. |
338 | /// \returns |
339 | /// A 128-bit vector of [4 x int] holding the accumulated sums. |
340 | /// |
341 | /// \code{.operation} |
342 | /// FOR j := 0 to 3 |
343 | /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) |
344 | /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) |
345 | /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) |
346 | /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) |
347 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
348 | /// ENDFOR |
349 | /// dst[MAX:128] := 0 |
350 | /// \endcode |
351 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, |
352 | __m128i __A, |
353 | __m128i __B) { |
354 | return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A, |
355 | (__v4si)__B); |
356 | } |
357 | |
358 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with |
359 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
360 | /// unsigned 16-bit results. Sum these 4 results with the corresponding |
361 | /// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. |
362 | /// |
363 | /// \headerfile <x86intrin.h> |
364 | /// |
365 | /// \code |
366 | /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B); |
367 | /// \endcode |
368 | /// |
369 | /// This intrinsic corresponds to the \c VPDPBUUD instruction. |
370 | /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators. |
371 | /// \param __A |
372 | /// A 256-bit vector of [32 x unsigned char]; bytes are zero-extended to 16 bits. |
373 | /// \param __B |
374 | /// A 256-bit vector of [32 x unsigned char]; bytes are zero-extended to 16 bits. |
375 | /// \returns |
376 | /// A 256-bit vector of [8 x int] holding the accumulated sums. |
377 | /// |
378 | /// \code{.operation} |
379 | /// FOR j := 0 to 7 |
380 | /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) |
381 | /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) |
382 | /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) |
383 | /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) |
384 | /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 |
385 | /// ENDFOR |
386 | /// dst[MAX:256] := 0 |
387 | /// \endcode |
388 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
389 | _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { |
390 | return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A, |
391 | (__v8si)__B); |
392 | } |
393 | |
394 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with |
395 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
396 | /// unsigned 16-bit results. Sum these 4 results with the corresponding |
397 | /// 32-bit integer in \a __W with unsigned saturation, and store the packed |
398 | /// 32-bit results in \a dst. |
399 | /// |
400 | /// \headerfile <x86intrin.h> |
401 | /// |
402 | /// \code |
403 | /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B); |
404 | /// \endcode |
405 | /// |
406 | /// This intrinsic corresponds to the \c VPDPBUUDS instruction. |
407 | /// \param __W A 128-bit vector of [4 x int] holding the 32-bit accumulators. |
408 | /// \param __A |
409 | /// A 128-bit vector of [16 x unsigned char]; bytes are zero-extended to 16 bits. |
410 | /// \param __B |
411 | /// A 128-bit vector of [16 x unsigned char]; bytes are zero-extended to 16 bits. |
412 | /// \returns |
413 | /// A 128-bit vector of [4 x int] holding the unsigned-saturated sums. |
414 | /// |
415 | /// \code{.operation} |
416 | /// FOR j := 0 to 3 |
417 | /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) |
418 | /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) |
419 | /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) |
420 | /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) |
421 | /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
422 | /// ENDFOR |
423 | /// dst[MAX:128] := 0 |
424 | /// \endcode |
425 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, |
426 | __m128i __A, |
427 | __m128i __B) { |
428 | return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A, |
429 | (__v4si)__B); |
430 | } |
431 | |
432 | /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with |
433 | /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate |
434 | /// unsigned 16-bit results. Sum these 4 results with the corresponding |
435 | /// 32-bit integer in \a __W with unsigned saturation, and store the packed |
436 | /// 32-bit results in \a dst. |
437 | /// |
438 | /// \headerfile <x86intrin.h> |
439 | /// |
440 | /// \code |
441 | /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B); |
442 | /// \endcode |
443 | /// |
444 | /// This intrinsic corresponds to the \c VPDPBUUDS instruction. |
445 | /// \param __W A 256-bit vector of [8 x int] holding the 32-bit accumulators. |
446 | /// \param __A |
447 | /// A 256-bit vector of [32 x unsigned char]; bytes are zero-extended to 16 bits. |
448 | /// \param __B |
449 | /// A 256-bit vector of [32 x unsigned char]; bytes are zero-extended to 16 bits. |
450 | /// \returns |
451 | /// A 256-bit vector of [8 x int] holding the unsigned-saturated sums. |
452 | /// |
453 | /// \code{.operation} |
454 | /// FOR j := 0 to 7 |
455 | /// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) |
456 | /// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) |
457 | /// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) |
458 | /// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) |
459 | /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) |
460 | /// ENDFOR |
461 | /// dst[MAX:256] := 0 |
462 | /// \endcode |
463 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
464 | _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { |
465 | return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A, |
466 | (__v8si)__B); |
467 | } |
468 | #undef __DEFAULT_FN_ATTRS128 |
469 | #undef __DEFAULT_FN_ATTRS256 |
470 | |
471 | #endif // __AVXVNNIINT8INTRIN_H |
472 |
Warning: This file is not a C or C++ file. It does not have highlighting.