Warning: This file is not a C or C++ file. It does not have highlighting.
1 | /*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== |
---|---|
2 | * |
3 | * |
4 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
5 | * See https://llvm.org/LICENSE.txt for license information. |
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
7 | * |
8 | *===-----------------------------------------------------------------------=== |
9 | */ |
10 | #ifndef __IMMINTRIN_H |
11 | #error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." |
12 | #endif |
13 | |
14 | #ifndef __AVX512VLVNNIINTRIN_H |
15 | #define __AVX512VLVNNIINTRIN_H |
16 | |
/* Default attribute set for the 128-bit (__m128i) functions in this file:
 * __always_inline__ and __nodebug__, a __target__ of
 * "avx512vl,avx512vnni,no-evex512", and a __min_vector_width__ of 128. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
                 __min_vector_width__(128)))
/* Default attribute set for the 256-bit (__m256i) functions in this file:
 * __always_inline__ and __nodebug__, a __target__ of
 * "avx512vl,avx512vnni,no-evex512", and a __min_vector_width__ of 256. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
                 __min_vector_width__(256)))
26 | |
/// Unsigned-by-signed byte dot product with 32-bit accumulation (256-bit).
///
/// For each 32-bit lane, the four unsigned 8-bit integers of \a A are
/// multiplied with the four corresponding signed 8-bit integers of \a B,
/// giving four intermediate signed 16-bit results. Those four results are
/// added to the 32-bit integer in the same lane of \a S, and the packed
/// 32-bit sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B)                                           \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A),               \
                                       (__v8si)(B)))
46 | |
/// Unsigned-by-signed byte dot product with saturating 32-bit accumulation
/// (256-bit).
///
/// For each 32-bit lane, the four unsigned 8-bit integers of \a A are
/// multiplied with the four corresponding signed 8-bit integers of \a B,
/// giving four intermediate signed 16-bit results. Those four results are
/// added to the 32-bit integer in the same lane of \a S using signed
/// saturation, and the packed 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B)                                          \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A),              \
                                        (__v8si)(B)))
66 | |
/// Signed 16-bit dot product with 32-bit accumulation (256-bit).
///
/// For each 32-bit lane, the two signed 16-bit integers of \a A are
/// multiplied with the two corresponding signed 16-bit integers of \a B,
/// giving two intermediate signed 32-bit results. Those two results are added
/// to the 32-bit integer in the same lane of \a S, and the packed 32-bit sums
/// are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B)                                           \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A),               \
                                       (__v8si)(B)))
84 | |
/// Signed 16-bit dot product with saturating 32-bit accumulation (256-bit).
///
/// For each 32-bit lane, the two signed 16-bit integers of \a A are
/// multiplied with the two corresponding signed 16-bit integers of \a B,
/// giving two intermediate signed 32-bit results. Those two results are added
/// to the 32-bit integer in the same lane of \a S using signed saturation,
/// and the packed 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B)                                          \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A),              \
                                        (__v8si)(B)))
102 | |
/// Unsigned-by-signed byte dot product with 32-bit accumulation (128-bit).
///
/// For each 32-bit lane, the four unsigned 8-bit integers of \a A are
/// multiplied with the four corresponding signed 8-bit integers of \a B,
/// giving four intermediate signed 16-bit results. Those four results are
/// added to the 32-bit integer in the same lane of \a S, and the packed
/// 32-bit sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B)                                              \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A),               \
                                       (__v4si)(B)))
122 | |
/// Unsigned-by-signed byte dot product with saturating 32-bit accumulation
/// (128-bit).
///
/// For each 32-bit lane, the four unsigned 8-bit integers of \a A are
/// multiplied with the four corresponding signed 8-bit integers of \a B,
/// giving four intermediate signed 16-bit results. Those four results are
/// added to the 32-bit integer in the same lane of \a S using signed
/// saturation, and the packed 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B)                                             \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A),              \
                                        (__v4si)(B)))
142 | |
/// Signed 16-bit dot product with 32-bit accumulation (128-bit).
///
/// For each 32-bit lane, the two signed 16-bit integers of \a A are
/// multiplied with the two corresponding signed 16-bit integers of \a B,
/// giving two intermediate signed 32-bit results. Those two results are added
/// to the 32-bit integer in the same lane of \a S, and the packed 32-bit sums
/// are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B)                                              \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A),               \
                                       (__v4si)(B)))
160 | |
/// Signed 16-bit dot product with saturating 32-bit accumulation (128-bit).
///
/// For each 32-bit lane, the two signed 16-bit integers of \a A are
/// multiplied with the two corresponding signed 16-bit integers of \a B,
/// giving two intermediate signed 32-bit results. Those two results are added
/// to the 32-bit integer in the same lane of \a S using signed saturation,
/// and the packed 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B)                                             \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A),              \
                                        (__v4si)(B)))
178 | |
179 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
180 | _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) |
181 | { |
182 | return (__m256i)__builtin_ia32_selectd_256(__U, |
183 | (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), |
184 | (__v8si)__S); |
185 | } |
186 | |
187 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
188 | _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) |
189 | { |
190 | return (__m256i)__builtin_ia32_selectd_256(__U, |
191 | (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), |
192 | (__v8si)_mm256_setzero_si256()); |
193 | } |
194 | |
195 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
196 | _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) |
197 | { |
198 | return (__m256i)__builtin_ia32_selectd_256(__U, |
199 | (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), |
200 | (__v8si)__S); |
201 | } |
202 | |
203 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
204 | _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) |
205 | { |
206 | return (__m256i)__builtin_ia32_selectd_256(__U, |
207 | (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), |
208 | (__v8si)_mm256_setzero_si256()); |
209 | } |
210 | |
211 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
212 | _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) |
213 | { |
214 | return (__m256i)__builtin_ia32_selectd_256(__U, |
215 | (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), |
216 | (__v8si)__S); |
217 | } |
218 | |
219 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
220 | _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) |
221 | { |
222 | return (__m256i)__builtin_ia32_selectd_256(__U, |
223 | (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), |
224 | (__v8si)_mm256_setzero_si256()); |
225 | } |
226 | |
227 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
228 | _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) |
229 | { |
230 | return (__m256i)__builtin_ia32_selectd_256(__U, |
231 | (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), |
232 | (__v8si)__S); |
233 | } |
234 | |
235 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
236 | _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) |
237 | { |
238 | return (__m256i)__builtin_ia32_selectd_256(__U, |
239 | (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), |
240 | (__v8si)_mm256_setzero_si256()); |
241 | } |
242 | |
243 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
244 | _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) |
245 | { |
246 | return (__m128i)__builtin_ia32_selectd_128(__U, |
247 | (__v4si)_mm_dpbusd_epi32(__S, __A, __B), |
248 | (__v4si)__S); |
249 | } |
250 | |
251 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
252 | _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) |
253 | { |
254 | return (__m128i)__builtin_ia32_selectd_128(__U, |
255 | (__v4si)_mm_dpbusd_epi32(__S, __A, __B), |
256 | (__v4si)_mm_setzero_si128()); |
257 | } |
258 | |
259 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
260 | _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) |
261 | { |
262 | return (__m128i)__builtin_ia32_selectd_128(__U, |
263 | (__v4si)_mm_dpbusds_epi32(__S, __A, __B), |
264 | (__v4si)__S); |
265 | } |
266 | |
267 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
268 | _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) |
269 | { |
270 | return (__m128i)__builtin_ia32_selectd_128(__U, |
271 | (__v4si)_mm_dpbusds_epi32(__S, __A, __B), |
272 | (__v4si)_mm_setzero_si128()); |
273 | } |
274 | |
275 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
276 | _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) |
277 | { |
278 | return (__m128i)__builtin_ia32_selectd_128(__U, |
279 | (__v4si)_mm_dpwssd_epi32(__S, __A, __B), |
280 | (__v4si)__S); |
281 | } |
282 | |
283 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
284 | _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) |
285 | { |
286 | return (__m128i)__builtin_ia32_selectd_128(__U, |
287 | (__v4si)_mm_dpwssd_epi32(__S, __A, __B), |
288 | (__v4si)_mm_setzero_si128()); |
289 | } |
290 | |
291 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
292 | _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) |
293 | { |
294 | return (__m128i)__builtin_ia32_selectd_128(__U, |
295 | (__v4si)_mm_dpwssds_epi32(__S, __A, __B), |
296 | (__v4si)__S); |
297 | } |
298 | |
299 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
300 | _mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) |
301 | { |
302 | return (__m128i)__builtin_ia32_selectd_128(__U, |
303 | (__v4si)_mm_dpwssds_epi32(__S, __A, __B), |
304 | (__v4si)_mm_setzero_si128()); |
305 | } |
306 | |
/* The attribute shorthands are only needed by the definitions above; remove
 * them so they are no longer defined after this header. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
309 | |
310 | #endif |
311 |
Warning: This file is not a C or C++ file. It does not have highlighting.