Warning: This file is not a C or C++ file. It does not have highlighting.
1 | /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===-----------------------------------------------------------------------=== |
8 | */ |
9 | |
10 | #ifndef __IMMINTRIN_H |
11 | #error \ |
12 | "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead." |
13 | #endif // __IMMINTRIN_H |
14 | |
15 | #ifdef __SSE2__ |
16 | |
17 | #ifndef __AVXNECONVERTINTRIN_H |
18 | #define __AVXNECONVERTINTRIN_H |
19 | |
/* Define the default attributes for the functions in this file.
 *
 * Each macro bundles __always_inline__, __nodebug__ and
 * __target__("avxneconvert") with a __min_vector_width__ hint; the 128/256
 * suffix of the macro name matches that width, and the intrinsics below pick
 * the macro that matches the suffix. Both macros are #undef'd again at the
 * end of this header. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(256)))
27 | |
28 | /// Convert scalar BF16 (16-bit) floating-point element |
29 | /// stored at memory locations starting at location \a __A to a |
30 | /// single-precision (32-bit) floating-point, broadcast it to packed |
31 | /// single-precision (32-bit) floating-point elements, and store the results in |
32 | /// \a dst. |
33 | /// |
34 | /// \headerfile <x86intrin.h> |
35 | /// |
36 | /// \code |
37 | /// _mm_bcstnebf16_ps(const void *__A); |
38 | /// \endcode |
39 | /// |
40 | /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction. |
41 | /// |
42 | /// \param __A |
43 | /// A pointer to a 16-bit memory location. The address of the memory |
44 | /// location does not have to be aligned. |
45 | /// \returns |
46 | /// A 128-bit vector of [4 x float]. |
47 | /// |
48 | /// \code{.operation} |
49 | /// b := Convert_BF16_To_FP32(MEM[__A+15:__A]) |
50 | /// FOR j := 0 to 3 |
51 | /// m := j*32 |
52 | /// dst[m+31:m] := b |
53 | /// ENDFOR |
54 | /// dst[MAX:128] := 0 |
55 | /// \endcode |
56 | static __inline__ __m128 __DEFAULT_FN_ATTRS128 |
57 | _mm_bcstnebf16_ps(const void *__A) { |
58 | return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A); |
59 | } |
60 | |
61 | /// Convert scalar BF16 (16-bit) floating-point element |
62 | /// stored at memory locations starting at location \a __A to a |
63 | /// single-precision (32-bit) floating-point, broadcast it to packed |
64 | /// single-precision (32-bit) floating-point elements, and store the results in |
65 | /// \a dst. |
66 | /// |
67 | /// \headerfile <x86intrin.h> |
68 | /// |
69 | /// \code |
70 | /// _mm256_bcstnebf16_ps(const void *__A); |
71 | /// \endcode |
72 | /// |
73 | /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction. |
74 | /// |
75 | /// \param __A |
76 | /// A pointer to a 16-bit memory location. The address of the memory |
77 | /// location does not have to be aligned. |
78 | /// \returns |
79 | /// A 256-bit vector of [8 x float]. |
80 | /// |
81 | /// \code{.operation} |
82 | /// b := Convert_BF16_To_FP32(MEM[__A+15:__A]) |
83 | /// FOR j := 0 to 7 |
84 | /// m := j*32 |
85 | /// dst[m+31:m] := b |
86 | /// ENDFOR |
87 | /// dst[MAX:256] := 0 |
88 | /// \endcode |
89 | static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
90 | _mm256_bcstnebf16_ps(const void *__A) { |
91 | return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A); |
92 | } |
93 | |
94 | /// Convert scalar half-precision (16-bit) floating-point element |
95 | /// stored at memory locations starting at location \a __A to a |
96 | /// single-precision (32-bit) floating-point, broadcast it to packed |
97 | /// single-precision (32-bit) floating-point elements, and store the results in |
98 | /// \a dst. |
99 | /// |
100 | /// \headerfile <x86intrin.h> |
101 | /// |
102 | /// \code |
103 | /// _mm_bcstnesh_ps(const void *__A); |
104 | /// \endcode |
105 | /// |
106 | /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction. |
107 | /// |
108 | /// \param __A |
109 | /// A pointer to a 16-bit memory location. The address of the memory |
110 | /// location does not have to be aligned. |
111 | /// \returns |
112 | /// A 128-bit vector of [4 x float]. |
113 | /// |
114 | /// \code{.operation} |
115 | /// b := Convert_FP16_To_FP32(MEM[__A+15:__A]) |
116 | /// FOR j := 0 to 3 |
117 | /// m := j*32 |
118 | /// dst[m+31:m] := b |
119 | /// ENDFOR |
120 | /// dst[MAX:128] := 0 |
121 | /// \endcode |
122 | static __inline__ __m128 __DEFAULT_FN_ATTRS128 |
123 | _mm_bcstnesh_ps(const void *__A) { |
124 | return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A); |
125 | } |
126 | |
127 | /// Convert scalar half-precision (16-bit) floating-point element |
128 | /// stored at memory locations starting at location \a __A to a |
129 | /// single-precision (32-bit) floating-point, broadcast it to packed |
130 | /// single-precision (32-bit) floating-point elements, and store the results in |
131 | /// \a dst. |
132 | /// |
133 | /// \headerfile <x86intrin.h> |
134 | /// |
135 | /// \code |
136 | /// _mm256_bcstnesh_ps(const void *__A); |
137 | /// \endcode |
138 | /// |
139 | /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction. |
140 | /// |
141 | /// \param __A |
142 | /// A pointer to a 16-bit memory location. The address of the memory |
143 | /// location does not have to be aligned. |
144 | /// \returns |
145 | /// A 256-bit vector of [8 x float]. |
146 | /// |
147 | /// \code{.operation} |
148 | /// b := Convert_FP16_To_FP32(MEM[__A+15:__A]) |
149 | /// FOR j := 0 to 7 |
150 | /// m := j*32 |
151 | /// dst[m+31:m] := b |
152 | /// ENDFOR |
153 | /// dst[MAX:256] := 0 |
154 | /// \endcode |
155 | static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
156 | _mm256_bcstnesh_ps(const void *__A) { |
157 | return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A); |
158 | } |
159 | |
160 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements |
161 | /// stored at memory locations starting at location \a __A to packed |
162 | /// single-precision (32-bit) floating-point elements, and store the results in |
163 | /// \a dst. |
164 | /// |
165 | /// \headerfile <x86intrin.h> |
166 | /// |
167 | /// \code |
168 | /// _mm_cvtneebf16_ps(const __m128bh *__A); |
169 | /// \endcode |
170 | /// |
171 | /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction. |
172 | /// |
173 | /// \param __A |
174 | /// A pointer to a 128-bit memory location containing 8 consecutive |
175 | /// BF16 (16-bit) floating-point values. |
176 | /// \returns |
177 | /// A 128-bit vector of [4 x float]. |
178 | /// |
179 | /// \code{.operation} |
180 | /// FOR j := 0 to 3 |
181 | /// k := j*2 |
182 | /// i := k*16 |
183 | /// m := j*32 |
184 | /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) |
185 | /// ENDFOR |
186 | /// dst[MAX:128] := 0 |
187 | /// \endcode |
188 | static __inline__ __m128 __DEFAULT_FN_ATTRS128 |
189 | _mm_cvtneebf16_ps(const __m128bh *__A) { |
190 | return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A); |
191 | } |
192 | |
193 | /// Convert packed BF16 (16-bit) floating-point even-indexed elements |
194 | /// stored at memory locations starting at location \a __A to packed |
195 | /// single-precision (32-bit) floating-point elements, and store the results in |
196 | /// \a dst. |
197 | /// |
198 | /// \headerfile <x86intrin.h> |
199 | /// |
200 | /// \code |
201 | /// _mm256_cvtneebf16_ps(const __m256bh *__A); |
202 | /// \endcode |
203 | /// |
204 | /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction. |
205 | /// |
206 | /// \param __A |
207 | /// A pointer to a 256-bit memory location containing 16 consecutive |
208 | /// BF16 (16-bit) floating-point values. |
209 | /// \returns |
210 | /// A 256-bit vector of [8 x float]. |
211 | /// |
212 | /// \code{.operation} |
213 | /// FOR j := 0 to 7 |
214 | /// k := j*2 |
215 | /// i := k*16 |
216 | /// m := j*32 |
217 | /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) |
218 | /// ENDFOR |
219 | /// dst[MAX:256] := 0 |
220 | /// \endcode |
221 | static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
222 | _mm256_cvtneebf16_ps(const __m256bh *__A) { |
223 | return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A); |
224 | } |
225 | |
226 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements |
227 | /// stored at memory locations starting at location \a __A to packed |
228 | /// single-precision (32-bit) floating-point elements, and store the results in |
229 | /// \a dst. |
230 | /// |
231 | /// \headerfile <x86intrin.h> |
232 | /// |
233 | /// \code |
234 | /// _mm_cvtneeph_ps(const __m128h *__A); |
235 | /// \endcode |
236 | /// |
237 | /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction. |
238 | /// |
239 | /// \param __A |
240 | /// A pointer to a 128-bit memory location containing 8 consecutive |
241 | /// half-precision (16-bit) floating-point values. |
242 | /// \returns |
243 | /// A 128-bit vector of [4 x float]. |
244 | /// |
245 | /// \code{.operation} |
246 | /// FOR j := 0 to 3 |
247 | /// k := j*2 |
248 | /// i := k*16 |
249 | /// m := j*32 |
250 | /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) |
251 | /// ENDFOR |
252 | /// dst[MAX:128] := 0 |
253 | /// \endcode |
254 | static __inline__ __m128 __DEFAULT_FN_ATTRS128 |
255 | _mm_cvtneeph_ps(const __m128h *__A) { |
256 | return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A); |
257 | } |
258 | |
259 | /// Convert packed half-precision (16-bit) floating-point even-indexed elements |
260 | /// stored at memory locations starting at location \a __A to packed |
261 | /// single-precision (32-bit) floating-point elements, and store the results in |
262 | /// \a dst. |
263 | /// |
264 | /// \headerfile <x86intrin.h> |
265 | /// |
266 | /// \code |
267 | /// _mm256_cvtneeph_ps(const __m256h *__A); |
268 | /// \endcode |
269 | /// |
270 | /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction. |
271 | /// |
272 | /// \param __A |
273 | /// A pointer to a 256-bit memory location containing 16 consecutive |
274 | /// half-precision (16-bit) floating-point values. |
275 | /// \returns |
276 | /// A 256-bit vector of [8 x float]. |
277 | /// |
278 | /// \code{.operation} |
279 | /// FOR j := 0 to 7 |
280 | /// k := j*2 |
281 | /// i := k*16 |
282 | /// m := j*32 |
283 | /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) |
284 | /// ENDFOR |
285 | /// dst[MAX:256] := 0 |
286 | /// \endcode |
287 | static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
288 | _mm256_cvtneeph_ps(const __m256h *__A) { |
289 | return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A); |
290 | } |
291 | |
292 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements |
293 | /// stored at memory locations starting at location \a __A to packed |
294 | /// single-precision (32-bit) floating-point elements, and store the results in |
295 | /// \a dst. |
296 | /// |
297 | /// \headerfile <x86intrin.h> |
298 | /// |
299 | /// \code |
300 | /// _mm_cvtneobf16_ps(const __m128bh *__A); |
301 | /// \endcode |
302 | /// |
303 | /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction. |
304 | /// |
305 | /// \param __A |
306 | /// A pointer to a 128-bit memory location containing 8 consecutive |
307 | /// BF16 (16-bit) floating-point values. |
308 | /// \returns |
309 | /// A 128-bit vector of [4 x float]. |
310 | /// |
311 | /// \code{.operation} |
312 | /// FOR j := 0 to 3 |
313 | /// k := j*2+1 |
314 | /// i := k*16 |
315 | /// m := j*32 |
316 | /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) |
317 | /// ENDFOR |
318 | /// dst[MAX:128] := 0 |
319 | /// \endcode |
320 | static __inline__ __m128 __DEFAULT_FN_ATTRS128 |
321 | _mm_cvtneobf16_ps(const __m128bh *__A) { |
322 | return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A); |
323 | } |
324 | |
325 | /// Convert packed BF16 (16-bit) floating-point odd-indexed elements |
326 | /// stored at memory locations starting at location \a __A to packed |
327 | /// single-precision (32-bit) floating-point elements, and store the results in |
328 | /// \a dst. |
329 | /// |
330 | /// \headerfile <x86intrin.h> |
331 | /// |
332 | /// \code |
333 | /// _mm256_cvtneobf16_ps(const __m256bh *__A); |
334 | /// \endcode |
335 | /// |
336 | /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction. |
337 | /// |
338 | /// \param __A |
339 | /// A pointer to a 256-bit memory location containing 16 consecutive |
340 | /// BF16 (16-bit) floating-point values. |
341 | /// \returns |
342 | /// A 256-bit vector of [8 x float]. |
343 | /// |
344 | /// \code{.operation} |
345 | /// FOR j := 0 to 7 |
346 | /// k := j*2+1 |
347 | /// i := k*16 |
348 | /// m := j*32 |
349 | /// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i]) |
350 | /// ENDFOR |
351 | /// dst[MAX:256] := 0 |
352 | /// \endcode |
353 | static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
354 | _mm256_cvtneobf16_ps(const __m256bh *__A) { |
355 | return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A); |
356 | } |
357 | |
358 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements |
359 | /// stored at memory locations starting at location \a __A to packed |
360 | /// single-precision (32-bit) floating-point elements, and store the results in |
361 | /// \a dst. |
362 | /// |
363 | /// \headerfile <x86intrin.h> |
364 | /// |
365 | /// \code |
366 | /// _mm_cvtneoph_ps(const __m128h *__A); |
367 | /// \endcode |
368 | /// |
369 | /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction. |
370 | /// |
371 | /// \param __A |
372 | /// A pointer to a 128-bit memory location containing 8 consecutive |
373 | /// half-precision (16-bit) floating-point values. |
374 | /// \returns |
375 | /// A 128-bit vector of [4 x float]. |
376 | /// |
377 | /// \code{.operation} |
378 | /// FOR j := 0 to 3 |
379 | /// k := j*2+1 |
380 | /// i := k*16 |
381 | /// m := j*32 |
382 | /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) |
383 | /// ENDFOR |
384 | /// dst[MAX:128] := 0 |
385 | /// \endcode |
386 | static __inline__ __m128 __DEFAULT_FN_ATTRS128 |
387 | _mm_cvtneoph_ps(const __m128h *__A) { |
388 | return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A); |
389 | } |
390 | |
391 | /// Convert packed half-precision (16-bit) floating-point odd-indexed elements |
392 | /// stored at memory locations starting at location \a __A to packed |
393 | /// single-precision (32-bit) floating-point elements, and store the results in |
394 | /// \a dst. |
395 | /// |
396 | /// \headerfile <x86intrin.h> |
397 | /// |
398 | /// \code |
399 | /// _mm256_cvtneoph_ps(const __m256h *__A); |
400 | /// \endcode |
401 | /// |
402 | /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction. |
403 | /// |
404 | /// \param __A |
405 | /// A pointer to a 256-bit memory location containing 16 consecutive |
406 | /// half-precision (16-bit) floating-point values. |
407 | /// \returns |
408 | /// A 256-bit vector of [8 x float]. |
409 | /// |
410 | /// \code{.operation} |
411 | /// FOR j := 0 to 7 |
412 | /// k := j*2+1 |
413 | /// i := k*16 |
414 | /// m := j*32 |
415 | /// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i]) |
416 | /// ENDFOR |
417 | /// dst[MAX:256] := 0 |
418 | /// \endcode |
419 | static __inline__ __m256 __DEFAULT_FN_ATTRS256 |
420 | _mm256_cvtneoph_ps(const __m256h *__A) { |
421 | return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A); |
422 | } |
423 | |
424 | /// Convert packed single-precision (32-bit) floating-point elements in \a __A |
425 | /// to packed BF16 (16-bit) floating-point elements, and store the results in \a |
426 | /// dst. |
427 | /// |
428 | /// \headerfile <x86intrin.h> |
429 | /// |
430 | /// \code |
431 | /// _mm_cvtneps_avx_pbh(__m128 __A); |
432 | /// \endcode |
433 | /// |
434 | /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction. |
435 | /// |
436 | /// \param __A |
437 | /// A 128-bit vector of [4 x float]. |
438 | /// \returns |
439 | /// A 128-bit vector of [8 x bfloat]. |
440 | /// |
441 | /// \code{.operation} |
442 | /// FOR j := 0 to 3 |
443 | /// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) |
444 | /// ENDFOR |
445 | /// dst[MAX:128] := 0 |
446 | /// \endcode |
447 | static __inline__ __m128bh __DEFAULT_FN_ATTRS128 |
448 | _mm_cvtneps_avx_pbh(__m128 __A) { |
449 | return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A); |
450 | } |
451 | |
452 | /// Convert packed single-precision (32-bit) floating-point elements in \a __A |
453 | /// to packed BF16 (16-bit) floating-point elements, and store the results in \a |
454 | /// dst. |
455 | /// |
456 | /// \headerfile <x86intrin.h> |
457 | /// |
458 | /// \code |
459 | /// _mm256_cvtneps_avx_pbh(__m256 __A); |
460 | /// \endcode |
461 | /// |
462 | /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction. |
463 | /// |
464 | /// \param __A |
465 | /// A 256-bit vector of [8 x float]. |
466 | /// \returns |
467 | /// A 128-bit vector of [8 x bfloat]. |
468 | /// |
469 | /// \code{.operation} |
470 | /// FOR j := 0 to 7 |
471 | /// dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) |
472 | /// ENDFOR |
473 | /// dst[MAX:128] := 0 |
474 | /// \endcode |
475 | static __inline__ __m128bh __DEFAULT_FN_ATTRS256 |
476 | _mm256_cvtneps_avx_pbh(__m256 __A) { |
477 | return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A); |
478 | } |
479 | |
480 | #undef __DEFAULT_FN_ATTRS128 |
481 | #undef __DEFAULT_FN_ATTRS256 |
482 | |
483 | #endif // __AVXNECONVERTINTRIN_H |
484 | #endif // __SSE2__ |
485 |
Warning: This file is not a C or C++ file. It does not have highlighting.