avx512bf16intrin.h source code [clang/lib/Headers/avx512bf16intrin.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*/
9	#ifndef __IMMINTRIN_H
10	#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11	#endif
12
13	#ifdef __SSE2__
14
15	#ifndef __AVX512BF16INTRIN_H
16	#define __AVX512BF16INTRIN_H
17
18	typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19	typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
20	typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
21
/* Attributes applied to the 512-bit intrinsics below: always inline, no debug
 * info, the avx512bf16 and evex512 target features, and a minimum vector
 * width of 512 bits. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx512bf16,evex512"),                             \
                 __min_vector_width__(512)))
/* Attributes applied to the scalar intrinsic below: always inline, no debug
 * info, avx512bf16 enabled and evex512 explicitly disabled. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("avx512bf16,no-evex512")))
28
29	/// Convert One BF16 Data to One Single Float Data.
30	///
31	/// \headerfile <x86intrin.h>
32	///
33	/// This intrinsic does not correspond to a specific instruction.
34	///
35	/// \param __A
36	/// A bfloat data.
37	/// \returns A float data whose sign field and exponent field keep unchanged,
38	/// and fraction field is extended to 23 bits.
39	static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40	return __builtin_ia32_cvtsbf162ss_32(__A);
41	}
42
43	/// Convert Two Packed Single Data to One Packed BF16 Data.
44	///
45	/// \headerfile <x86intrin.h>
46	///
47	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48	///
49	/// \param __A
50	/// A 512-bit vector of [16 x float].
51	/// \param __B
52	/// A 512-bit vector of [16 x float].
53	/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54	/// conversion of __B, and higher 256 bits come from conversion of __A.
55	static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56	_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57	return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58	(__v16sf) __B);
59	}
60
61	/// Convert Two Packed Single Data to One Packed BF16 Data.
62	///
63	/// \headerfile <x86intrin.h>
64	///
65	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66	///
67	/// \param __A
68	/// A 512-bit vector of [16 x float].
69	/// \param __B
70	/// A 512-bit vector of [16 x float].
71	/// \param __W
72	/// A 512-bit vector of [32 x bfloat].
73	/// \param __U
74	/// A 32-bit mask value specifying what is chosen for each element.
75	/// A 1 means conversion of __A or __B. A 0 means element from __W.
76	/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77	/// conversion of __B, and higher 256 bits come from conversion of __A.
78	static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79	_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80	return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81	(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82	(__v32bf)__W);
83	}
84
85	/// Convert Two Packed Single Data to One Packed BF16 Data.
86	///
87	/// \headerfile <x86intrin.h>
88	///
89	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90	///
91	/// \param __A
92	/// A 512-bit vector of [16 x float].
93	/// \param __B
94	/// A 512-bit vector of [16 x float].
95	/// \param __U
96	/// A 32-bit mask value specifying what is chosen for each element.
97	/// A 1 means conversion of __A or __B. A 0 means element is zero.
98	/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99	/// conversion of __B, and higher 256 bits come from conversion of __A.
100	static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101	_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102	return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103	(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104	(__v32bf)_mm512_setzero_si512());
105	}
106
107	/// Convert Packed Single Data to Packed BF16 Data.
108	///
109	/// \headerfile <x86intrin.h>
110	///
111	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
112	///
113	/// \param __A
114	/// A 512-bit vector of [16 x float].
115	/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116	static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117	_mm512_cvtneps_pbh(__m512 __A) {
118	return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119	(__v16bf)_mm256_undefined_si256(),
120	(__mmask16)-1);
121	}
122
123	/// Convert Packed Single Data to Packed BF16 Data.
124	///
125	/// \headerfile <x86intrin.h>
126	///
127	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
128	///
129	/// \param __A
130	/// A 512-bit vector of [16 x float].
131	/// \param __W
132	/// A 256-bit vector of [16 x bfloat].
133	/// \param __U
134	/// A 16-bit mask value specifying what is chosen for each element.
135	/// A 1 means conversion of __A. A 0 means element from __W.
136	/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137	static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138	_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139	return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140	(__v16bf)__W,
141	(__mmask16)__U);
142	}
143
144	/// Convert Packed Single Data to Packed BF16 Data.
145	///
146	/// \headerfile <x86intrin.h>
147	///
148	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
149	///
150	/// \param __A
151	/// A 512-bit vector of [16 x float].
152	/// \param __U
153	/// A 16-bit mask value specifying what is chosen for each element.
154	/// A 1 means conversion of __A. A 0 means element is zero.
155	/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156	static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157	_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158	return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159	(__v16bf)_mm256_setzero_si256(),
160	(__mmask16)__U);
161	}
162
163	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
164	///
165	/// \headerfile <x86intrin.h>
166	///
167	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
168	///
169	/// \param __A
170	/// A 512-bit vector of [32 x bfloat].
171	/// \param __B
172	/// A 512-bit vector of [32 x bfloat].
173	/// \param __D
174	/// A 512-bit vector of [16 x float].
175	/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
176	/// __A, __B and __D
177	static __inline__ __m512 __DEFAULT_FN_ATTRS512
178	_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179	return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180	(__v32bf) __A,
181	(__v32bf) __B);
182	}
183
184	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
185	///
186	/// \headerfile <x86intrin.h>
187	///
188	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
189	///
190	/// \param __A
191	/// A 512-bit vector of [32 x bfloat].
192	/// \param __B
193	/// A 512-bit vector of [32 x bfloat].
194	/// \param __D
195	/// A 512-bit vector of [16 x float].
196	/// \param __U
197	/// A 16-bit mask value specifying what is chosen for each element.
198	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199	/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
200	/// __A, __B and __D
201	static __inline__ __m512 __DEFAULT_FN_ATTRS512
202	_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203	return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204	(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205	(__v16sf)__D);
206	}
207
208	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
209	///
210	/// \headerfile <x86intrin.h>
211	///
212	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
213	///
214	/// \param __A
215	/// A 512-bit vector of [32 x bfloat].
216	/// \param __B
217	/// A 512-bit vector of [32 x bfloat].
218	/// \param __D
219	/// A 512-bit vector of [16 x float].
220	/// \param __U
221	/// A 16-bit mask value specifying what is chosen for each element.
222	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223	/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
224	/// __A, __B and __D
225	static __inline__ __m512 __DEFAULT_FN_ATTRS512
226	_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227	return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228	(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229	(__v16sf)_mm512_setzero_si512());
230	}
231
232	/// Convert Packed BF16 Data to Packed float Data.
233	///
234	/// \headerfile <x86intrin.h>
235	///
236	/// \param __A
237	/// A 256-bit vector of [16 x bfloat].
238	/// \returns A 512-bit vector of [16 x float] come from conversion of __A
239	static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240	return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241	(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
242	}
243
244	/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
245	///
246	/// \headerfile <x86intrin.h>
247	///
248	/// \param __U
249	/// A 16-bit mask. Elements are zeroed out when the corresponding mask
250	/// bit is not set.
251	/// \param __A
252	/// A 256-bit vector of [16 x bfloat].
253	/// \returns A 512-bit vector of [16 x float] come from conversion of __A
254	static __inline__ __m512 __DEFAULT_FN_ATTRS512
255	_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256	return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257	(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
258	}
259
260	/// Convert Packed BF16 Data to Packed float Data using merging mask.
261	///
262	/// \headerfile <x86intrin.h>
263	///
264	/// \param __S
265	/// A 512-bit vector of [16 x float]. Elements are copied from __S when
266	/// the corresponding mask bit is not set.
267	/// \param __U
268	/// A 16-bit mask.
269	/// \param __A
270	/// A 256-bit vector of [16 x bfloat].
271	/// \returns A 512-bit vector of [16 x float] come from conversion of __A
272	static __inline__ __m512 __DEFAULT_FN_ATTRS512
273	_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
274	return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
275	(__m512i)__S, (__mmask16)__U,
276	(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
277	}
278
/* Remove the attribute helper macros defined earlier in this header. */
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS
281
282	#endif
283	#endif
284

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/avx512bf16intrin.h