1 | /* Function exp2f vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * Single precision mantissa represented as: 1.b1b2b3 ... b23 |
23 | * Constant for single precision: S = 2^19 x 1.5 |
24 | * |
25 | * 2^X = 2^Xo x 2^{X-Xo} |
26 | * 2^X = 2^K x 2^fo x 2^{X-Xo} |
27 | * 2^X = 2^K x 2^fo x 2^r |
28 | * |
29 | * 2^K --> Manual scaling |
30 | * 2^fo --> Table lookup |
31 | * r --> 1 + poly (r = X - Xo) |
32 | * |
33 | * Xo = K + fo |
34 | * Xo = K + 0.x1x2x3x4 |
35 | * |
36 | * r = X - Xo |
37 | * = Vreduce(X, imm) |
38 | * = X - VRndScale(X, imm), where Xo = VRndScale(X, imm) |
39 | * |
40 | * Rnd(S + X) = S + Xo, where S is selected as S = 2^19 x 1.5 |
41 | * S + X = S + floor(X) + 0.x1x2x3x4 |
42 | * Rnd(S + X) = Rnd(2^19 x 1.5 + X) |
43 | * (Note: 2^exp x 1.b1b2b3 ... b23, 2^{exp-23} = 2^-4 for exp=19) |
44 | * |
45 | * exp2(x) = 2^K x 2^fo x (1 + poly(r)), where 2^r = 1 + poly(r) |
46 | * |
47 | * Scale back: |
48 | * dest = src1 x 2^floor(src2) |
49 | * |
50 | * |
51 | */ |
52 | |
/* Offsets for data table __svml_sexp2_data_internal_avx512.
   Each field of the table is one full ZMM vector: 16 x 32-bit
   words = 64 bytes, so consecutive offsets step by 64.  The
   offsets must match the layout of the data emitted at the end
   of this file.  */
#define Frac_PowerS0 0
#define poly_coeff1 64
#define poly_coeff2 128
#define poly_coeff3 192
#define add_const 256
#define AbsMask 320
#define Threshold 384

#include <sysdep.h>
64 | |
	.section .text.evex512, "ax" , @progbits

/* __m512 _ZGVeN16v_exp2f_skx (__m512 x)
   16-lane vectorized single-precision exp2 for AVX-512 (SKX).
   ABI:    SysV AMD64 vector ABI.
   In:     zmm0 = x (16 floats)
   Out:    zmm0 = 2^x per lane
   For lanes with |x| > Threshold (126.0) the scalar exp2f is
   called per-lane to get correct over/underflow behavior and
   errno.  Stack: frame pointer kept; rsp aligned to 64 bytes,
   192 bytes of spill space for the special-value path.  */
ENTRY(_ZGVeN16v_exp2f_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* 64-byte-align the stack for full-width ZMM spills.  */
	andq	$-64, %rsp
	subq	$192, %rsp
	/* zmm3 = S = 1.5 * 2^19 (rounding constant; see header).  */
	vmovups	add_const+__svml_sexp2_data_internal_avx512(%rip), %zmm3

	/*
	 * Reduced argument: r = x - RndScale(x) with 4 fractional
	 * bits kept (imm 65 = fixed-point scale 2^-4, round-down),
	 * where VREDUCE is available.  So r = X - Xo, |r| <= 2^-4.
	 */
	vreduceps $65, {sae}, %zmm0, %zmm6
	vmovups	poly_coeff3+__svml_sexp2_data_internal_avx512(%rip), %zmm5
	vmovups	poly_coeff2+__svml_sexp2_data_internal_avx512(%rip), %zmm10
	vmovups	Threshold+__svml_sexp2_data_internal_avx512(%rip), %zmm2

	/*
	 *
	 * HA
	 * Variables and constants
	 * Load constants and vector(s)
	 */
	vmovups	poly_coeff1+__svml_sexp2_data_internal_avx512(%rip), %zmm7

	/*
	 * zmm4 = x + S rounded down: integer form of K+0.b1b2b3b4 in
	 * lower bits - call K_plus_f0.  Adding S = 2^19*1.5 forces the
	 * mantissa of a normalized single precision FP (1.b1b2...b23)
	 * to hold K and the top 4 fraction bits of x.
	 */
	vaddps	{rd-sae}, %zmm3, %zmm0, %zmm4
	/* zmm1 = |x| for the range check.  */
	vandps	AbsMask+__svml_sexp2_data_internal_avx512(%rip), %zmm0, %zmm1

	/* c3*r + c2 */
	vfmadd231ps {rn-sae}, %zmm6, %zmm5, %zmm10
	/* k0 lane mask: |x| > 126.0 (predicate 30 = GT_OQ) =>
	   special-input lane needing the scalar fallback.  */
	vcmpps	$30, {sae}, %zmm2, %zmm1, %k0

	/* c3*r^2 + c2*r + c1 */
	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm10

	/* Table value T = 2^(0.b1b2b3b4): VPERMPS indexes the 16-entry
	   Frac_PowerS0 table with the low 4 bits of each lane of zmm4.  */
	vpermps	__svml_sexp2_data_internal_avx512(%rip), %zmm4, %zmm9
	/* edx = 16-bit special-lane mask for the branch below.  */
	kmovw	%k0, %edx

	/* T*r */
	vmulps	{rn-sae}, %zmm6, %zmm9, %zmm8

	/* T + T*r*(c3*r^2 + c2*r + c1), i.e. T * (1 + poly(r)) = T * 2^r.  */
	vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm10

	/* zmm1 = zmm10 * 2^floor(x).  Scaling placed at the end to
	   avoid accuracy loss when T*r*scale underflows.  */
	vscalefps {rn-sae}, %zmm0, %zmm10, %zmm1
	testl	%edx, %edx

	/* Go to special inputs processing branch if any lane had
	   |x| > 126.0.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	/* Move result into the return register.  */
	vmovaps	%zmm1, %zmm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill input (at 64(%rsp)) and fast-path result (at
	   128(%rsp)); special lanes of the result are patched by the
	   scalar calls below.  */
	vmovups	%zmm0, 64(%rsp)
	vmovups	%zmm1, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm1

	/* eax = lane index iterator, starts at 0.  */
	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear upper ZMM state before calling scalar libm code.  */
	vzeroupper
	/* Save callee-saved r12-r14 in the aligned spill area; the
	   CFI expressions below describe their slots relative to the
	   realigned stack.  */
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* r12d = current lane index.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* r13d = special-lane bit mask (survives the PLT calls).  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check: test whether lane r12d is flagged special.
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function if this lane's mask bit is set.  */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop: advance to the next of the 16 lanes.
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes done: restore callee-saved registers and reload
	   the (now patched) result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm1

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm1

	/* Scalar math function call
	 * to process special input: load lane r12d of the saved
	 * input, call exp2f, store the result over the same lane of
	 * the saved output.
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	64(%rsp, %r14, 4), %xmm0
	call	exp2f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_exp2f_skx)
222 | |
	.section .rodata, "a"
	.align 64

/* Reference C layout of the table below (documentation only;
   never compiled unless the typedef guard is defined).  */
#ifdef __svml_sexp2_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Frac_PowerS0[16][1];
	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 add_const[16][1];
	__declspec(align(64)) VUINT32 AbsMask[16][1];
	__declspec(align(64)) VUINT32 Threshold[16][1];
} __svml_sexp2_data_internal_avx512;
#endif
__svml_sexp2_data_internal_avx512:
	/* Frac_PowerS0: lookup table of 2^(j/16) for j = 0..15,
	   e.g. entry 0 = 1.0, entry 8 = sqrt(2) = 0x3FB504F3.  */
	.long	0x3F800000
	.long	0x3F85AAC3
	.long	0x3F8B95C2
	.long	0x3F91C3D3
	.long	0x3F9837F0
	.long	0x3F9EF532
	.long	0x3FA5FED7
	.long	0x3FAD583F
	.long	0x3FB504F3
	.long	0x3FBD08A4
	.long	0x3FC5672A
	.long	0x3FCE248C
	.long	0x3FD744FD
	.long	0x3FE0CCDF
	.long	0x3FEAC0C7
	.long	0x3FF5257D
	.align 64
	/* Degree-3 minimax coefficients for 2^r - 1 on |r| <= 2^-4;
	   c1 ~= ln(2), c2 ~= ln(2)^2/2, c3 ~= ln(2)^3/6 (adjusted).  */
	.long 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222 /* == poly_coeff1 == */
	.align 64
	.long 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B /* == poly_coeff2 == */
	.align 64
	.long 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA /* == poly_coeff3 == */
	.align 64
	/* add_const: S = 1.5 * 2^19 = 786432.0 (0x49400000), the
	   rounding constant from the algorithm header.  */
	.long 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000 /* add_const */
	.align 64
	/* AbsMask: clears the sign bit to form |x|.  */
	.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* AbsMask */
	.align 64
	/* Threshold = 126.0: lanes with |x| above this take the
	   scalar fallback path.  */
	.long 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* Threshold=126.0 */
	.align 64
	.type	__svml_sexp2_data_internal_avx512, @object
	.size	__svml_sexp2_data_internal_avx512, .-__svml_sexp2_data_internal_avx512
271 | |