svml_s_exp2f8_core_avx2.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S]

/* Function exp2f vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   exp2(x) = 2^n * T[j] * (1 + P(y))
 *   where
 *        x = m*(1/K) + y,    y in [-1/K..1/K]
 *        m = n*K + j,        m, n, j - signed integers, j in [-K/2..K/2]
 *
 *        values of 2^(j/K) are tabulated as T[j]
 *
 *        P(y) is a minimax polynomial approximation of exp2(y)-1
 *        on the small interval [-1/K..1/K]
 *
 *  Special cases:
 *
 *   exp2(NaN)  = NaN
 *   exp2(+INF) = +INF
 *   exp2(-INF) = 0
 *   exp2(x)    = 1 for subnormals
 *   For IEEE float
 *     if x >= 128.0 then exp2f(x) overflows
 *     if x < -151.0 then exp2f(x) underflows
 *
 */

/* Byte offsets into the constant table __svml_sexp2_data_internal.
 *
 * Each entry of the table is one 32-byte row (eight 32-bit words, one
 * per vector lane), so consecutive offsets are 32 bytes apart.  The
 * code addresses a row as  <offset>+__svml_sexp2_data_internal(%rip).
 */
#define _sShifter			0	/* rounding shifter added to x  */
#define _sPC0				32	/* polynomial coefficient c0    */
#define _sPC1				64	/* polynomial coefficient c1    */
#define _sPC2				96	/* polynomial coefficient c2    */
#define _sPC3				128	/* polynomial coefficient c3    */
#define _sPC4				160	/* polynomial coefficient c4    */
#define _sPC5				192	/* polynomial coefficient c5    */
#define _sPC6				224	/* polynomial coefficient c6    */
#define _iAbsMask			256	/* mask that clears the sign bit */
#define _iDomainRange			288	/* |x| threshold for slow path  */

#include <sysdep.h>

59	.section .text.avx2, "ax", @progbits
60	ENTRY(_ZGVdN8v_exp2f_avx2)
61	pushq %rbp
62	cfi_def_cfa_offset(`16`)
63	movq %rsp, %rbp
64	cfi_def_cfa(`6`, `16`)
65	cfi_offset(`6`, -`16`)
66	andq $-`32`, %rsp
67	subq $`96`, %rsp
68	vmovups __svml_sexp2_data_internal(%rip), %ymm1
69
70	/ Check for overflow\underflow /
71	vmovups _sPC6+__svml_sexp2_data_internal(%rip), %ymm7
72
73	/ Implementation /
74	vaddps %ymm1, %ymm0, %ymm6
75	vsubps %ymm1, %ymm6, %ymm4
76
77	/ 2^N /
78	vpslld $`23`, %ymm6, %ymm8
79
80	/ R /
81	vsubps %ymm4, %ymm0, %ymm5
82
83	/ Polynomial /
84	vfmadd213ps _sPC5+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
85	vfmadd213ps _sPC4+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
86	vfmadd213ps _sPC3+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
87	vfmadd213ps _sPC2+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
88	vfmadd213ps _sPC1+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
89	vfmadd213ps _sPC0+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
90
91	/ Check for overflow\underflow /
92	vandps _iAbsMask+__svml_sexp2_data_internal(%rip), %ymm0, %ymm2
93	vpcmpgtd _iDomainRange+__svml_sexp2_data_internal(%rip), %ymm2, %ymm3
94	vmovmskps %ymm3, %edx
95
96	/ Reconstruction /
97	vpaddd %ymm8, %ymm7, %ymm1
98	testl %edx, %edx
99
100	/ Go to special inputs processing branch /
101	jne L(SPECIAL_VALUES_BRANCH)
102	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
103
104	/ Restore registers*
105	* and exit the function
106	*/
107
108	L(EXIT):
109	vmovaps %ymm1, %ymm0
110	movq %rbp, %rsp
111	popq %rbp
112	cfi_def_cfa(`7`, `8`)
113	cfi_restore(`6`)
114	ret
115	cfi_def_cfa(`6`, `16`)
116	cfi_offset(`6`, -`16`)
117
118	/ Branch to process*
119	* special inputs
120	*/
121
122	L(SPECIAL_VALUES_BRANCH):
123	vmovups %ymm0, `32`(%rsp)
124	vmovups %ymm1, `64`(%rsp)
125	# LOE rbx r12 r13 r14 r15 edx ymm1
126
127	xorl %eax, %eax
128	# LOE rbx r12 r13 r14 r15 eax edx
129
130	vzeroupper
131	movq %r12, `16`(%rsp)
132	/ DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) /
133	.cfi_escape `0x10`, `0x0c`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0xb0`, `0xff`, `0xff`, `0xff`, `0x22`
134	movl %eax, %r12d
135	movq %r13, `8`(%rsp)
136	/ DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) /
137	.cfi_escape `0x10`, `0x0d`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0xa8`, `0xff`, `0xff`, `0xff`, `0x22`
138	movl %edx, %r13d
139	movq %r14, (%rsp)
140	/ DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) /
141	.cfi_escape `0x10`, `0x0e`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0xa0`, `0xff`, `0xff`, `0xff`, `0x22`
142	# LOE rbx r15 r12d r13d
143
144	/ Range mask*
145	* bits check
146	*/
147
148	L(RANGEMASK_CHECK):
149	btl %r12d, %r13d
150
151	/ Call scalar math function /
152	jc L(SCALAR_MATH_CALL)
153	# LOE rbx r15 r12d r13d
154
155	/ Special inputs*
156	* processing loop
157	*/
158
159	L(SPECIAL_VALUES_LOOP):
160	incl %r12d
161	cmpl $`8`, %r12d
162
163	/ Check bits in range mask /
164	jl L(RANGEMASK_CHECK)
165	# LOE rbx r15 r12d r13d
166
167	movq `16`(%rsp), %r12
168	cfi_restore(`12`)
169	movq `8`(%rsp), %r13
170	cfi_restore(`13`)
171	movq (%rsp), %r14
172	cfi_restore(`14`)
173	vmovups `64`(%rsp), %ymm1
174
175	/ Go to exit /
176	jmp L(EXIT)
177	/ DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) /
178	.cfi_escape `0x10`, `0x0c`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0xb0`, `0xff`, `0xff`, `0xff`, `0x22`
179	/ DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) /
180	.cfi_escape `0x10`, `0x0d`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0xa8`, `0xff`, `0xff`, `0xff`, `0x22`
181	/ DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) /
182	.cfi_escape `0x10`, `0x0e`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0xa0`, `0xff`, `0xff`, `0xff`, `0x22`
183	# LOE rbx r12 r13 r14 r15 ymm1
184
185	/ Scalar math function call*
186	* to process special input
187	*/
188
189	L(SCALAR_MATH_CALL):
190	movl %r12d, %r14d
191	vmovss `32`(%rsp, %r14, `4`), %xmm0
192	call exp2f@PLT
193	# LOE rbx r14 r15 r12d r13d xmm0
194
195	vmovss %xmm0, `64`(%rsp, %r14, `4`)
196
197	/ Process special inputs in loop /
198	jmp L(SPECIAL_VALUES_LOOP)
199	# LOE rbx r15 r12d r13d
200	END(_ZGVdN8v_exp2f_avx2)
201
202	.section .rodata, "a"
203	.align `32`
204
205	#ifdef __svml_sexp2_data_internal_typedef
206	typedef unsigned int VUINT32;
207	typedef struct {
208	__declspec(align(`32`)) VUINT32 _sShifter[`8`][`1`];
209	__declspec(align(`32`)) VUINT32 _sPC0[`8`][`1`];
210	__declspec(align(`32`)) VUINT32 _sPC1[`8`][`1`];
211	__declspec(align(`32`)) VUINT32 _sPC2[`8`][`1`];
212	__declspec(align(`32`)) VUINT32 _sPC3[`8`][`1`];
213	__declspec(align(`32`)) VUINT32 _sPC4[`8`][`1`];
214	__declspec(align(`32`)) VUINT32 _sPC5[`8`][`1`];
215	__declspec(align(`32`)) VUINT32 _sPC6[`8`][`1`];
216	__declspec(align(`32`)) VUINT32 _iAbsMask[`8`][`1`];
217	__declspec(align(`32`)) VUINT32 _iDomainRange[`8`][`1`];
218	} __svml_sexp2_data_internal;
219	#endif
220	__svml_sexp2_data_internal:
221	.long `0x4b400000`, `0x4b400000`, `0x4b400000`, `0x4b400000`, `0x4b400000`, `0x4b400000`, `0x4b400000`, `0x4b400000` / _sShifter /
222	.align `32`
223	.long `0x3F800000`, `0x3F800000`, `0x3F800000`, `0x3F800000`, `0x3F800000`, `0x3F800000`, `0x3F800000`, `0x3F800000` / _sPC0 /
224	.align `32`
225	.long `0x3f317218`, `0x3f317218`, `0x3f317218`, `0x3f317218`, `0x3f317218`, `0x3f317218`, `0x3f317218`, `0x3f317218` / _sPC1 /
226	.align `32`
227	.long `0x3e75fdef`, `0x3e75fdef`, `0x3e75fdef`, `0x3e75fdef`, `0x3e75fdef`, `0x3e75fdef`, `0x3e75fdef`, `0x3e75fdef` / _sPC2 /
228	.align `32`
229	.long `0x3d6357cf`, `0x3d6357cf`, `0x3d6357cf`, `0x3d6357cf`, `0x3d6357cf`, `0x3d6357cf`, `0x3d6357cf`, `0x3d6357cf` / _sPC3 /
230	.align `32`
231	.long `0x3c1d962c`, `0x3c1d962c`, `0x3c1d962c`, `0x3c1d962c`, `0x3c1d962c`, `0x3c1d962c`, `0x3c1d962c`, `0x3c1d962c` / _sPC4 /
232	.align `32`
233	.long `0x3aaf7a51`, `0x3aaf7a51`, `0x3aaf7a51`, `0x3aaf7a51`, `0x3aaf7a51`, `0x3aaf7a51`, `0x3aaf7a51`, `0x3aaf7a51` / _sPC5 /
234	.align `32`
235	.long `0x39213c8c`, `0x39213c8c`, `0x39213c8c`, `0x39213c8c`, `0x39213c8c`, `0x39213c8c`, `0x39213c8c`, `0x39213c8c` / _sPC6 /
236	//common
237	.align `32`
238	.long `0x7fffffff`, `0x7fffffff`, `0x7fffffff`, `0x7fffffff`, `0x7fffffff`, `0x7fffffff`, `0x7fffffff`, `0x7fffffff` / _iAbsMask /
239	.align `32`
240	.long `0x42fc0000`, `0x42fc0000`, `0x42fc0000`, `0x42fc0000`, `0x42fc0000`, `0x42fc0000`, `0x42fc0000`, `0x42fc0000` / _iDomainRange=126.0 /
241	.align `32`
242	.type __svml_sexp2_data_internal, @object
243	.size __svml_sexp2_data_internal, .-__svml_sexp2_data_internal
244

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S