svml_s_exp2f4_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f4_core_sse4.S]

/* Function exp2f vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *   exp2(x) = 2^n * T[j] * (1 + P(y))
 *   where
 *        x = m*(1/K) + y,    y in [-1/K..1/K]
 *        m = n*K + j,           m, n, j - signed integer, j in [-K/2..K/2]
 *
 *        values of 2^(j/K) are tabulated
 *
 *        P(y) is a minimax polynomial approximation of exp2(y)-1
 *        on small interval [-1/K..1/K]
 *
 *   Note: the code in this file performs no table lookup; it rounds x
 *   to an integer n and evaluates a degree-6 polynomial in y = x - n,
 *   which corresponds to the K = 1 case of the scheme above.
 *
 *  Special cases:
 *
 *   exp2(NaN)  = NaN
 *   exp2(+INF) = +INF
 *   exp2(-INF) = 0
 *   exp2(x)    = 1 for subnormals
 *   For IEEE float
 *     if x >= 128.0 then exp2f(x) overflow
 *     if x < -151.0 then exp2f(x) underflow
 *
 */
43
/* Offsets for data table __svml_sexp2_data_internal.
   Each entry is the byte offset of one 16-byte row (four identical
   32-bit lanes) from the start of the table; the values must stay in
   step with the order of the rows in the table definition.  */
#define _sShifter	0	/* Rounding shifter constant.  */
#define _sPC0	16	/* Polynomial coefficient of R^0.  */
#define _sPC1	32	/* Polynomial coefficient of R^1.  */
#define _sPC2	48	/* Polynomial coefficient of R^2.  */
#define _sPC3	64	/* Polynomial coefficient of R^3.  */
#define _sPC4	80	/* Polynomial coefficient of R^4.  */
#define _sPC5	96	/* Polynomial coefficient of R^5.  */
#define _sPC6	112	/* Polynomial coefficient of R^6.  */
#define _iAbsMask	128	/* Mask that clears the sign bit.  */
#define _iDomainRange	144	/* |x| threshold for the fast path.  */
56
57	#include <sysdep.h>
58
/* _ZGVbN4v_exp2f_sse4: exp2f on four packed single-precision lanes.

   In:   %xmm0 = four float inputs x.
   Out:  %xmm0 = four float results.

   Fast path (all lanes have |x| <= _iDomainRange, compared as
   integer bit patterns):
     N      = x rounded to an integer by adding and then subtracting
	      _sShifter
     R      = x - N
     P      = _sPC0 + R*(_sPC1 + R*(_sPC2 + R*(_sPC3 + R*(_sPC4
	      + R*(_sPC5 + R*_sPC6)))))
     result = bit pattern of P with N added to the exponent field
	      (integer add of the shifted sum, see pslld/paddd below).

   Slow path: every lane whose bit is set in the range mask is
   recomputed by calling the scalar exp2f; the other lanes keep the
   vector result.

   Stack frame (72 bytes; with the return address this makes the CFA
   %rsp + 80 and keeps %rsp 16-byte aligned at the scalar call):
      0(%rsp)  saved %r14
      8(%rsp)  saved %r13
     16(%rsp)  saved %r12
     32(%rsp)  copy of the four inputs
     48(%rsp)  four results, patched lane by lane in the slow path

   The "# LOE" lines are annotations kept from the original code
   (they appear to list the registers that are live at that point).  */
	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_exp2f_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)

	/* xmm1 = _sShifter (first row of the data table).  */
	movups	__svml_sexp2_data_internal(%rip), %xmm1

	/* Implementation: xmm5 will become x + shifter.  */
	movaps	%xmm1, %xmm5

	/* Polynomial: start Horner evaluation from the highest
	   coefficient, xmm4 = _sPC6.  */
	movups	_sPC6+__svml_sexp2_data_internal(%rip), %xmm4
	addps	%xmm0, %xmm5
	movaps	%xmm5, %xmm3

	/* 2^N: move the low bits of (x + shifter) up into the float
	   exponent field.  */
	pslld	$23, %xmm5

	/* Check for overflow/underflow: xmm2 = |x| as integer bits;
	   meanwhile xmm3 = (x + shifter) - shifter = N.  */
	movdqu	_iAbsMask+__svml_sexp2_data_internal(%rip), %xmm2
	subps	%xmm1, %xmm3

	/* R = x - N, kept in xmm1.  */
	movaps	%xmm0, %xmm1
	pand	%xmm0, %xmm2

	/* Lane mask: all-ones where |x| bits > _iDomainRange bits
	   (signed dword compare; the sign bit is already cleared).  */
	pcmpgtd	_iDomainRange+__svml_sexp2_data_internal(%rip), %xmm2
	subps	%xmm3, %xmm1

	/* edx = one bit per lane that needs the scalar fallback.  */
	movmskps %xmm2, %edx

	/* Horner steps: xmm4 = xmm4 * R + next lower coefficient.  */
	mulps	%xmm1, %xmm4
	addps	_sPC5+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC4+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC3+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC2+__svml_sexp2_data_internal(%rip), %xmm4
	mulps	%xmm1, %xmm4
	addps	_sPC1+__svml_sexp2_data_internal(%rip), %xmm4

	/* Last step writes into xmm1: xmm1 = R * xmm4 + _sPC0.  */
	mulps	%xmm4, %xmm1
	addps	_sPC0+__svml_sexp2_data_internal(%rip), %xmm1

	/* Reconstruction: integer add of the shifted exponent bits to
	   the polynomial result.  */
	paddd	%xmm5, %xmm1
	testl	%edx, %edx

	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movaps	%xmm1, %xmm0
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* Code after the ret still runs with the 72-byte frame.  */
	cfi_def_cfa_offset(80)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill inputs and the vector result so single lanes can be
	   read and patched through memory.  */
	movups	%xmm0, 32(%rsp)
	movups	%xmm1, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* Save the callee-saved registers used by the loop:
	   r12d = lane index (starts at 0), r13d = range mask,
	   r14 is used later as the zero-extended lane index.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = mask bit of the current lane.  */
	btl	%r12d, %r13d

	/* Call scalar math function.  */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask while lanes remain.  */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	/* All lanes done: restore saved registers and reload the
	   patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm1

	/* Go to exit.  */
	jmp	L(EXIT)
	/* The code below is reached with r12-r14 still saved on the
	   stack, so re-state their CFI locations.  */
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm1

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* r14 = lane index, zero-extended for use in addressing and
	   preserved across the call.  */
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0
	call	exp2f@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	/* Overwrite this lane of the saved result.  */
	movss	%xmm0, 48(%rsp, %r14, 4)

	/* Process special inputs in loop.  */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_exp2f_sse4)
194
/* Constant table for _ZGVbN4v_exp2f_sse4.  Every row is one 32-bit
   value replicated into four lanes and aligned to 16 bytes; the row
   order must match the _sShifter ... _iDomainRange offsets defined at
   the top of this file.  */
	.section .rodata, "a"
	.align	16

/* Layout description of the table; only seen by the preprocessor when
   __svml_sexp2_data_internal_typedef is defined.  */
#ifdef __svml_sexp2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _sShifter[4][1];
	__declspec(align(16)) VUINT32 _sPC0[4][1];
	__declspec(align(16)) VUINT32 _sPC1[4][1];
	__declspec(align(16)) VUINT32 _sPC2[4][1];
	__declspec(align(16)) VUINT32 _sPC3[4][1];
	__declspec(align(16)) VUINT32 _sPC4[4][1];
	__declspec(align(16)) VUINT32 _sPC5[4][1];
	__declspec(align(16)) VUINT32 _sPC6[4][1];
	__declspec(align(16)) VUINT32 _iAbsMask[4][1];
	__declspec(align(16)) VUINT32 _iDomainRange[4][1];
} __svml_sexp2_data_internal;
#endif
__svml_sexp2_data_internal:
	/* _sShifter: 0x4b400000 is 1.5 * 2^23 as a float.  */
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	.align	16
	/* _sPC0 .. _sPC6: polynomial coefficients, lowest order first
	   (_sPC0 is 1.0, _sPC1 is approximately ln 2).  */
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC0 */
	.align	16
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* _sPC1 */
	.align	16
	.long	0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef /* _sPC2 */
	.align	16
	.long	0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf /* _sPC3 */
	.align	16
	.long	0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c /* _sPC4 */
	.align	16
	.long	0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51 /* _sPC5 */
	.align	16
	.long	0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c /* _sPC6 */
	/* common */
	.align	16
	/* Clears the sign bit of each lane.  */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
	.align	16
	/* Lanes with |x| bits above this go to the scalar fallback.  */
	.long	0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* _iDomainRange=126.0 */
	.align	16
	.type	__svml_sexp2_data_internal, @object
	.size	__svml_sexp2_data_internal, .-__svml_sexp2_data_internal
237

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f4_core_sse4.S