svml_s_sinhf4_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf4_core_sse4.S]

/* Function sinhf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute sinh(x) as (exp(x)-exp(-x))/2,
 *   where exp is calculated as
 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *   Special cases:
 *
 *   sinh(NaN) = quiet NaN, and raise invalid exception
 *   sinh(INF) = that INF
 *   sinh(x)   = x for subnormals
 *   sinh(x) overflows for big x and returns MAXLOG+log(2)
 *
 */
34
/* Offsets for data table __svml_ssinh_data_internal.

   Each field of the table is one 16-byte row (the same 32-bit constant
   replicated in four lanes), so the offsets advance in steps of 16.
   They are added to the table symbol to form RIP-relative operands,
   e.g. _sSign+__svml_ssinh_data_internal(%rip), and therefore must
   stay in sync with the row order of the table in .rodata.

   _sPC1 is not referenced by the code in this file; its row still
   occupies offset 96 and must stay in place so that the following
   offsets remain valid.  */
#define _sInvLn2	0	/* 1/ln(2) */
#define _sLn2hi		16	/* high part of ln(2) */
#define _sLn2lo		32	/* low part of ln(2) */
#define _sSign		48	/* sign-bit mask 0x80000000 */
#define _sShifter	64	/* right shifter used to round x/ln(2) */
#define _iDomainRange	80	/* fast-path bound on the bits of |x| */
#define _sPC1		96	/* polynomial coefficient a1 (= 1) */
#define _sPC2		112	/* polynomial coefficient a2 */
#define _sPC3		128	/* polynomial coefficient a3 */
#define _sPC4		144	/* polynomial coefficient a4 */
#define _sPC5		160	/* polynomial coefficient a5 */
#define _sPC6		176	/* polynomial coefficient a6 */
#define _iHalf		192	/* bit pattern of 0.5f, used as an integer */
50
51	#include <sysdep.h>
52
/* _ZGVbN4v_sinhf_sse4 -- sinhf on four packed single-precision lanes.

   Syntax: GNU as, AT&T operand order (source, destination).

   In:     %xmm0 = four float arguments x[0..3].
   Out:    %xmm0 = result for each lane.
   Clobb:  %eax, %edx, %xmm1-%xmm14, flags.  %r12, %r13 and %r14 are
	   used only on the special-values path and are saved and
	   restored there.
   Stack:  72 bytes are reserved on entry (CFA offset 80).  Assuming
	   the usual 16-byte alignment at the caller's call site, %rsp
	   is 16-byte aligned at the call to sinhf.  The frame is only
	   accessed on the special-values path:
	      0(%rsp)  saved %r14
	      8(%rsp)  saved %r13
	     16(%rsp)  saved %r12
	     32(%rsp)  copy of the input vector
	     48(%rsp)  result vector, patched one lane at a time

   Fast path.  With |x| = N*ln2 + r:
     sG1 = 2^(N-1) + 2^(-N-1)
     sG2 = 2^(N-1) - 2^(-N-1)
     sinh(r)   ~= r + r*r^2*(a3 + r^2*a5)
     sinh(|x|) ~= sG2 + sG1*sinh(r) + sG2*r^2*(a2 + r^2*(a4 + r^2*a6))
   and the sign of x is OR-ed back into the result.

   Special values.  The bits of |x| are compared, as signed 32-bit
   integers, against _iDomainRange.  Because the sign bit has been
   cleared, this also catches Inf and NaN.  Every lane that exceeds
   the bound is recomputed with a call to the scalar sinhf and
   overwrites the corresponding lane of the fast-path result.

   The comments between instructions describe the value being built;
   the instructions are interleaved, so a comment does not always
   refer only to the instruction directly below it.  */

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_sinhf_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)

	/*
	 * Implementation
	 * Abs argument
	 * xmm14 = sign bits of x, xmm10 = |x| (sign bits cleared by the
	 * pxor below)
	 */
	movups	_sSign+__svml_ssinh_data_internal(%rip), %xmm14
	andps	%xmm0, %xmm14
	movaps	%xmm14, %xmm10

	/*
	 * Load argument
	 * dM = x/log(2) + RShifter
	 */
	movups	_sInvLn2+__svml_ssinh_data_internal(%rip), %xmm7
	pxor	%xmm0, %xmm10
	mulps	%xmm10, %xmm7

	/*
	 * Check for overflow\underflow
	 * MORE faster than GE?
	 * xmm1 keeps a copy of |x| for the integer range compare below
	 */
	movaps	%xmm10, %xmm1
	movups	_sShifter+__svml_ssinh_data_internal(%rip), %xmm2

	/* sR = sX - sN*Log2_hi */
	movups	_sLn2hi+__svml_ssinh_data_internal(%rip), %xmm3
	addps	%xmm2, %xmm7

	/*
	 * R
	 * sN = sM - RShifter
	 */
	movaps	%xmm7, %xmm4

	/*
	 * G1, G2 2^N, 2^(-N)
	 * iM now is an EXP(2^N)
	 * (the low bits of sM are shifted into the exponent field)
	 */
	pslld	$23, %xmm7

	/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
	movups	_sLn2lo+__svml_ssinh_data_internal(%rip), %xmm5
	subps	%xmm2, %xmm4
	mulps	%xmm4, %xmm3
	mulps	%xmm4, %xmm5
	subps	%xmm3, %xmm10

	/*
	 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
	 * sSinh_r = (a3+r^2*a5)
	 */
	movups	_sPC5+__svml_ssinh_data_internal(%rip), %xmm8
	subps	%xmm5, %xmm10

	/* sR2 = sR^2 */
	movaps	%xmm10, %xmm12
	mulps	%xmm10, %xmm12

	/*
	 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
	 * sOut = (a4 +a6*sR2)
	 */
	movups	_sPC6+__svml_ssinh_data_internal(%rip), %xmm9
	mulps	%xmm12, %xmm8
	mulps	%xmm12, %xmm9
	addps	_sPC3+__svml_ssinh_data_internal(%rip), %xmm8
	addps	_sPC4+__svml_ssinh_data_internal(%rip), %xmm9

	/* sSinh_r = r^2*(a3+r^2*a5) */
	mulps	%xmm12, %xmm8

	/* sOut = a2+sR2*(a4+a6*sR2) */
	mulps	%xmm12, %xmm9

	/* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
	mulps	%xmm10, %xmm8
	addps	_sPC2+__svml_ssinh_data_internal(%rip), %xmm9
	addps	%xmm8, %xmm10

	/* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
	mulps	%xmm9, %xmm12
	/* Integer add/subtract of iM on the bits of 0.5f:
	   xmm13 = 2^(N-1), xmm6 = 2^(-N-1).  */
	movdqu	_iHalf+__svml_ssinh_data_internal(%rip), %xmm6
	movdqa	%xmm6, %xmm13
	psubd	%xmm7, %xmm6
	paddd	%xmm7, %xmm13

	/* sG1 = 2^(N-1)+2^(-N-1) */
	movdqa	%xmm13, %xmm11

	/* sG2 = 2^(N-1)-2^(-N-1) */
	subps	%xmm6, %xmm13
	addps	%xmm6, %xmm11

	/* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
	mulps	%xmm13, %xmm12

	/* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
	mulps	%xmm10, %xmm11
	/* xmm1 = per-lane mask of (bits of |x|) > _iDomainRange, signed
	   dword compare; edx receives one bit per lane.  */
	pcmpgtd	_iDomainRange+__svml_ssinh_data_internal(%rip), %xmm1
	addps	%xmm11, %xmm12
	movmskps %xmm1, %edx

	/* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
	addps	%xmm12, %xmm13

	/* Ret H: OR the saved sign bits into the result */
	orps	%xmm13, %xmm14
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm14

	/*  Restore registers
	 *  and exit the function
	 */

L(EXIT):
	movaps	%xmm14, %xmm0
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* Unwind state for the code after the ret, where the frame is
	   still allocated.  */
	cfi_def_cfa_offset(80)

	/*  Branch to process
	 *  special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the input vector and the fast-path result so single lanes
	   can be read and overwritten through memory.  */
	movups	%xmm0, 32(%rsp)
	movups	%xmm14, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* r12d = lane index (starts at 0), r13d = special-lane bit mask.
	   r12-r14 are saved first because they have to survive the calls
	   to sinhf.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/*  Range mask
	 *  bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the mask */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/*  Special inputs
	 *  processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	/* Reload the result vector with the patched lanes */
	movups	48(%rsp), %xmm14

	/* Go to exit */
	jmp	L(EXIT)
	/* Unwind state for L(SCALAR_MATH_CALL), which runs with r12-r14
	   still saved in the frame.  */
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm14

	/*  Scalar math function call
	 *  to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends the lane index into r14 for use in
	   the addressing mode.  */
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0
	call	sinhf@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	/* Overwrite the lane in the spilled result vector */
	movss	%xmm0, 48(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_sinhf_sse4)
255
/* Constant table for the vector sinhf kernel.

   Every row holds one 32-bit constant replicated in four lanes and
   starts on a 16-byte boundary.  The row order must match the
   _sInvLn2 ... _iHalf offset macros at the top of this file, because
   the code addresses the rows as
   <offset>+__svml_ssinh_data_internal(%rip).

   The typedef inside the #ifdef describes the same layout as a C
   structure; it is only seen by the preprocessor when
   __svml_ssinh_data_internal_typedef is defined.  */
	.section .rodata, "a"
	.align	16

#ifdef __svml_ssinh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _sInvLn2[4][1];
	__declspec(align(16)) VUINT32 _sLn2hi[4][1];
	__declspec(align(16)) VUINT32 _sLn2lo[4][1];
	__declspec(align(16)) VUINT32 _sSign[4][1];
	__declspec(align(16)) VUINT32 _sShifter[4][1];
	__declspec(align(16)) VUINT32 _iDomainRange[4][1];
	__declspec(align(16)) VUINT32 _sPC1[4][1];
	__declspec(align(16)) VUINT32 _sPC2[4][1];
	__declspec(align(16)) VUINT32 _sPC3[4][1];
	__declspec(align(16)) VUINT32 _sPC4[4][1];
	__declspec(align(16)) VUINT32 _sPC5[4][1];
	__declspec(align(16)) VUINT32 _sPC6[4][1];
	__declspec(align(16)) VUINT32 _iHalf[4][1];
} __svml_ssinh_data_internal;
#endif
__svml_ssinh_data_internal:
	.long	0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
	.align	16
	.long	0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
	.align	16
	.long	0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */
	.align	16
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
	.align	16
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	.align	16
	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
	.align	16
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
	.align	16
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
	.align	16
	.long	0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
	.align	16
	.long	0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
	.align	16
	.long	0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
	.align	16
	.long	0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
	// Integer constants
	.align	16
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
	.align	16
	.type	__svml_ssinh_data_internal, @object
	.size	__svml_ssinh_data_internal, .-__svml_ssinh_data_internal
307

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf4_core_sse4.S