svml_s_sinhf8_core_avx2.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S]

/* Function sinhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute sinh(x) as (exp(x)-exp(-x))/2,
 *   where exp is calculated as
 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *   Special cases:
 *
 *   sinh(NaN) = quiet NaN, and raise invalid exception
 *   sinh(INF) = that INF
 *   sinh(x)   = x for subnormals
 *   sinh(x) overflows for big x and returns MAXLOG+log(2)
 *
 */
34
/*
 * Offsets for data table __svml_ssinh_data_internal.
 *
 * Each value is the byte offset of one field from the start of the
 * table; consecutive fields are 32 bytes apart, so every field can be
 * loaded as one 256-bit vector.
 */
#define _sInvLn2	0
#define _sLn2hi		32
#define _sLn2lo		64
#define _sSign		96
#define _sShifter	128
#define _iDomainRange	160
#define _sPC1		192
#define _sPC2		224
#define _sPC3		256
#define _sPC4		288
#define _sPC5		320
#define _sPC6		352
#define _iHalf		384
50
51	#include <sysdep.h>
52
/*
 * _ZGVdN8v_sinhf_avx2 -- sinhf on eight packed single-precision lanes.
 *
 * Syntax: GNU as, AT&T operand order, preprocessed with cpp.
 * In:     %ymm0 = eight float arguments
 * Out:    %ymm0 = eight float results
 *
 * Fast path: the whole vector is evaluated with the polynomial scheme
 * described in the comments below.  Lanes whose |x| bit pattern compares
 * greater (as a signed 32-bit integer) than _iDomainRange are flagged in
 * a mask; if any lane is flagged, those lanes are recomputed one at a
 * time by calling the scalar sinhf.
 *
 * Register roles on the fast path:
 *   %ymm2 = original argument (kept for the slow path)
 *   %ymm1 = sign bits of the argument
 *   %ymm0 = |x|, then the reduced argument sR, finally the result
 *   %edx  = per-lane "needs scalar fallback" mask
 *
 * Stack frame (after 32-byte alignment, 96 bytes):
 *    0(%rsp)  saved %r14
 *    8(%rsp)  saved %r13
 *   16(%rsp)  saved %r12
 *   32(%rsp)  copy of the input vector   (slow path only)
 *   64(%rsp)  copy of the result vector  (slow path only)
 */
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_sinhf_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* Align the frame to 32 bytes and reserve the 96-byte scratch area.  */
	andq	$-32, %rsp
	subq	$96, %rsp
	vmovups	_sInvLn2+__svml_ssinh_data_internal(%rip), %ymm7
	vmovups	_sShifter+__svml_ssinh_data_internal(%rip), %ymm4
	vmovups	_sLn2hi+__svml_ssinh_data_internal(%rip), %ymm5

	/*
	 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
	 * sOut = (a4 +a6*sR2)
	 */
	vmovups	_sPC6+__svml_ssinh_data_internal(%rip), %ymm14

	/*
	 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
	 * sSinh_r = (a3+r^2*a5)
	 */
	vmovups	_sPC5+__svml_ssinh_data_internal(%rip), %ymm12
	vmovups	_iHalf+__svml_ssinh_data_internal(%rip), %ymm8
	/* Keep the untouched argument in ymm2 for the special-values path.  */
	vmovaps	%ymm0, %ymm2

	/*
	 * Implementation
	 * Abs argument
	 * ymm1 = sign bits of x, ymm0 = |x|
	 */
	vandps	_sSign+__svml_ssinh_data_internal(%rip), %ymm2, %ymm1
	vxorps	%ymm2, %ymm1, %ymm0

	/*
	 * Load argument
	 * dM = x/log(2) + RShifter
	 */
	vfmadd213ps %ymm4, %ymm0, %ymm7

	/*
	 * R
	 * sN = sM - RShifter
	 */
	vsubps	%ymm4, %ymm7, %ymm6

	/*
	 * G1, G2 2^N, 2^(-N)
	 * iM now is an EXP(2^N)
	 */
	vpslld	$23, %ymm7, %ymm9

	/*
	 * Check for overflow\underflow
	 * MORE faster than GE?
	 * Signed integer compare of the |x| bit pattern against the limit.
	 */
	vpcmpgtd _iDomainRange+__svml_ssinh_data_internal(%rip), %ymm0, %ymm3

	/* sR = sX - sN*Log2_hi */
	vfnmadd231ps %ymm5, %ymm6, %ymm0
	/* ymm10 = iHalf + (N << 23), ymm11 = iHalf - (N << 23)  */
	vpaddd	%ymm9, %ymm8, %ymm10
	vpsubd	%ymm9, %ymm8, %ymm11

	/* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
	vfnmadd231ps _sLn2lo+__svml_ssinh_data_internal(%rip), %ymm6, %ymm0

	/* sR2 = sR^2 */
	vmulps	%ymm0, %ymm0, %ymm13
	/* ymm14 = a4 + a6*sR2, ymm12 = a3 + a5*sR2  */
	vfmadd213ps _sPC4+__svml_ssinh_data_internal(%rip), %ymm13, %ymm14
	vfmadd213ps _sPC3+__svml_ssinh_data_internal(%rip), %ymm13, %ymm12

	/* sOut = a2+sR2*(a4+a6*sR2) */
	vfmadd213ps _sPC2+__svml_ssinh_data_internal(%rip), %ymm13, %ymm14

	/* sSinh_r = r^2*(a3+r^2*a5) */
	vmulps	%ymm12, %ymm13, %ymm12

	/* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
	vmulps	%ymm14, %ymm13, %ymm15

	/* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
	vfmadd213ps %ymm0, %ymm0, %ymm12
	/* Collapse the out-of-range lane mask into one bit per lane.  */
	vmovmskps %ymm3, %edx

	/* sG1 = 2^(N-1)+2^(-N-1) */
	vaddps	%ymm11, %ymm10, %ymm3

	/* sG2 = 2^(N-1)-2^(-N-1) */
	vsubps	%ymm11, %ymm10, %ymm10

	/* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
	vmulps	%ymm15, %ymm10, %ymm0

	/* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
	vfmadd213ps %ymm0, %ymm12, %ymm3

	/* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
	vaddps	%ymm3, %ymm10, %ymm4

	/* Ret H: put the saved sign back onto the magnitude.  */
	vorps	%ymm4, %ymm1, %ymm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm2

	/*
	 * Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Unwind state for the code below, which still runs on the rbp frame.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/*
	 * Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the input and the fast-path result so single lanes can be
	   read and patched through memory.  */
	vmovups	%ymm2, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear the upper ymm halves before scalar code is called.  */
	vzeroupper
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* r12d = lane index, starting at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* r13d = mask of lanes that need the scalar routine.  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/*
	 * Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/*
	 * Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All eight lanes visited: restore the callee-saved registers and
	   reload the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/*
	 * Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* r14 = zero-extended lane index; it is callee-saved, so it is still
	   valid for the store after the call.  */
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0
	call	sinhf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Overwrite this lane of the spilled result vector.  */
	vmovss	%xmm0, 64(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_sinhf_avx2)
256
/*
 * Constant table for the vector sinhf kernel.
 *
 * Every field is one 32-bit word repeated eight times (32 bytes) and is
 * aligned to 32 bytes, so the byte offsets run 0, 32, ..., 384 in the
 * order listed below.  The typedef inside the #ifdef only describes
 * that layout; it is not assembled unless
 * __svml_ssinh_data_internal_typedef is defined.
 */
	.section .rodata, "a"
	.align	32

#ifdef __svml_ssinh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 _sInvLn2[8][1];
	__declspec(align(32)) VUINT32 _sLn2hi[8][1];
	__declspec(align(32)) VUINT32 _sLn2lo[8][1];
	__declspec(align(32)) VUINT32 _sSign[8][1];
	__declspec(align(32)) VUINT32 _sShifter[8][1];
	__declspec(align(32)) VUINT32 _iDomainRange[8][1];
	__declspec(align(32)) VUINT32 _sPC1[8][1];
	__declspec(align(32)) VUINT32 _sPC2[8][1];
	__declspec(align(32)) VUINT32 _sPC3[8][1];
	__declspec(align(32)) VUINT32 _sPC4[8][1];
	__declspec(align(32)) VUINT32 _sPC5[8][1];
	__declspec(align(32)) VUINT32 _sPC6[8][1];
	__declspec(align(32)) VUINT32 _iHalf[8][1];
} __svml_ssinh_data_internal;
#endif
__svml_ssinh_data_internal:
	.long	0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
	.align	32
	.long	0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
	.align	32
	.long	0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */
	.align	32
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
	.align	32
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	.align	32
	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
	.align	32
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
	.align	32
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
	.align	32
	.long	0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
	.align	32
	.long	0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
	.align	32
	.long	0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
	.align	32
	.long	0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
	// Integer constants
	.align	32
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
	.align	32
	.type	__svml_ssinh_data_internal, @object
	.size	__svml_ssinh_data_internal, .-__svml_ssinh_data_internal
308

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S