/* Function atanhf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *	Compute atanh(x) as 0.5 * log((1 + x) / (1 - x)).
 *
 *	Special cases:
 *
 *	atanh(0)  = 0
 *	atanh(+1) = +INF
 *	atanh(-1) = -INF
 *	atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
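
/* A rough scalar sketch of the computation below (illustrative only;
   the helper name is ours, and the vector code splits 1 - x into
   high/low pieces and uses its own log1p core for accuracy):

       static float
       atanhf_sketch (float x)
       {
	 float v = 2.0f * fabsf (x);	   // V = 2 * |x|
	 float u = 1.0f - fabsf (x);	   // U = 1 - |x|
	 float r = 0.5f * log1pf (v / u);  // atanh(|x|) = 0.5 * log1p(V/U)
	 return copysignf (r, x);	   // atanh is odd: restore the sign
       }
*/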

/* Offsets for data table __svml_satanh_data_internal.  Ordered by use
   in the function.  On cold starts this might help the prefetcher.
   Possibly a better idea is to interleave start/end so that the
   prefetcher is less likely to detect a stream and pull irrelevant
   lines into cache.  */
#define sOne		0
#define SgnMask		16
#define sTopMask12	32
#define iBrkValue	48
#define iOffExpoMask	64
#define sPoly		80
#define sLn2		208
#define TinyRange	224

#include <sysdep.h>
#define ATANHF_DATA(x)	((x) + __svml_satanh_data_internal)

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
	movaps	%xmm0, %xmm5

	/* Load constants, including sOne = 1.0f.  */
	movups	ATANHF_DATA(sOne)(%rip), %xmm4
	movaps	%xmm5, %xmm3

	/* Strip off the sign so that X is treated as positive until the
	   very end.  */
	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
	movaps	%xmm4, %xmm2
	andps	%xmm1, %xmm0
	movaps	%xmm4, %xmm10
	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
	movaps	%xmm4, %xmm14
	movaps	%xmm11, %xmm9

	/*
	 * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
	 * the upper part UHi being <= 12 bits long.  Then we have:
	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
	 */
	movaps	%xmm0, %xmm6
	mulps	%xmm5, %xmm3
	subps	%xmm0, %xmm2
	addps	%xmm0, %xmm6
	subps	%xmm2, %xmm10
	addps	%xmm5, %xmm3
	subps	%xmm0, %xmm10
	andps	%xmm2, %xmm9
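
	/* Illustrative per-lane register map at this point (our notation):
	   xmm2 = U = 1 - |x|, xmm6 = V = 2 * |x|,
	   xmm9 = UHi = U truncated to 12 significand bits,
	   xmm10 = (1 - U) - |x|, the rounding error of computing U,
	   xmm3 = x * x + x, the result used on the tiny-|x| path.  */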

	/*
	 * Compute a reciprocal approximation R ~= 1/UHi.  rcpps is good to
	 * ~12 bits, and masking R with sTopMask12 truncates it to 12
	 * significand bits, so the product R * UHi below is exact.
	 * The check for special inputs (|X| >= 1, which covers +/-1 and
	 * beyond) is deferred to the cmpleps at the end of the fast path
	 * so that it overlaps with the main computation; NaNs fail that
	 * compare and simply propagate through the fast path to a NaN
	 * result.
	 */
	rcpps	%xmm9, %xmm7
	subps	%xmm9, %xmm2
	andps	%xmm11, %xmm7

	/*
	 * Split V as well into upper 12 bits and lower part, so that we can
	 * get a preliminary quotient estimate without rounding error.
	 */
	andps	%xmm6, %xmm11
	mulps	%xmm7, %xmm9
	addps	%xmm2, %xmm10
	subps	%xmm11, %xmm6

	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
	mulps	%xmm7, %xmm11
	mulps	%xmm7, %xmm10
	subps	%xmm9, %xmm14
	mulps	%xmm6, %xmm7
	subps	%xmm10, %xmm14
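
	/* Scalar sketch of the estimate so far (illustrative names):
	       E   = (1 - R * UHi) - R * ULo;  -- so R * U = 1 - E
	       QHi = R * VHi;  QLo = R * VLo;  -- QHi exact by the masking
	   and therefore V / U = (QHi + QLo) / (1 - E).  */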

	/* Compute D = E + E^2, i.e. 1/(1 - E) - 1 to second order.  */
	movaps	%xmm14, %xmm13
	movaps	%xmm4, %xmm8
	mulps	%xmm14, %xmm13

	/* Reduction for the log1p core: compute r and n (constants loaded
	   here, reduction done below).  */
	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
	addps	%xmm13, %xmm14

	/*
	 * Compute R * (VHi + VLo) * (1 + E + E^2)
	 * = R * (VHi + VLo) * (1 + D)
	 * = QHi + (QHi * D + QLo + QLo * D)
	 */
	movaps	%xmm14, %xmm2
	mulps	%xmm7, %xmm14
	mulps	%xmm11, %xmm2
	addps	%xmm14, %xmm7
	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
	movaps	%xmm4, %xmm14

	/* Accumulate the correction terms: QHi * D + QLo + QLo * D.  */
	addps	%xmm7, %xmm2

	/*
	 * Now finally accumulate the high and low parts of the
	 * argument to log1p, H + L, with a final compensated summation.
	 */
	movaps	%xmm2, %xmm6
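	/* Record the sign of x (xmm1 = x & ~SgnMask) for eventual
	   reincorporation.  */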
	andnps	%xmm5, %xmm1
	movaps	%xmm4, %xmm7
	/* xmm6 = H = QHi + (QHi * D + QLo + QLo * D), the high part of the
	   argument to log1p.  */
	addps	%xmm11, %xmm6
	maxps	%xmm6, %xmm7
	minps	%xmm6, %xmm8
	subps	%xmm6, %xmm11
	movaps	%xmm7, %xmm10
	addps	%xmm8, %xmm10
	addps	%xmm11, %xmm2
	subps	%xmm10, %xmm7
	psubd	%xmm9, %xmm10
	addps	%xmm8, %xmm7
	pand	%xmm10, %xmm12
	psrad	$23, %xmm10
	cvtdq2ps %xmm10, %xmm13
	addps	%xmm7, %xmm2

	/* Final reconstruction: build 2^-n and the reduced mantissa.  */
	pslld	$23, %xmm10
	paddd	%xmm9, %xmm12
	psubd	%xmm10, %xmm14
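
	/* Scalar sketch of the reduction just performed (illustrative):
	       T = 1 + H;			-- compensated via max/min
	       m = as_int (T) - as_int (2.0f / 3.0f);
	       n = m >> 23;
	       r = as_float ((m & 0x007fffff) + as_int (2.0f / 3.0f)) - 1.0f;
	   so T = 2^n * (1 + r) with (1 + r) in [2/3, 4/3), and
	       log (T) = n * ln2 + log1p (r).  */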

	/* Polynomial evaluation (Horner) over the reduced argument r.  */
	subps	%xmm4, %xmm12
	mulps	%xmm14, %xmm2
	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
	addps	%xmm12, %xmm2
	mulps	%xmm2, %xmm7

	/* Continue the Horner chain; the halving and sign reincorporation
	   happen below via the +/-0.5 multiply.  */
	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
	addps	%xmm6, %xmm7
	mulps	%xmm2, %xmm7
	mulps	%xmm2, %xmm7
	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
	/* Build sHalf = 0.5 as sPoly[7] & sOne (0xbf000000 & 0x3f800000
	   == 0x3f000000), saving a constant load.  */
	andps	%xmm4, %xmm6
	/* Or the sign bit into the tiny-path result so that atanh(-0)
	   returns -0 correctly.  */
	orps	%xmm1, %xmm3
	/* xmm1 = +/-0.5 with the sign of x: one multiply both halves the
	   log and reincorporates the sign.  */
	xorps	%xmm6, %xmm1

	addps	%xmm2, %xmm7
	addps	%xmm13, %xmm7
	mulps	%xmm7, %xmm1
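
	/* Scalar sketch of the combine (illustrative; P0..P7 are sPoly in
	   reverse order, P0 = -0.5):
	       poly = ((((((P7 * r + P6) * r + P5) * r + P4) * r + P3)
		       * r + P2) * r + P1) * r + P0;
	       res  = copysignf (0.5f, x) * (poly * r * r + r + n * ln2);  */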

	/* Finish the deferred special-input check: lanes with |x| >= 1
	   (including +/-1) take the callout.  NaNs compare false here and
	   stay on the fast path, where they have already propagated to a
	   NaN result.  */
	cmpleps	%xmm0, %xmm4
	movmskps %xmm4, %edx
	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0

	andps	%xmm0, %xmm3
	andnps	%xmm1, %xmm0
	orps	%xmm3, %xmm0
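
	/* Per-lane blend sketch (illustrative):
	       res = (|x| < TinyRange) ? tiny : main
	   where tiny = (x * x + x) with x's sign bit or'd in and main is
	   the +/-0.5 * log result above.  */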

	testl	%edx, %edx
	/* Go to the special-inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 xmm0
	/* No registers to restore on the fast path.  */
	ret

	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by an atanhf call.  Optimize for code size
	   rather than speed here.  */
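	/* Illustrative C sketch of this callout path (names are ours, not
	   a glibc API):

	       for (unsigned int m = mask; m != 0; m &= m - 1)
		 {
		   int lane = __builtin_ctz (m);	// == bsf
		   res[lane] = atanhf (arg[lane]);
		 }
	*/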
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
	/* rsp is 8 (mod 16) on entry because of the return address, so
	   subtracting 56 makes it 16-byte aligned again, as the ABI
	   requires at the call below.  */
	subq	$56, %rsp
	cfi_def_cfa_offset(64)
	movups	%xmm0, 24(%rsp)
	movups	%xmm5, 40(%rsp)

	/* Use rbx/rbp as callee-save registers: many instructions get a
	   shorter encoding with them than with r12/r13.  */
	movq	%rbx, (%rsp)
	cfi_offset(rbx, -64)
	movq	%rbp, 8(%rsp)
	cfi_offset(rbp, -56)
	/* edx has 1s where there was a special value that needs to be
	   handled by an atanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special-value lane, preserved across
	   the call to atanhf.  Technically a callee-save register is not
	   needed, as the offset from rsp is always in [0, 12], so rsp
	   could be restored by realigning to 64.  Essentially the tradeoff
	   is one extra save/restore against two extra instructions in the
	   loop.  */
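	/* bsf finds the index of the lowest set bit in ebx, i.e. the next
	   lane that needs the scalar call; the xor breaks bsf's false
	   output dependency on ebp.  */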
	xorl	%ebp, %ebp
	bsfl	%ebx, %ebp

	/* Scalar math function call to process the special input.  */
	movss	40(%rsp, %rbp, 4), %xmm0
	call	atanhf@PLT
	/* No good way to avoid the store-forwarding stall this will cause
	   on return.  `lfence` avoids the stall but at greater cost, as it
	   serializes the stack/callee-save restoration.  */
	movss	%xmm0, 24(%rsp, %rbp, 4)

	leal	-1(%rbx), %eax
	andl	%eax, %ebx
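	/* ebx &= ebx - 1 clears the lowest set bit, e.g. 0b0110 -> 0b0100,
	   so the loop runs once per special lane.  */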
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15
	/* All results have been written to 24(%rsp).  */
	movups	24(%rsp), %xmm0
	movq	(%rsp), %rbx
	cfi_restore(rbx)
	movq	8(%rsp), %rbp
	cfi_restore(rbp)
	addq	$56, %rsp
	cfi_def_cfa_offset(8)
	ret
END(_ZGVbN4v_atanhf_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 sOne[4][1];
	__declspec(align(16)) VUINT32 SgnMask[4][1];
	__declspec(align(16)) VUINT32 sTopMask12[4][1];
	__declspec(align(16)) VUINT32 iBrkValue[4][1];
	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
	__declspec(align(16)) VUINT32 sPoly[8][4][1];
	__declspec(align(16)) VUINT32 sLn2[4][1];
	__declspec(align(16)) VUINT32 TinyRange[4][1];
} __svml_satanh_data_internal;
#endif

__svml_satanh_data_internal:
	/* sOne = SP 1.0 */
	.align	16
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* SgnMask */
	.align	16
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sTopMask12 */
	.align	16
	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	/* iBrkValue = SP 2/3 */
	.align	16
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align	16
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff

	/* sPoly[] = SP polynomial */
	.align	16
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */

	/* sLn2 = SP ln(2) */
	.align	16
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	/* TinyRange = 2^-103 */
	.align	16
	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	.align	16
	.type	__svml_satanh_data_internal, @object
	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal