/* Function atanhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
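
/* A scalar C model of the fast path below (illustrative only, not part
   of the build; special-case handling is omitted and log1pf stands in
   for the inlined log1p kernel):

        #include <math.h>

        static float
        atanhf_model (float x)
        {
          float ax = fabsf (x);
          // atanh(x) = 0.5 * log((1 + ax) / (1 - ax))
          //          = 0.5 * log1p(2 * ax / (1 - ax)),
          // with the sign of x restored at the end.
          float r = 0.5f * log1pf ((2.0f * ax) / (1.0f - ax));
          return copysignf (r, x);
        }  */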

/* Offsets for data table __svml_satanh_data_internal.  Ordered by use
   in the function.  On cold-starts this might help the prefetcher.
   Possibly a better idea is to interleave start/end so that the
   prefetcher is less likely to detect a stream and pull irrelevant
   lines into cache.  */
#define SgnMask 0
#define sOne 32
#define sTopMask12 64
#define TinyRange 96
#define iBrkValue 128
#define iOffExpoMask 160
#define sPoly 192
#define sLn2 448
#define sHalf 480

#include <sysdep.h>
#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)

        .section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanhf_avx2)
        /* Strip off the sign, so treat X as positive until right at the end.  */
        vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
        vandps %ymm2, %ymm0, %ymm3
        /* Load constants including One = 1.  */
        vmovups ATANHF_DATA(sOne)(%rip), %ymm5
        vsubps %ymm3, %ymm5, %ymm1
        vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4

        vrcpps %ymm1, %ymm7
        vsubps %ymm1, %ymm5, %ymm9
        vandps %ymm4, %ymm7, %ymm6
        vsubps %ymm3, %ymm9, %ymm7

        /* No need to split sU when FMA is available.  */
        vfnmadd213ps %ymm5, %ymm6, %ymm1
        vmovaps %ymm0, %ymm8
        vfmadd213ps %ymm0, %ymm0, %ymm0
        vfnmadd231ps %ymm6, %ymm7, %ymm1

        /*
         * Check whether |X| < 1, in which case we use the main function.
         * Otherwise set the rangemask so that the callout will get used.
         * Note that this will also use the callout for NaNs since not(NaN < 1).
         */
        vcmpnlt_uqps %ymm5, %ymm3, %ymm14
        vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15

        /*
         * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
         * the upper part UHi being <= 12 bits long.  Then we have
         * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
         */
        vaddps %ymm3, %ymm3, %ymm3

        /*
         * Split V as well into upper 12 bits and lower part, so that we can get
         * a preliminary quotient estimate without rounding error.
         */
        vandps %ymm4, %ymm3, %ymm4
        vsubps %ymm4, %ymm3, %ymm7

        /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
        vmulps %ymm4, %ymm6, %ymm4

        /* Compute D = E + E^2.  */
        vfmadd213ps %ymm1, %ymm1, %ymm1

        /* Record the sign for eventual reincorporation.  */
        vandnps %ymm8, %ymm2, %ymm3

        /* Or the sign bit in with the tiny result to handle atanh(-0) correctly.  */
        vorps %ymm3, %ymm0, %ymm13
        vmulps %ymm7, %ymm6, %ymm2

        /*
         * Compute R * (VHi + VLo) * (1 + E + E^2)
         *      = R * (VHi + VLo) * (1 + D)
         *      = QHi + (QHi * D + QLo + QLo * D)
         */
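
/* In scalar C terms the quotient refinement is roughly the following
   (illustrative only; rcp12 stands for the vrcpps estimate masked to
   its top 12 bits, and e was formed above as 1 - R * (UHi + ULo)):

        float r = rcp12 (u);                    // ~ 1/u, ~12 good bits
        float e = 1.0f - r * u;                 // reciprocal residual
        float d = e + e * e;                    // 1/(1 - e) ~= 1 + d
        float qhi = r * vhi, qlo = r * vlo;     // split quotient estimate
        float h = qhi + (qhi * d + (qlo + qlo * d));   // ~ v / u
 */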

        /*
         * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
         * vaddps %ymm1, %ymm6, %ymm1` pair can be replaced with
         * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
         */
        vmulps %ymm1, %ymm4, %ymm6
        vfmadd213ps %ymm2, %ymm2, %ymm1
        vaddps %ymm1, %ymm6, %ymm1

        /*
         * Now finally accumulate the high and low parts of the
         * argument to log1p, H + L, with a final compensated summation.
         */
        vaddps %ymm1, %ymm4, %ymm2

        /* reduction: compute r, n */
        vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9

        /*
         * Now we feed into the log1p code, using H in place of _VARG1 and
         * later incorporating L into the reduced argument.
         * Compute 1 + x as high, low parts.
         */
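
/* In scalar C terms the reduction below does roughly the following
   (illustrative only; as_int/as_float stand for bit casts and
   0x3f2aaaab is the single-precision encoding of ~2/3, iBrkValue):

        int32_t im = as_int (m) - 0x3f2aaaab;   // m = 1 + h
        int32_t n  = im >> 23;                  // exponent part (vpsrad)
        float   z  = as_float ((im & 0x007fffff) + 0x3f2aaaab);
        // Now m ~= 2^n * z with z in [2/3, 4/3), so
        // log(m) = n * ln(2) + log(z), and log(z) comes from a
        // polynomial in r = z - 1.
 */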
        vmaxps %ymm2, %ymm5, %ymm0
        vminps %ymm2, %ymm5, %ymm6

        /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
        vsubps %ymm2, %ymm4, %ymm2
        vaddps %ymm6, %ymm0, %ymm4
        vpsubd %ymm9, %ymm4, %ymm7
        vsubps %ymm4, %ymm0, %ymm4
        vaddps %ymm2, %ymm1, %ymm2
        vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1

        vandps %ymm1, %ymm7, %ymm0
        vaddps %ymm4, %ymm6, %ymm4
        vandnps %ymm7, %ymm1, %ymm6
        vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
        vpaddd %ymm9, %ymm0, %ymm0
        vaddps %ymm4, %ymm2, %ymm4
        vpsubd %ymm6, %ymm5, %ymm6

        /* polynomial evaluation */
        vsubps %ymm5, %ymm0, %ymm2
        vfmadd231ps %ymm4, %ymm6, %ymm2
        vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1

        vmulps %ymm1, %ymm2, %ymm1
        vfmadd213ps %ymm2, %ymm2, %ymm1
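
/* The eight sPoly coefficients P7..P0 (see the table at the bottom of
   this file) are folded in Horner form; in scalar C terms roughly
   (illustrative only; r is the reduced argument z - 1):

        float p = P7;
        p = p * r + P6;  p = p * r + P5;  p = p * r + P4;
        p = p * r + P3;  p = p * r + P2;  p = p * r + P1;
        p = p * r + P0;                 // P0 = -0.5
        float poly = p * r * r + r;     // log(1 + r) ~= r + r^2 * p
 */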

        /* final reconstruction */
        vpsrad $23, %ymm7, %ymm6
        vcvtdq2ps %ymm6, %ymm2
        vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2

        /* Finally, halve the result and reincorporate the sign.  */
        vxorps ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
        vmulps %ymm2, %ymm3, %ymm2
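
/* In scalar C terms the final reconstruction is roughly (illustrative
   only; n and poly are as in the sketches above):

        float res = (float) n * 0x1.62e43p-1f + poly;  // n*ln(2) + log(z)
        return copysignf (0.5f, x) * res;              // halve, restore sign
 */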
        vmovmskps %ymm14, %edx
        testl %edx, %edx

        vblendvps %ymm15, %ymm13, %ymm2, %ymm0
        /* Go to special inputs processing branch.  */
        jne L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rdx r12 r13 r14 r15 ymm0
        /* No registers to restore on fast path.  */
        ret


        /* Cold case.  edx has 1s where there was a special value that
           needs to be handled by an atanhf call.  Optimize for code
           size more so than speed here.  */
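
/* In C terms the special-values loop below is essentially (illustrative
   only; in/out refer to the two ymm saves spilled to the stack):

        while (mask)
          {
            int i = __builtin_ctz (mask);   // tzcnt
            out[i] = atanhf (in[i]);        // scalar callout
            mask &= mask - 1;               // blsr
          }
 */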
L(SPECIAL_VALUES_BRANCH):
        # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
        /* Use r13 to save/restore the stack.  This allows us to use rbp
           as a callee save register, saving code size.  */
        pushq %r13
        cfi_adjust_cfa_offset(8)
        cfi_offset(r13, -16)
        /* We need callee save registers to preserve state across the
           atanhf calls.  */
        pushq %rbx
        cfi_adjust_cfa_offset(8)
        cfi_offset(rbx, -24)
        pushq %rbp
        cfi_adjust_cfa_offset(8)
        cfi_offset(rbp, -32)
        movq %rsp, %r13
        cfi_def_cfa_register(r13)

        /* Align stack and make room for 2x ymm vectors.  */
        andq $-32, %rsp
        addq $-64, %rsp

        /* Save all already computed inputs.  */
        vmovups %ymm0, (%rsp)
        /* Save original input (ymm8 unchanged up to this point).  */
        vmovups %ymm8, 32(%rsp)

        vzeroupper

        /* edx has 1s where there was a special value that needs to be
           handled by an atanhf call.  */
        movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
        # LOE rbx rbp r12 r13 r14 r15
        /* Use rbp as the index of the special value; it is preserved
           across the call to atanhf.  We technically don't need a
           callee save register here, as the offset from rsp is always
           in [0, 28], so we could instead restore rsp by realigning to
           64.  Essentially the tradeoff is 1 extra save/restore vs 2
           extra instructions in the loop.  Realigning also costs more
           code size.  */
        xorl %ebp, %ebp
        tzcntl %ebx, %ebp

        /* Scalar math function call to process special input.  */
        vmovss 32(%rsp, %rbp, 4), %xmm0
        call atanhf@PLT

        /* No good way to avoid the store-forwarding fault this will
           cause on return.  `lfence` avoids the SF fault but at greater
           cost, as it serializes the stack/callee save restoration.  */
        vmovss %xmm0, (%rsp, %rbp, 4)

        blsrl %ebx, %ebx
        jnz L(SPECIAL_VALUES_LOOP)
        # LOE r12 r13 r14 r15


        /* All results have been written to (%rsp).  */
        vmovups (%rsp), %ymm0
        /* Restore rsp.  */
        movq %r13, %rsp
        cfi_def_cfa_register(rsp)
        /* Restore callee save registers.  */
        popq %rbp
        cfi_adjust_cfa_offset(-8)
        cfi_restore(rbp)
        popq %rbx
        cfi_adjust_cfa_offset(-8)
        cfi_restore(rbx)
        popq %r13
        cfi_adjust_cfa_offset(-8)
        cfi_restore(r13)
        ret
END(_ZGVdN8v_atanhf_avx2)

        .section .rodata, "a"
        .align 32
#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct{
        __declspec(align(32)) VUINT32 SgnMask[8][1];
        __declspec(align(32)) VUINT32 sOne[8][1];
        __declspec(align(32)) VUINT32 sTopMask12[8][1];
        __declspec(align(32)) VUINT32 TinyRange[8][1];
        __declspec(align(32)) VUINT32 iBrkValue[8][1];
        __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
        __declspec(align(32)) VUINT32 sPoly[8][8][1];
        __declspec(align(32)) VUINT32 sLn2[8][1];
        __declspec(align(32)) VUINT32 sHalf[8][1];
} __svml_satanh_data_internal;
#endif
__svml_satanh_data_internal:
        /* SgnMask */
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /* sOne = SP 1.0 */
        .align 32
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /* sTopMask12 */
        .align 32
        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
        /* TinyRange */
        .align 32
        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
        /* iBrkValue = SP 2/3 */
        .align 32
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /* iOffExpoMask = SP significand mask */
        .align 32
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /* sPoly[] = SP polynomial */
        .align 32
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        /* sLn2 = SP ln(2) */
        .align 32
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        /* sHalf */
        .align 32
        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        .align 32
        .type __svml_satanh_data_internal, @object
        .size __svml_satanh_data_internal, .-__svml_satanh_data_internal