/* Function atanhf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
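
/* A rough scalar sketch of the computation this file vectorizes
   (illustrative C only, not part of the build; the vector code below
   inlines the log1p evaluation instead of calling log1pf):

       float
       atanhf_sketch (float x)
       {
         float ax = fabsf (x);
         if (!(ax < 1.0f))       /* |x| >= 1, or x is NaN.  */
           return atanhf (x);    /* Scalar callout.  */
         if (ax < 0x1p-103f)     /* TinyRange.  */
           return copysignf (ax + ax * ax, x);  /* ~= x, keeps -0.  */
         return copysignf (0.5f * log1pf ((2.0f * ax) / (1.0f - ax)), x);
       }
 */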

/* Offsets for data table __svml_satanh_data_internal.  Ordered
   by use in the function.  On cold-starts this might help the
   prefetcher.  Possibly a better idea is to interleave start/end so
   that the prefetcher is less likely to detect a stream and pull
   irrelevant lines into cache.  */
#define sOne 0
#define SgnMask 16
#define sTopMask12 32
#define iBrkValue 48
#define iOffExpoMask 64
#define sPoly 80
#define sLn2 208
#define TinyRange 224

#include <sysdep.h>
#define ATANHF_DATA(x) ((x) + __svml_satanh_data_internal)

        .section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
        movaps %xmm0, %xmm5

        /* Load constants, including One = 1.  */
        movups ATANHF_DATA(sOne)(%rip), %xmm4
        movaps %xmm5, %xmm3

        /* Strip off the sign, so we treat X as positive until right at
           the end.  */
        movups ATANHF_DATA(SgnMask)(%rip), %xmm1
        movaps %xmm4, %xmm2
        andps %xmm1, %xmm0
        movaps %xmm4, %xmm10
        movups ATANHF_DATA(sTopMask12)(%rip), %xmm11
        movaps %xmm4, %xmm14
        movaps %xmm11, %xmm9

        /*
         * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
         * the upper part UHi being <= 12 bits long.  Then we have
         * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
         */
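        /* A minimal sketch of the two-piece computation (illustrative C,
           operating on ax = |x|; the sign was stripped above):

               float v   = ax + ax;              // V = 2 * X, exact
               float u   = 1.0f - ax;            // U, rounded
               float ulo = (1.0f - u) - ax;      // rounding error of U
               // After the 12-bit split below, ULo also absorbs (U - UHi).
         */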
        movaps %xmm0, %xmm6
        mulps %xmm5, %xmm3
        subps %xmm0, %xmm2
        addps %xmm0, %xmm6
        subps %xmm2, %xmm10
        addps %xmm5, %xmm3
        subps %xmm0, %xmm10
        andps %xmm2, %xmm9

        /*
         * Check whether |X| < 1, in which case we use the main function.
         * Otherwise set the rangemask so that the callout will get used.
         * Note that this will also use the callout for NaNs since not(NaN < 1).
         */
        rcpps %xmm9, %xmm7
        subps %xmm9, %xmm2
        andps %xmm11, %xmm7

        /*
         * Split V as well into upper 12 bits and lower part, so that we can get
         * a preliminary quotient estimate without rounding error.
         */
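        /* A minimal sketch of the 12-bit splits (illustrative C; asuint
           and asfloat stand for hypothetical bit-cast helpers, and
           1.0f / uhi stands for the rcpps estimate):

               float r   = asfloat (asuint (1.0f / uhi) & 0xFFFFF000);
               float vhi = asfloat (asuint (v) & 0xFFFFF000);
               float vlo = v - vhi;              // exact remainder
               // r, uhi and vhi each have <= 12 significant bits, so the
               // products r * uhi and r * vhi fit in 24 bits and round
               // exactly.
         */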
        andps %xmm6, %xmm11
        mulps %xmm7, %xmm9
        addps %xmm2, %xmm10
        subps %xmm11, %xmm6

        /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
        mulps %xmm7, %xmm11
        mulps %xmm7, %xmm10
        subps %xmm9, %xmm14
        mulps %xmm6, %xmm7
        subps %xmm10, %xmm14

        /* Compute D = E + E^2, where E = 1 - R * UHi - R * ULo is the
           relative error of the reciprocal estimate R.  */
        movaps %xmm14, %xmm13
        movaps %xmm4, %xmm8
        mulps %xmm14, %xmm13

        /* Reduction: compute r and n for the log1p evaluation.  */
        movdqu ATANHF_DATA(iBrkValue)(%rip), %xmm9
        addps %xmm13, %xmm14

        /*
         * Compute R * (VHi + VLo) * (1 + E + E^2)
         * = R * (VHi + VLo) * (1 + D)
         * = QHi + (QHi * D + QLo + QLo * D)
         */
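        /* A minimal sketch of the refined division (illustrative C):

               float e   = (1.0f - r * uhi) - r * ulo; // error of R
               float d   = e + e * e;                  // 1/(1 - E) ~= 1 + D
               float qhi = r * vhi;                    // exact product
               float qlo = r * vlo + (qhi * d + (r * vlo) * d);
               // qhi + qlo ~= V / (UHi + ULo) beyond float precision.
         */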
        movaps %xmm14, %xmm2
        mulps %xmm7, %xmm14
        mulps %xmm11, %xmm2
        addps %xmm14, %xmm7
        movdqu ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
        movaps %xmm4, %xmm14

        /* Accumulate the low-part corrections: QHi * D + QLo + QLo * D.  */
        addps %xmm7, %xmm2

        /*
         * Now finally accumulate the high and low parts of the
         * argument to log1p, H + L, with a final compensated summation.
         */
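        /* A minimal sketch of the compensated sum S = 1 + H (illustrative C):

               float big   = fmaxf (1.0f, h);
               float small = fminf (1.0f, h);
               float s     = big + small;          // rounded sum
               float err   = (big - s) + small;    // exact rounding error
               l += err;                           // fold into low part L
         */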
        movaps %xmm2, %xmm6
        /* Record the sign for eventual reincorporation.  */
        andnps %xmm5, %xmm1
        movaps %xmm4, %xmm7
        /* H = QHi + (QHi * D + QLo + QLo * D).  */
        addps %xmm11, %xmm6
        maxps %xmm6, %xmm7
        minps %xmm6, %xmm8
        subps %xmm6, %xmm11
        movaps %xmm7, %xmm10
        addps %xmm8, %xmm10
        addps %xmm11, %xmm2
        subps %xmm10, %xmm7
        psubd %xmm9, %xmm10
        addps %xmm8, %xmm7
        pand %xmm10, %xmm12
        psrad $23, %xmm10
        cvtdq2ps %xmm10, %xmm13
        addps %xmm7, %xmm2

        /* Final reconstruction: build 2^-n and the reduced mantissa.  */
        pslld $23, %xmm10
        paddd %xmm9, %xmm12
        psubd %xmm10, %xmm14
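        /* A minimal sketch of the log1p reduction and evaluation that
           follows (illustrative C; asuint/asfloat are hypothetical
           bit-cast helpers, 0x1.555556p-1f is iBrkValue = 2/3):

               uint32_t i = asuint (s) - asuint (0x1.555556p-1f);
               int32_t  n = (int32_t) i >> 23;         // exponent
               float    m = asfloat ((i & 0x007fffff)
                                     + asuint (0x1.555556p-1f)); // [2/3, 4/3)
               float scale = asfloat (asuint (1.0f) - ((uint32_t) n << 23));
               float r     = (m - 1.0f) + l * scale;   // scale == 2^-n
               // log (s) = n * ln2 + log1p (r), log1p (r) ~= r + r^2 * P (r),
               // with P evaluated by Horner's scheme below.
         */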

        /* Polynomial evaluation.  */
        subps %xmm4, %xmm12
        mulps %xmm14, %xmm2
        movups ATANHF_DATA(sPoly+0)(%rip), %xmm7
        addps %xmm12, %xmm2
        mulps %xmm2, %xmm7

        addps ATANHF_DATA(sPoly+16)(%rip), %xmm7
        mulps %xmm2, %xmm7
        addps ATANHF_DATA(sPoly+32)(%rip), %xmm7
        mulps %xmm2, %xmm7
        addps ATANHF_DATA(sPoly+48)(%rip), %xmm7
        mulps %xmm2, %xmm7
        addps ATANHF_DATA(sPoly+64)(%rip), %xmm7
        mulps %xmm2, %xmm7
        addps ATANHF_DATA(sPoly+80)(%rip), %xmm7
        mulps %xmm2, %xmm7
        addps ATANHF_DATA(sPoly+96)(%rip), %xmm7
        mulps %xmm2, %xmm7
        movaps ATANHF_DATA(sPoly+112)(%rip), %xmm6
        addps %xmm6, %xmm7
        mulps %xmm2, %xmm7
        mulps %xmm2, %xmm7
        mulps ATANHF_DATA(sLn2)(%rip), %xmm13

        /* Finally, halve the result and reincorporate the sign.  We can
           build sHalf = 0.5 with `sPoly[7] & sOne` (0xbf000000 & 0x3f800000
           == 0x3f000000), saving a separate constant load.  */
        andps %xmm4, %xmm6
        /* Or the sign bit in with the tiny-range result so that atanh(-0)
           correctly returns -0.  */
        orps %xmm1, %xmm3
        xorps %xmm6, %xmm1

        addps %xmm2, %xmm7
        addps %xmm13, %xmm7
        mulps %xmm7, %xmm1

        /* Finish check of NaNs.  */
        cmpleps %xmm0, %xmm4
        movmskps %xmm4, %edx
        cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0

        /* Blend: take the tiny-range result where |x| < TinyRange, else
           the log1p-based result.  */
        andps %xmm0, %xmm3
        andnps %xmm1, %xmm0
        orps %xmm3, %xmm0

        testl %edx, %edx
        /* Go to special inputs processing branch.  */
        jne L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 xmm0
        /* No registers to restore on fast path.  */
        ret

        /* Cold case.  edx has 1s where there was a special value that
           needs to be handled by an atanhf call.  Optimize for code size
           more so than speed here.  */
L(SPECIAL_VALUES_BRANCH):
        # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
        /* %rsp is 8 (mod 16) on entry (the call pushed a return address
           onto a 16-byte aligned stack).  Subtracting 56 makes it 16-byte
           aligned again, so each `call atanhf` below is made with a
           properly aligned stack.  */
        subq $56, %rsp
        cfi_def_cfa_offset(64)
        movups %xmm0, 24(%rsp)
        movups %xmm5, 40(%rsp)

        /* Use rbx/rbp for callee save registers as they get short
           encoding for many instructions (as compared with r12/r13).  */
        movq %rbx, (%rsp)
        cfi_offset(rbx, -64)
        movq %rbp, 8(%rsp)
        cfi_offset(rbp, -56)
        /* ebx is a bitmask of the lanes that need the scalar atanhf
           call.  */
        movl %edx, %ebx
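        /* A minimal sketch of the loop below (illustrative C; in/out
           stand for the vectors spilled to 40(%rsp) and 24(%rsp)):

               while (mask != 0)
                 {
                   int i = __builtin_ctz (mask);  // bsfl
                   out[i] = atanhf (in[i]);
                   mask &= mask - 1;              // clear lowest set bit
                 }
         */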
L(SPECIAL_VALUES_LOOP):
        # LOE rbx rbp r12 r13 r14 r15
        /* Use rbp as the index of the special value that is saved across
           calls to atanhf.  We technically don't need a callee save
           register here as the offset to rsp is always [0, 12] so we
           could restore rsp by realigning to 64.  Essentially the
           tradeoff is 1 extra save/restore vs 2 extra instructions in
           the loop.  */
        xorl %ebp, %ebp
        bsfl %ebx, %ebp

        /* Scalar math function call to process special input.  */
        movss 40(%rsp, %rbp, 4), %xmm0
        call atanhf@PLT
        /* No good way to avoid the store-forwarding stall this will cause on
           return.  `lfence` avoids the stall but at greater cost as it
           serializes the stack/callee-save restoration.  */
        movss %xmm0, 24(%rsp, %rbp, 4)

        leal -1(%rbx), %eax
        andl %eax, %ebx
        jnz L(SPECIAL_VALUES_LOOP)
        # LOE r12 r13 r14 r15
        /* All results have been written to 24(%rsp).  */
        movups 24(%rsp), %xmm0
        movq (%rsp), %rbx
        cfi_restore(rbx)
        movq 8(%rsp), %rbp
        cfi_restore(rbp)
        addq $56, %rsp
        cfi_def_cfa_offset(8)
        ret
END(_ZGVbN4v_atanhf_sse4)

        .section .rodata, "a"
        .align 16

#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(16)) VUINT32 sOne[4][1];
        __declspec(align(16)) VUINT32 SgnMask[4][1];
        __declspec(align(16)) VUINT32 sTopMask12[4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 sPoly[8][4][1];
        __declspec(align(16)) VUINT32 sLn2[4][1];
        __declspec(align(16)) VUINT32 TinyRange[4][1];
} __svml_satanh_data_internal;
#endif

__svml_satanh_data_internal:
        /* sOne = SP 1.0.  */
        .align 16
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /* SgnMask.  */
        .align 16
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /* sTopMask12.  */
        .align 16
        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
        /* iBrkValue = SP 2/3.  */
        .align 16
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /* iOffExpoMask = SP significand mask.  */
        .align 16
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff

        /* sPoly[] = SP polynomial.  */
        .align 16
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7.  */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6.  */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5.  */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4.  */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3.  */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2.  */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1.  */
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0.  */
        /* sLn2 = SP ln(2).  */
        .align 16
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        /* TinyRange = 2^-103; below this atanh(x) rounds to x.  */
        .align 16
        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
        .align 16
        .type __svml_satanh_data_internal, @object
        .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
