/* Function atanhf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *	Compute atanh(x) as 0.5 * log((1 + x) / (1 - x)).
 *
 *	Special cases:
 *
 *	atanh(0)  = 0
 *	atanh(+1) = +INF
 *	atanh(-1) = -INF
 *	atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
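
/* A rough scalar sketch of the computation below (illustrative only;
   the helper name is ours, and the vector code splits 1 - x into
   high/low pieces and uses its own log1p core for accuracy):

       static float
       atanhf_sketch (float x)
       {
	 float v = 2.0f * fabsf (x);	   // V = 2 * |x|
	 float u = 1.0f - fabsf (x);	   // U = 1 - |x|
	 float r = 0.5f * log1pf (v / u);  // atanh(|x|) = 0.5 * log1p(V/U)
	 return copysignf (r, x);	   // atanh is odd: restore the sign
       }
*/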

/* Offsets for data table __svml_satanh_data_internal.  Ordered by use
   in the function.  On cold starts this might help the prefetcher.
   Possibly a better idea is to interleave start/end so that the
   prefetcher is less likely to detect a stream and pull irrelevant
   lines into cache.  */
#define sOne		0
#define SgnMask		16
#define sTopMask12	32
#define iBrkValue	48
#define iOffExpoMask	64
#define sPoly		80
#define sLn2		208
#define TinyRange	224

#include <sysdep.h>
#define ATANHF_DATA(x)	((x) + __svml_satanh_data_internal)

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
	movaps	%xmm0, %xmm5

	/* Load constants, including sOne = 1.0f.  */
	movups	ATANHF_DATA(sOne)(%rip), %xmm4
	movaps	%xmm5, %xmm3

	/* Strip off the sign so that X is treated as positive until the
	   very end.  */
	movups	ATANHF_DATA(SgnMask)(%rip), %xmm1
	movaps	%xmm4, %xmm2
	andps	%xmm1, %xmm0
	movaps	%xmm4, %xmm10
	movups	ATANHF_DATA(sTopMask12)(%rip), %xmm11
	movaps	%xmm4, %xmm14
	movaps	%xmm11, %xmm9

	/*
	 * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
	 * the upper part UHi being <= 12 bits long.  Then we have:
	 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
	 */
	movaps	%xmm0, %xmm6
	mulps	%xmm5, %xmm3
	subps	%xmm0, %xmm2
	addps	%xmm0, %xmm6
	subps	%xmm2, %xmm10
	addps	%xmm5, %xmm3
	subps	%xmm0, %xmm10
	andps	%xmm2, %xmm9
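
	/* Illustrative per-lane register map at this point (our notation):
	   xmm2 = U = 1 - |x|, xmm6 = V = 2 * |x|,
	   xmm9 = UHi = U truncated to 12 significand bits,
	   xmm10 = (1 - U) - |x|, the rounding error of computing U,
	   xmm3 = x * x + x, the result used on the tiny-|x| path.  */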

	/*
	 * Compute a reciprocal approximation R ~= 1/UHi.  rcpps is good to
	 * ~12 bits, and masking R with sTopMask12 truncates it to 12
	 * significand bits, so the product R * UHi below is exact.
	 * The check for special inputs (|X| >= 1, which covers +/-1 and
	 * beyond) is deferred to the cmpleps at the end of the fast path
	 * so that it overlaps with the main computation; NaNs fail that
	 * compare and simply propagate through the fast path to a NaN
	 * result.
	 */
	rcpps	%xmm9, %xmm7
	subps	%xmm9, %xmm2
	andps	%xmm11, %xmm7

	/*
	 * Split V as well into upper 12 bits and lower part, so that we can
	 * get a preliminary quotient estimate without rounding error.
	 */
	andps	%xmm6, %xmm11
	mulps	%xmm7, %xmm9
	addps	%xmm2, %xmm10
	subps	%xmm11, %xmm6

	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
	mulps	%xmm7, %xmm11
	mulps	%xmm7, %xmm10
	subps	%xmm9, %xmm14
	mulps	%xmm6, %xmm7
	subps	%xmm10, %xmm14
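
	/* Scalar sketch of the estimate so far (illustrative names):
	       E   = (1 - R * UHi) - R * ULo;  -- so R * U = 1 - E
	       QHi = R * VHi;  QLo = R * VLo;  -- QHi exact by the masking
	   and therefore V / U = (QHi + QLo) / (1 - E).  */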

	/* Compute D = E + E^2, i.e. 1/(1 - E) - 1 to second order.  */
	movaps	%xmm14, %xmm13
	movaps	%xmm4, %xmm8
	mulps	%xmm14, %xmm13

	/* Reduction for the log1p core: compute r and n (constants loaded
	   here, reduction done below).  */
	movdqu	ATANHF_DATA(iBrkValue)(%rip), %xmm9
	addps	%xmm13, %xmm14

	/*
	 * Compute R * (VHi + VLo) * (1 + E + E^2)
	 * = R * (VHi + VLo) * (1 + D)
	 * = QHi + (QHi * D + QLo + QLo * D)
	 */
	movaps	%xmm14, %xmm2
	mulps	%xmm7, %xmm14
	mulps	%xmm11, %xmm2
	addps	%xmm14, %xmm7
	movdqu	ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
	movaps	%xmm4, %xmm14

	/* Accumulate the correction terms: QHi * D + QLo + QLo * D.  */
	addps	%xmm7, %xmm2

	/*
	 * Now finally accumulate the high and low parts of the
	 * argument to log1p, H + L, with a final compensated summation.
	 */
	movaps	%xmm2, %xmm6
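	/* Record the sign of x (xmm1 = x & ~SgnMask) for eventual
	   reincorporation.  */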
	andnps	%xmm5, %xmm1
	movaps	%xmm4, %xmm7
	/* xmm6 = H = QHi + (QHi * D + QLo + QLo * D), the high part of the
	   argument to log1p.  */
	addps	%xmm11, %xmm6
	maxps	%xmm6, %xmm7
	minps	%xmm6, %xmm8
	subps	%xmm6, %xmm11
	movaps	%xmm7, %xmm10
	addps	%xmm8, %xmm10
	addps	%xmm11, %xmm2
	subps	%xmm10, %xmm7
	psubd	%xmm9, %xmm10
	addps	%xmm8, %xmm7
	pand	%xmm10, %xmm12
	psrad	$23, %xmm10
	cvtdq2ps %xmm10, %xmm13
	addps	%xmm7, %xmm2

	/* Final reconstruction: build 2^-n and the reduced mantissa.  */
	pslld	$23, %xmm10
	paddd	%xmm9, %xmm12
	psubd	%xmm10, %xmm14
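
	/* Scalar sketch of the reduction just performed (illustrative):
	       T = 1 + H;			-- compensated via max/min
	       m = as_int (T) - as_int (2.0f / 3.0f);
	       n = m >> 23;
	       r = as_float ((m & 0x007fffff) + as_int (2.0f / 3.0f)) - 1.0f;
	   so T = 2^n * (1 + r) with (1 + r) in [2/3, 4/3), and
	       log (T) = n * ln2 + log1p (r).  */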

	/* Polynomial evaluation (Horner) over the reduced argument r.  */
	subps	%xmm4, %xmm12
	mulps	%xmm14, %xmm2
	movups	ATANHF_DATA(sPoly+0)(%rip), %xmm7
	addps	%xmm12, %xmm2
	mulps	%xmm2, %xmm7

	/* Continue the Horner chain; the halving and sign reincorporation
	   happen below via the +/-0.5 multiply.  */
	addps	ATANHF_DATA(sPoly+16)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+32)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+48)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+64)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+80)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	addps	ATANHF_DATA(sPoly+96)(%rip), %xmm7
	mulps	%xmm2, %xmm7
	movaps	ATANHF_DATA(sPoly+112)(%rip), %xmm6
	addps	%xmm6, %xmm7
	mulps	%xmm2, %xmm7
	mulps	%xmm2, %xmm7
	mulps	ATANHF_DATA(sLn2)(%rip), %xmm13
	/* Build sHalf = 0.5 as sPoly[7] & sOne (0xbf000000 & 0x3f800000
	   == 0x3f000000), saving a constant load.  */
	andps	%xmm4, %xmm6
	/* Or the sign bit into the tiny-path result so that atanh(-0)
	   returns -0 correctly.  */
	orps	%xmm1, %xmm3
	/* xmm1 = +/-0.5 with the sign of x: one multiply both halves the
	   log and reincorporates the sign.  */
	xorps	%xmm6, %xmm1

	addps	%xmm2, %xmm7
	addps	%xmm13, %xmm7
	mulps	%xmm7, %xmm1
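
	/* Scalar sketch of the combine (illustrative; P0..P7 are sPoly in
	   reverse order, P0 = -0.5):
	       poly = ((((((P7 * r + P6) * r + P5) * r + P4) * r + P3)
		       * r + P2) * r + P1) * r + P0;
	       res  = copysignf (0.5f, x) * (poly * r * r + r + n * ln2);  */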

	/* Finish the deferred special-input check: lanes with |x| >= 1
	   (including +/-1) take the callout.  NaNs compare false here and
	   stay on the fast path, where they have already propagated to a
	   NaN result.  */
	cmpleps	%xmm0, %xmm4
	movmskps %xmm4, %edx
	cmpltps	ATANHF_DATA(TinyRange)(%rip), %xmm0

	andps	%xmm0, %xmm3
	andnps	%xmm1, %xmm0
	orps	%xmm3, %xmm0
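
	/* Per-lane blend sketch (illustrative):
	       res = (|x| < TinyRange) ? tiny : main
	   where tiny = (x * x + x) with x's sign bit or'd in and main is
	   the +/-0.5 * log result above.  */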

	testl	%edx, %edx
	/* Go to the special-inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 xmm0
	/* No registers to restore on the fast path.  */
	ret

	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by an atanhf call.  Optimize for code size
	   rather than speed here.  */
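	/* Illustrative C sketch of this callout path (names are ours, not
	   a glibc API):

	       for (unsigned int m = mask; m != 0; m &= m - 1)
		 {
		   int lane = __builtin_ctz (m);	// == bsf
		   res[lane] = atanhf (arg[lane]);
		 }
	*/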
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
	/* rsp is 8 (mod 16) on entry because of the return address, so
	   subtracting 56 makes it 16-byte aligned again, as the ABI
	   requires at the call below.  */
	subq	$56, %rsp
	cfi_def_cfa_offset(64)
	movups	%xmm0, 24(%rsp)
	movups	%xmm5, 40(%rsp)

	/* Use rbx/rbp as callee-save registers: many instructions get a
	   shorter encoding with them than with r12/r13.  */
	movq	%rbx, (%rsp)
	cfi_offset(rbx, -64)
	movq	%rbp, 8(%rsp)
	cfi_offset(rbp, -56)
	/* edx has 1s where there was a special value that needs to be
	   handled by an atanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special-value lane, preserved across
	   the call to atanhf.  Technically a callee-save register is not
	   needed, as the offset from rsp is always in [0, 12], so rsp
	   could be restored by realigning to 64.  Essentially the tradeoff
	   is one extra save/restore against two extra instructions in the
	   loop.  */
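	/* bsf finds the index of the lowest set bit in ebx, i.e. the next
	   lane that needs the scalar call; the xor breaks bsf's false
	   output dependency on ebp.  */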
	xorl	%ebp, %ebp
	bsfl	%ebx, %ebp

	/* Scalar math function call to process the special input.  */
	movss	40(%rsp, %rbp, 4), %xmm0
	call	atanhf@PLT
	/* No good way to avoid the store-forwarding stall this will cause
	   on return.  `lfence` avoids the stall but at greater cost, as it
	   serializes the stack/callee-save restoration.  */
	movss	%xmm0, 24(%rsp, %rbp, 4)

	leal	-1(%rbx), %eax
	andl	%eax, %ebx
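	/* ebx &= ebx - 1 clears the lowest set bit, e.g. 0b0110 -> 0b0100,
	   so the loop runs once per special lane.  */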
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15
	/* All results have been written to 24(%rsp).  */
	movups	24(%rsp), %xmm0
	movq	(%rsp), %rbx
	cfi_restore(rbx)
	movq	8(%rsp), %rbp
	cfi_restore(rbp)
	addq	$56, %rsp
	cfi_def_cfa_offset(8)
	ret
END(_ZGVbN4v_atanhf_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 sOne[4][1];
	__declspec(align(16)) VUINT32 SgnMask[4][1];
	__declspec(align(16)) VUINT32 sTopMask12[4][1];
	__declspec(align(16)) VUINT32 iBrkValue[4][1];
	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
	__declspec(align(16)) VUINT32 sPoly[8][4][1];
	__declspec(align(16)) VUINT32 sLn2[4][1];
	__declspec(align(16)) VUINT32 TinyRange[4][1];
} __svml_satanh_data_internal;
#endif

__svml_satanh_data_internal:
	/* sOne = SP 1.0 */
	.align	16
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* SgnMask */
	.align	16
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* sTopMask12 */
	.align	16
	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
	/* iBrkValue = SP 2/3 */
	.align	16
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align	16
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff

	/* sPoly[] = SP polynomial */
	.align	16
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */

	/* sLn2 = SP ln(2) */
	.align	16
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	/* TinyRange = 2^-103 */
	.align	16
	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
	.align	16
	.type	__svml_satanh_data_internal, @object
	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal