/* Function atanhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
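
/* A scalar C model of the fast path below (illustrative only, not part
   of the build; special-case handling is omitted and log1pf stands in
   for the inlined log1p kernel):

        #include <math.h>

        static float
        atanhf_model (float x)
        {
          float ax = fabsf (x);
          // atanh(x) = 0.5 * log((1 + ax) / (1 - ax))
          //          = 0.5 * log1p(2 * ax / (1 - ax)),
          // with the sign of x restored at the end.
          float r = 0.5f * log1pf ((2.0f * ax) / (1.0f - ax));
          return copysignf (r, x);
        }  */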

/* Offsets for data table __svml_satanh_data_internal.  Ordered by use
   in the function.  On cold-starts this might help the prefetcher.
   Possibly a better idea is to interleave start/end so that the
   prefetcher is less likely to detect a stream and pull irrelevant
   lines into cache.  */
#define SgnMask 0
#define sOne 32
#define sTopMask12 64
#define TinyRange 96
#define iBrkValue 128
#define iOffExpoMask 160
#define sPoly 192
#define sLn2 448
#define sHalf 480

#include <sysdep.h>
#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)

        .section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanhf_avx2)
        /* Strip off the sign, so treat X as positive until right at the end.  */
        vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
        vandps %ymm2, %ymm0, %ymm3
        /* Load constants including One = 1.  */
        vmovups ATANHF_DATA(sOne)(%rip), %ymm5
        vsubps %ymm3, %ymm5, %ymm1
        vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4

        vrcpps %ymm1, %ymm7
        vsubps %ymm1, %ymm5, %ymm9
        vandps %ymm4, %ymm7, %ymm6
        vsubps %ymm3, %ymm9, %ymm7

        /* No need to split sU when FMA is available.  */
        vfnmadd213ps %ymm5, %ymm6, %ymm1
        vmovaps %ymm0, %ymm8
        vfmadd213ps %ymm0, %ymm0, %ymm0
        vfnmadd231ps %ymm6, %ymm7, %ymm1

        /*
         * Check whether |X| < 1, in which case we use the main function.
         * Otherwise set the rangemask so that the callout will get used.
         * Note that this will also use the callout for NaNs since not(NaN < 1).
         */
        vcmpnlt_uqps %ymm5, %ymm3, %ymm14
        vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15

        /*
         * Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
         * the upper part UHi being <= 12 bits long.  Then we have
         * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
         */
        vaddps %ymm3, %ymm3, %ymm3

        /*
         * Split V as well into upper 12 bits and lower part, so that we can get
         * a preliminary quotient estimate without rounding error.
         */
        vandps %ymm4, %ymm3, %ymm4
        vsubps %ymm4, %ymm3, %ymm7

        /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.  */
        vmulps %ymm4, %ymm6, %ymm4

        /* Compute D = E + E^2.  */
        vfmadd213ps %ymm1, %ymm1, %ymm1

        /* Record the sign for eventual reincorporation.  */
        vandnps %ymm8, %ymm2, %ymm3

        /* Or the sign bit in with the tiny result to handle atanh(-0) correctly.  */
        vorps %ymm3, %ymm0, %ymm13
        vmulps %ymm7, %ymm6, %ymm2

        /*
         * Compute R * (VHi + VLo) * (1 + E + E^2)
         *      = R * (VHi + VLo) * (1 + D)
         *      = QHi + (QHi * D + QLo + QLo * D)
         */
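
/* In scalar C terms the quotient refinement is roughly the following
   (illustrative only; rcp12 stands for the vrcpps estimate masked to
   its top 12 bits, and e was formed above as 1 - R * (UHi + ULo)):

        float r = rcp12 (u);                    // ~ 1/u, ~12 good bits
        float e = 1.0f - r * u;                 // reciprocal residual
        float d = e + e * e;                    // 1/(1 - e) ~= 1 + d
        float qhi = r * vhi, qlo = r * vlo;     // split quotient estimate
        float h = qhi + (qhi * d + (qlo + qlo * d));   // ~ v / u
 */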

        /*
         * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
         * vaddps %ymm1, %ymm6, %ymm1` pair can be replaced with
         * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
         */
        vmulps %ymm1, %ymm4, %ymm6
        vfmadd213ps %ymm2, %ymm2, %ymm1
        vaddps %ymm1, %ymm6, %ymm1

        /*
         * Now finally accumulate the high and low parts of the
         * argument to log1p, H + L, with a final compensated summation.
         */
        vaddps %ymm1, %ymm4, %ymm2

        /* reduction: compute r, n */
        vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9

        /*
         * Now we feed into the log1p code, using H in place of _VARG1 and
         * later incorporating L into the reduced argument.
         * Compute 1 + x as high, low parts.
         */
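
/* In scalar C terms the reduction below does roughly the following
   (illustrative only; as_int/as_float stand for bit casts and
   0x3f2aaaab is the single-precision encoding of ~2/3, iBrkValue):

        int32_t im = as_int (m) - 0x3f2aaaab;   // m = 1 + h
        int32_t n  = im >> 23;                  // exponent part (vpsrad)
        float   z  = as_float ((im & 0x007fffff) + 0x3f2aaaab);
        // Now m ~= 2^n * z with z in [2/3, 4/3), so
        // log(m) = n * ln(2) + log(z), and log(z) comes from a
        // polynomial in r = z - 1.
 */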
        vmaxps %ymm2, %ymm5, %ymm0
        vminps %ymm2, %ymm5, %ymm6

        /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
        vsubps %ymm2, %ymm4, %ymm2
        vaddps %ymm6, %ymm0, %ymm4
        vpsubd %ymm9, %ymm4, %ymm7
        vsubps %ymm4, %ymm0, %ymm4
        vaddps %ymm2, %ymm1, %ymm2
        vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1

        vandps %ymm1, %ymm7, %ymm0
        vaddps %ymm4, %ymm6, %ymm4
        vandnps %ymm7, %ymm1, %ymm6
        vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
        vpaddd %ymm9, %ymm0, %ymm0
        vaddps %ymm4, %ymm2, %ymm4
        vpsubd %ymm6, %ymm5, %ymm6

        /* polynomial evaluation */
        vsubps %ymm5, %ymm0, %ymm2
        vfmadd231ps %ymm4, %ymm6, %ymm2
        vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
        vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1

        vmulps %ymm1, %ymm2, %ymm1
        vfmadd213ps %ymm2, %ymm2, %ymm1
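
/* The eight sPoly coefficients P7..P0 (see the table at the bottom of
   this file) are folded in Horner form; in scalar C terms roughly
   (illustrative only; r is the reduced argument z - 1):

        float p = P7;
        p = p * r + P6;  p = p * r + P5;  p = p * r + P4;
        p = p * r + P3;  p = p * r + P2;  p = p * r + P1;
        p = p * r + P0;                 // P0 = -0.5
        float poly = p * r * r + r;     // log(1 + r) ~= r + r^2 * p
 */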

        /* final reconstruction */
        vpsrad $23, %ymm7, %ymm6
        vcvtdq2ps %ymm6, %ymm2
        vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2

        /* Finally, halve the result and reincorporate the sign.  */
        vxorps ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
        vmulps %ymm2, %ymm3, %ymm2
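
/* In scalar C terms the final reconstruction is roughly (illustrative
   only; n and poly are as in the sketches above):

        float res = (float) n * 0x1.62e43p-1f + poly;  // n*ln(2) + log(z)
        return copysignf (0.5f, x) * res;              // halve, restore sign
 */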
        vmovmskps %ymm14, %edx
        testl %edx, %edx

        vblendvps %ymm15, %ymm13, %ymm2, %ymm0
        /* Go to special inputs processing branch.  */
        jne L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rdx r12 r13 r14 r15 ymm0
        /* No registers to restore on fast path.  */
        ret


        /* Cold case.  edx has 1s where there was a special value that
           needs to be handled by an atanhf call.  Optimize for code
           size more so than speed here.  */
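
/* In C terms the special-values loop below is essentially (illustrative
   only; in/out refer to the two ymm saves spilled to the stack):

        while (mask)
          {
            int i = __builtin_ctz (mask);   // tzcnt
            out[i] = atanhf (in[i]);        // scalar callout
            mask &= mask - 1;               // blsr
          }
 */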
L(SPECIAL_VALUES_BRANCH):
        # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
        /* Use r13 to save/restore the stack.  This allows us to use rbp
           as a callee save register, saving code size.  */
        pushq %r13
        cfi_adjust_cfa_offset(8)
        cfi_offset(r13, -16)
        /* We need callee save registers to preserve state across the
           atanhf calls.  */
        pushq %rbx
        cfi_adjust_cfa_offset(8)
        cfi_offset(rbx, -24)
        pushq %rbp
        cfi_adjust_cfa_offset(8)
        cfi_offset(rbp, -32)
        movq %rsp, %r13
        cfi_def_cfa_register(r13)

        /* Align stack and make room for 2x ymm vectors.  */
        andq $-32, %rsp
        addq $-64, %rsp

        /* Save all already computed inputs.  */
        vmovups %ymm0, (%rsp)
        /* Save original input (ymm8 unchanged up to this point).  */
        vmovups %ymm8, 32(%rsp)

        vzeroupper

        /* edx has 1s where there was a special value that needs to be
           handled by an atanhf call.  */
        movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
        # LOE rbx rbp r12 r13 r14 r15
        /* Use rbp as the index of the special value; it is preserved
           across the call to atanhf.  We technically don't need a
           callee save register here, as the offset from rsp is always
           in [0, 28], so we could instead restore rsp by realigning to
           64.  Essentially the tradeoff is 1 extra save/restore vs 2
           extra instructions in the loop.  Realigning also costs more
           code size.  */
        xorl %ebp, %ebp
        tzcntl %ebx, %ebp

        /* Scalar math function call to process special input.  */
        vmovss 32(%rsp, %rbp, 4), %xmm0
        call atanhf@PLT

        /* No good way to avoid the store-forwarding fault this will
           cause on return.  `lfence` avoids the SF fault but at greater
           cost, as it serializes the stack/callee save restoration.  */
        vmovss %xmm0, (%rsp, %rbp, 4)

        blsrl %ebx, %ebx
        jnz L(SPECIAL_VALUES_LOOP)
        # LOE r12 r13 r14 r15


        /* All results have been written to (%rsp).  */
        vmovups (%rsp), %ymm0
        /* Restore rsp.  */
        movq %r13, %rsp
        cfi_def_cfa_register(rsp)
        /* Restore callee save registers.  */
        popq %rbp
        cfi_adjust_cfa_offset(-8)
        cfi_restore(rbp)
        popq %rbx
        cfi_adjust_cfa_offset(-8)
        cfi_restore(rbx)
        popq %r13
        cfi_adjust_cfa_offset(-8)
        cfi_restore(r13)
        ret
END(_ZGVdN8v_atanhf_avx2)

        .section .rodata, "a"
        .align 32
#ifdef __svml_satanh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct{
        __declspec(align(32)) VUINT32 SgnMask[8][1];
        __declspec(align(32)) VUINT32 sOne[8][1];
        __declspec(align(32)) VUINT32 sTopMask12[8][1];
        __declspec(align(32)) VUINT32 TinyRange[8][1];
        __declspec(align(32)) VUINT32 iBrkValue[8][1];
        __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
        __declspec(align(32)) VUINT32 sPoly[8][8][1];
        __declspec(align(32)) VUINT32 sLn2[8][1];
        __declspec(align(32)) VUINT32 sHalf[8][1];
} __svml_satanh_data_internal;
#endif
__svml_satanh_data_internal:
        /* SgnMask */
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /* sOne = SP 1.0 */
        .align 32
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /* sTopMask12 */
        .align 32
        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
        /* TinyRange */
        .align 32
        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
        /* iBrkValue = SP 2/3 */
        .align 32
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /* iOffExpoMask = SP significand mask */
        .align 32
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /* sPoly[] = SP polynomial */
        .align 32
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        /* sLn2 = SP ln(2) */
        .align 32
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        /* sHalf */
        .align 32
        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        .align 32
        .type __svml_satanh_data_internal, @object
        .size __svml_satanh_data_internal, .-__svml_satanh_data_internal