/* Function atanh vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)),
 *   using a small lookup table that maps to AVX-512 permute instructions.
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
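
/* For reference, a scalar C sketch of the computation performed below
   (hypothetical helper name; the vector code runs the same steps on 8
   lanes at once):

	#include <math.h>

	static double atanh_sketch (double x)
	{
	  double y = fabs (x);
	  // atanh(x) = 0.5 * (log(1+y) - log(1-y)), with the sign of x
	  double r = 0.5 * (log1p (y) - log1p (-y));
	  return copysign (r, x);
	}

   The kernel computes the two logarithms jointly: 1+y and 1-y are each
   scaled by a reciprocal rounded to a 1+4-bit mantissa, the 16-entry
   tables supply the log of that scale factor, and a degree-8 polynomial
   covers the reduced arguments Rp and Rm.  */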

/* Offsets for data table __svml_datanh_data_internal_avx512
 */
#define Log_tbl_H			0
#define Log_tbl_L			128
#define One				256
#define AbsMask				320
#define AddB5				384
#define RcpBitMask			448
#define poly_coeff8			512
#define poly_coeff7			576
#define poly_coeff6			640
#define poly_coeff5			704
#define poly_coeff4			768
#define poly_coeff3			832
#define poly_coeff2			896
#define poly_coeff1			960
#define poly_coeff0			1024
#define Half				1088
#define L2H				1152
#define L2L				1216
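
/* Each field of the table is one 64-byte zmm vector wide; the two
   16-entry log tables occupy 128 bytes each, so the offsets advance in
   steps of 64 (128 for the tables).  */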

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_atanh_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
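	/* The frame is realigned to 64 bytes; the 192-byte area reserved
	 * here holds three saved GPRs at 0..16(%rsp), the input vector at
	 * 64(%rsp) and the result vector at 128(%rsp) on the special-case
	 * path.  */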
	vmovups	One+__svml_datanh_data_internal_avx512(%rip), %zmm15

	/* round reciprocals to 1+4b mantissas */
	vmovups	AddB5+__svml_datanh_data_internal_avx512(%rip), %zmm6
	vmovups	RcpBitMask+__svml_datanh_data_internal_avx512(%rip), %zmm9
	vmovaps	%zmm0, %zmm2
	vandpd	AbsMask+__svml_datanh_data_internal_avx512(%rip), %zmm2, %zmm13

	/* 1+y */
	vaddpd	{rn-sae}, %zmm15, %zmm13, %zmm0

	/* 1-y */
	vsubpd	{rn-sae}, %zmm13, %zmm15, %zmm4
	vxorpd	%zmm13, %zmm2, %zmm1

	/* Yp_high */
	vsubpd	{rn-sae}, %zmm15, %zmm0, %zmm7

	/* -Ym_high */
	vsubpd	{rn-sae}, %zmm15, %zmm4, %zmm12

	/* RcpP ~ 1/Yp */
	vrcp14pd	%zmm0, %zmm3

	/* RcpM ~ 1/Ym */
	vrcp14pd	%zmm4, %zmm5

	/* input outside (-1, 1) ? */
	vcmppd	$21, {sae}, %zmm15, %zmm13, %k0
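	/* Predicate 21 is _CMP_NLT_UQ: a mask bit is set where |x| >= 1
	 * or x is NaN, i.e. every lane that needs the scalar fallback.  */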
	vpaddq	%zmm6, %zmm3, %zmm11
	vpaddq	%zmm6, %zmm5, %zmm10

	/* Yp_low */
	vsubpd	{rn-sae}, %zmm7, %zmm13, %zmm8
	vandpd	%zmm9, %zmm11, %zmm14
	vandpd	%zmm9, %zmm10, %zmm3
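	/* AddB5 (bit 47) rounds each reciprocal at mantissa bit 4, and
	 * RcpBitMask (0xffff000000000000) keeps sign, exponent and the top
	 * four mantissa bits: the "1+4b" form whose mantissa bits later
	 * index the 16-entry log tables.  */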

	/* Ym_low */
	vaddpd	{rn-sae}, %zmm12, %zmm13, %zmm12

	/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
	vfmsub213pd	{rn-sae}, %zmm15, %zmm14, %zmm0

	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
	vfmsub231pd	{rn-sae}, %zmm3, %zmm4, %zmm15

	/* exponents */
	vgetexppd	{sae}, %zmm14, %zmm5
	vgetexppd	{sae}, %zmm3, %zmm4

	/* Table lookups */
	vmovups	__svml_datanh_data_internal_avx512(%rip), %zmm9
	vmovups	Log_tbl_H+64+__svml_datanh_data_internal_avx512(%rip), %zmm13
	vmovups	Log_tbl_L+__svml_datanh_data_internal_avx512(%rip), %zmm7
	vfmadd231pd	{rn-sae}, %zmm14, %zmm8, %zmm0
	vfnmadd231pd	{rn-sae}, %zmm3, %zmm12, %zmm15
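	/* The fused multiply-subtracts form RcpP*Yp - 1 and RcpM*Ym - 1
	 * without an intermediate rounding; the follow-up FMAs fold in the
	 * RcpP*Yp_low and RcpM*Ym_low corrections from splitting 1+y and
	 * 1-y into high and low parts.  */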

	/* Prepare table index */
	vpsrlq	$48, %zmm14, %zmm11
	vpsrlq	$48, %zmm3, %zmm8
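	/* After the shift by 48 the four kept mantissa bits sit in the low
	 * bits of each qword; the vpermi2pd/vpermt2pd below use them as a
	 * 4-bit index across the two 8-entry halves of each log table.  */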
	vmovups	Log_tbl_L+64+__svml_datanh_data_internal_avx512(%rip), %zmm14

	/* polynomials */
	vmovups	poly_coeff8+__svml_datanh_data_internal_avx512(%rip), %zmm3
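	/* PolyP and PolyM evaluate log(1+R)/R = 1 - R/2 + R^2/3 - ... by
	 * Horner's scheme over the nine coefficients poly_coeff8..0,
	 * interleaving the two chains (zmm6 for Rp, zmm4 for Rm).  */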

	/* Km-Kp */
	vsubpd	{rn-sae}, %zmm5, %zmm4, %zmm5
	vmovups	poly_coeff7+__svml_datanh_data_internal_avx512(%rip), %zmm4
	kmovw	%k0, %edx
	vmovaps	%zmm11, %zmm10
	vmovaps	%zmm4, %zmm6
	vpermi2pd	%zmm13, %zmm9, %zmm10
	vpermi2pd	%zmm14, %zmm7, %zmm11
	vpermt2pd	%zmm13, %zmm8, %zmm9
	vpermt2pd	%zmm14, %zmm8, %zmm7
	vmovups	poly_coeff6+__svml_datanh_data_internal_avx512(%rip), %zmm8
	vfmadd231pd	{rn-sae}, %zmm0, %zmm3, %zmm6
	vfmadd231pd	{rn-sae}, %zmm15, %zmm3, %zmm4
	vmovups	poly_coeff3+__svml_datanh_data_internal_avx512(%rip), %zmm13
	vmovups	poly_coeff2+__svml_datanh_data_internal_avx512(%rip), %zmm14
	vfmadd213pd	{rn-sae}, %zmm8, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm8, %zmm15, %zmm4
	vmovups	poly_coeff0+__svml_datanh_data_internal_avx512(%rip), %zmm8
	vsubpd	{rn-sae}, %zmm11, %zmm7, %zmm12

	/* table values */
	vsubpd	{rn-sae}, %zmm10, %zmm9, %zmm3
	vmovups	poly_coeff5+__svml_datanh_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff4+__svml_datanh_data_internal_avx512(%rip), %zmm9

	/* K*L2H + Th */
	vmovups	L2H+__svml_datanh_data_internal_avx512(%rip), %zmm10

	/* K*L2L + Tl */
	vmovups	L2L+__svml_datanh_data_internal_avx512(%rip), %zmm11
	vfmadd213pd	{rn-sae}, %zmm7, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm7, %zmm15, %zmm4
	vmovups	poly_coeff1+__svml_datanh_data_internal_avx512(%rip), %zmm7
	vfmadd231pd	{rn-sae}, %zmm5, %zmm10, %zmm3
	vfmadd213pd	{rn-sae}, %zmm12, %zmm11, %zmm5
	vfmadd213pd	{rn-sae}, %zmm9, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm9, %zmm15, %zmm4
	vfmadd213pd	{rn-sae}, %zmm13, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm13, %zmm15, %zmm4
	vfmadd213pd	{rn-sae}, %zmm14, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm14, %zmm15, %zmm4
	vfmadd213pd	{rn-sae}, %zmm7, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm7, %zmm15, %zmm4
	vfmadd213pd	{rn-sae}, %zmm8, %zmm0, %zmm6
	vfmadd213pd	{rn-sae}, %zmm8, %zmm15, %zmm4

	/* (K*L2L + Tl) + Rp*PolyP */
	vfmadd213pd	{rn-sae}, %zmm5, %zmm0, %zmm6
	vorpd	Half+__svml_datanh_data_internal_avx512(%rip), %zmm1, %zmm0
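	/* zmm1 holds the sign bit of x (x XOR |x|); OR-ing it into 0.5
	 * yields +-0.5, applying both the outer 0.5 factor and the sign of
	 * the odd function atanh in the final multiply.  */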

	/* (K*L2L + Tl) + Rp*PolyP - Rm*PolyM */
	vfnmadd213pd	{rn-sae}, %zmm6, %zmm15, %zmm4
	vaddpd	{rn-sae}, %zmm4, %zmm3, %zmm1
	vmulpd	{rn-sae}, %zmm0, %zmm1, %zmm0
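	/* Per lane, the value assembled here is
	 *   +-0.5 * ((Km-Kp)*log(2) + (Tm-Tp) + Rp*PolyP - Rm*PolyM)
	 * = 0.5 * (log(1+y) - log(1-y)) with the sign of x.  */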
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm2

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

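/* What the special-case path below does, as a scalar C sketch
   (hypothetical names; the assembly operates on the spilled copies of
   the input and result vectors):

	double in[8], out[8];     // spilled zmm2 / zmm0
	int mask = edx;           // one bit per out-of-range lane
	for (int i = 0; i < 8; i++)
	  if (mask & (1 << i))
	    out[i] = atanh (in[i]);

   Only lanes flagged by the range mask are recomputed; the vector
   result for all other lanes is kept.  */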
L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm2, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	atanh@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_atanh_skx)

	.section .rodata, "a"
	.align 64

#ifdef __svml_datanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl_H[16][2];
	__declspec(align(64)) VUINT32 Log_tbl_L[16][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 AbsMask[8][2];
	__declspec(align(64)) VUINT32 AddB5[8][2];
	__declspec(align(64)) VUINT32 RcpBitMask[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 poly_coeff0[8][2];
	__declspec(align(64)) VUINT32 Half[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
} __svml_datanh_data_internal_avx512;
#endif
__svml_datanh_data_internal_avx512:
	/* Log_tbl_H */
	.quad	0x0000000000000000
	.quad	0x3faf0a30c0100000
	.quad	0x3fbe27076e2a0000
	.quad	0x3fc5ff3070a80000
	.quad	0x3fcc8ff7c79b0000
	.quad	0x3fd1675cabab8000
	.quad	0x3fd4618bc21c8000
	.quad	0x3fd739d7f6bc0000
	.quad	0x3fd9f323ecbf8000
	.quad	0x3fdc8ff7c79a8000
	.quad	0x3fdf128f5faf0000
	.quad	0x3fe0be72e4254000
	.quad	0x3fe1e85f5e704000
	.quad	0x3fe307d7334f0000
	.quad	0x3fe41d8fe8468000
	.quad	0x3fe52a2d265bc000
	/* Log_tbl_L */
	.align 64
	.quad	0x0000000000000000
	.quad	0x3d662a6617cc9717
	.quad	0x3d6e5cbd3d50fffc
	.quad	0xbd6b0b0de3077d7e
	.quad	0xbd697794f689f843
	.quad	0x3d630701ce63eab9
	.quad	0xbd609ec17a426426
	.quad	0xbd67fcb18ed9d603
	.quad	0x3d584bf2b68d766f
	.quad	0x3d5a21ac25d81ef3
	.quad	0x3d3bb2cd720ec44c
	.quad	0xbd657d49676844cc
	.quad	0x3d1a07bd8b34be7c
	.quad	0x3d60be1fb590a1f5
	.quad	0xbd5aa33736867a17
	.quad	0x3d46abb9df22bc57
	/* One */
	.align 64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* AbsMask */
	.align 64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* AddB5 */
	.align 64
	.quad	0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
	/* RcpBitMask */
	.align 64
	.quad	0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
	/* poly_coeff8 */
	.align 64
	.quad	0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142
	/* poly_coeff7 */
	.align 64
	.quad	0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70
	/* poly_coeff6 */
	.align 64
	.quad	0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8
	/* poly_coeff5 */
	.align 64
	.quad	0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5
	/* poly_coeff4 */
	.align 64
	.quad	0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a
	/* poly_coeff3 */
	.align 64
	.quad	0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01
	/* poly_coeff2 */
	.align 64
	.quad	0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462
	/* poly_coeff1 */
	.align 64
	.quad	0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5
	/* poly_coeff0 */
	.align 64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* Half */
	.align 64
	.quad	0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
	/* L2H = log(2)_high */
	.align 64
	.quad	0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
	/* L2L = log(2)_low */
	.align 64
	.quad	0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
	.align 64
	.type	__svml_datanh_data_internal_avx512, @object
	.size	__svml_datanh_data_internal_avx512, .-__svml_datanh_data_internal_avx512