/* Function atanhf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)),
 *   using a small lookup table that maps to AVX-512 permute
 *   instructions.
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
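
/* For reference, a scalar C sketch of the same strategy (illustrative
   only; the vector code below never calls logf, it reconstructs the
   logarithm from exponent, table, and polynomial parts):

	#include <math.h>

	static float
	atanhf_ref (float x)
	{
	  // atanh(x) = 0.5 * log((1 + x) / (1 - x)); the log term is
	  // computed from |x| and the sign of x is restored at the end.
	  float y = fabsf (x);
	  float r = 0.5f * logf ((1.0f + y) / (1.0f - y));
	  return copysignf (r, x);
	}
*/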

/* Offsets for data tables __svml_satanh_data_internal_avx512 and
   __svml_satanh_data_internal_avx512_al64. Ordered by use in the
   function. On cold starts this might help the prefetcher. Possibly
   a better idea is to interleave start/end so that the prefetcher is
   less likely to detect a stream and pull irrelevant lines into
   cache.  */

/* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as
   the memory is broadcast to {1to16}.  */
#define AbsMask 0

/* Offsets into __svml_satanh_data_internal_avx512_al64. The full
   64-byte vector is loaded here.  */
#define One 0
#define AddB5 64
#define RcpBitMask 128
#define Log_tbl_L_lo 192
#define Log_tbl_L_hi 256
#define Log_tbl_H_lo 320
#define Log_tbl_H_hi 384
#define L2H 448
#define L2L 512
#define poly_coeff3 576
#define poly_coeff2 640
#define poly_coeff1 704

#include <sysdep.h>

#define ATANHF_DATA(x) ((x) + __svml_satanh_data_internal_avx512_al64)

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanhf_skx)
	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
	vmovups	ATANHF_DATA(One)(%rip), %zmm4

	/* 1 + y.  */
	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9

	/* 1 - y.  */
	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8

	/* Round reciprocals to 1 + 5b mantissas.  */
	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1

	/* RcpP ~ 1/Yp.  */
	vrcp14ps %zmm9, %zmm12

	/* RcpM ~ 1/Ym.  */
	vrcp14ps %zmm8, %zmm13

	/* Yp_high.  */
	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2

	/* -Ym_high.  */
	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5

	/* Round RcpP/RcpM: add half an ulp of the kept mantissa field,
	   then mask the rest off below. (The range check for inputs
	   outside (-1, 1) is done later with vcmpps.)  */
	vpaddd	%zmm14, %zmm12, %zmm15
	vpaddd	%zmm14, %zmm13, %zmm12

	/* Yp_low.  */
	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
	vandps	%zmm1, %zmm15, %zmm7
	vandps	%zmm1, %zmm12, %zmm12

	/* Ym_low.  */
	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5

	/* Reduced argument: Rp = (RcpP * Yp - 1) + RcpP * Yp_low.  */
	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9

	/* Reduced argument: Rm = (RcpM * Ym - 1) + RcpM * Ym_low.  */
	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
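
	/* A rough scalar model of the rounding and reduction above (a
	   sketch for exposition; bits()/tofloat() are hypothetical
	   bit-cast helpers and rcp14() stands for the 14-bit-accurate
	   hardware reciprocal):

		// Round 1/y so that its mantissa keeps only 5 bits:
		// AddB5 adds half an ulp of the kept field, RcpBitMask
		// (0xfffc0000) truncates the rest.
		uint32_t r = bits (rcp14 (y)) + 0x00020000;
		r &= 0xfffc0000;
		// R = r*y - 1 is then small, and the 5 kept mantissa
		// bits of r (bits 22:18) index the 32-entry log tables.
		float R = tofloat (r) * y - 1.0f;
	*/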

	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13

	/* Exponents.  */
	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
	vgetexpps {sae}, %zmm7, %zmm15

	/* Table lookups.  */
	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
	vgetexpps {sae}, %zmm12, %zmm14

	/* Prepare table indices.  */
	vpsrld	$18, %zmm7, %zmm3
	vpsrld	$18, %zmm12, %zmm2
	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7

	/* Km - Kp.  */
	vmovaps	%zmm3, %zmm5
	vpermi2ps %zmm13, %zmm10, %zmm3
	vpermt2ps %zmm13, %zmm2, %zmm10
	vpermi2ps %zmm7, %zmm11, %zmm5
	vpermt2ps %zmm7, %zmm2, %zmm11
	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7

	/* K*L2H + Th.  */
	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2

	/* K*L2L + Tl.  */
	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3

	/* Table values.  */
	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
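
	/* At this point (reading off the dataflow above, stated for
	   exposition): zmm5 ~ (Km - Kp)*L2H + (Th_m - Th_p) is the high
	   part and zmm1 ~ (Km - Kp)*L2L + (Tl_m - Tl_p) the low part of
	   log(Yp) - log(Ym), still missing the polynomial corrections in
	   Rp and Rm that are added below.  */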

	/* Polynomials.  */
	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
	vmovaps	%zmm10, %zmm14
	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
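
	/* The two FMA chains above evaluate, by Horner's scheme,
	   Poly(R) = 1 + c1*R + c2*R^2 + c3*R^3 for R in {Rp, Rm}, so
	   that log1p(R) ~ R * Poly(R). Reading the data tables below,
	   c1 = -0.5, c2 ~ 0.33337, c3 ~ -0.25006.  */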

	/* (K*L2L + Tl) + Rp*PolyP.  */
	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10

	/* zmm12 = zmm12 & (zmm4 | zmm0).  */
	vpternlogq $0xe0, %zmm0, %zmm4, %zmm12
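	/* Since zmm4 = 1.0f and zmm12 = -0.5f (poly_coeff1) here, the
	   ternlog reduces to zmm12 := copysign (0.5f, x):
	   0xbf000000 & (0x3f800000 | x) == 0x3f000000 | (x & 0x80000000).  */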

	/* (K*L2L + Tl) + Rp*PolyP - Rm*PolyM.  */
	vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14
	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8

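	/* Input outside (-1, 1), or NaN? Predicate 21 is NLT_UQ: the
	   mask bit is set where |x| >= 1.0 or x is unordered.  */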
	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
	kmovw	%k0, %edx
	testl	%edx, %edx

	/* Go to the special-inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12
	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0

	/* No registers to restore on the fast path.  */
	ret

	/* Cold case. edx has 1s where there was a special value that
	   needs to be handled by an atanhf call. Optimize for code size
	   more so than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
	/* Use r13 to save/restore the stack. This allows us to use rbp
	   as a callee-save register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* We need callee-save registers to preserve state across the
	   atanhf calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align the stack and make room for 2x zmm vectors.  */
	andq	$-64, %rsp
	addq	$-128, %rsp
	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
	vmovaps	%zmm1, (%rsp)
	vmovaps	%zmm0, 64(%rsp)
	vzeroupper

	/* edx has 1s where there was a special value that needs to be
	   handled by an atanhf call.  */
	movl	%edx, %ebx
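
	/* In C terms, the loop below does roughly the following (a
	   sketch; ctz()/clear_lowest() are hypothetical helpers standing
	   in for tzcnt/blsr, with in at 64(%rsp) and out at (%rsp)):

		while (mask != 0)
		  {
		    int i = ctz (mask);		// index of next lane
		    out[i] = atanhf (in[i]);
		    mask = clear_lowest (mask);	// mask &= mask - 1
		  }
	*/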
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special value, saved across the
	   calls to atanhf. We technically don't need a callee-save
	   register here, as the offset from rsp is always in [0, 60], so
	   we could restore rsp by realigning to 64. Essentially the
	   tradeoff is 1 extra save/restore vs 2 extra instructions in
	   the loop. Realigning also costs more code size.  */
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process a special input.  */
	vmovss	64(%rsp, %rbp, 4), %xmm0
	call	atanhf@PLT

	/* No good way to avoid the store-forwarding fault this will
	   cause on return. `lfence` avoids the SF fault but at greater
	   cost, as it serializes the stack/callee-save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15

	/* All results have been written to (%rsp).  */
	vmovaps	(%rsp), %zmm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee-save registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVeN16v_atanhf_skx)

	.section .rodata, "a"
	.align	4
#ifdef __svml_satanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct
{
	__declspec(align(4)) VUINT32 AbsMask[1][1];
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 AddB5[16][1];
	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
	__declspec(align(64)) VUINT32 L2H[16][1];
	__declspec(align(64)) VUINT32 L2L[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_satanh_data_internal_avx512;
#endif
__svml_satanh_data_internal_avx512:
	/* Leave this at the front so we can potentially save space due
	   to the smaller alignment constraint.  */
	.align	4
	/* AbsMask.  */
	.long	0x7fffffff
	.align	64
__svml_satanh_data_internal_avx512_al64:
	/* One.  */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* AddB5.  */
	.align	64
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	/* RcpBitMask.  */
	.align	64
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	/* Log_tbl_L_lo.  */
	.align	64
	.long	0x00000000
	.long	0x3726c39e
	.long	0x38a30c01
	.long	0x37528ae5
	.long	0x38e0edc5
	.long	0xb8ab41f8
	.long	0xb7cf8f58
	.long	0x3896a73d
	.long	0xb5838656
	.long	0x380c36af
	.long	0xb8235454
	.long	0x3862bae1
	.long	0x38c5e10e
	.long	0x38dedfac
	.long	0x38ebfb5e
	.long	0xb8e63c9f
	/* Log_tbl_L_hi.  */
	.align	64
	.long	0xb85c1340
	.long	0x38777bcd
	.long	0xb6038656
	.long	0x37d40984
	.long	0xb8b85028
	.long	0xb8ad5a5a
	.long	0x3865c84a
	.long	0x38c3d2f5
	.long	0x383ebce1
	.long	0xb8a1ed76
	.long	0xb7a332c4
	.long	0xb779654f
	.long	0xb8602f73
	.long	0x38f85db0
	.long	0x37b4996f
	.long	0xb8bfb3ca
	/* Log_tbl_H_lo.  */
	.align	64
	.long	0x00000000
	.long	0x3cfc0000
	.long	0x3d780000
	.long	0x3db78000
	.long	0x3df10000
	.long	0x3e14c000
	.long	0x3e300000
	.long	0x3e4a8000
	.long	0x3e648000
	.long	0x3e7dc000
	.long	0x3e8b4000
	.long	0x3e974000
	.long	0x3ea30000
	.long	0x3eae8000
	.long	0x3eb9c000
	.long	0x3ec4e000
	/* Log_tbl_H_hi.  */
	.align	64
	.long	0x3ecfa000
	.long	0x3eda2000
	.long	0x3ee48000
	.long	0x3eeea000
	.long	0x3ef8a000
	.long	0x3f013000
	.long	0x3f05f000
	.long	0x3f0aa000
	.long	0x3f0f4000
	.long	0x3f13d000
	.long	0x3f184000
	.long	0x3f1ca000
	.long	0x3f20f000
	.long	0x3f252000
	.long	0x3f295000
	.long	0x3f2d7000
	/* L2H = log(2) high part.  */
	.align	64
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	/* L2L = log(2) low part.  */
	.align	64
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	/* poly_coeff3.  */
	.align	64
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	/* poly_coeff2.  */
	.align	64
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	/* poly_coeff1.  */
	.align	64
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.align	64
	.type	__svml_satanh_data_internal_avx512_al64, @object
	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
	.type	__svml_satanh_data_internal_avx512, @object
	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512