1 | /* Function atanhf vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)) |
 *      using a small lookup table that maps to AVX-512 permute instructions
24 | * |
25 | * Special cases: |
26 | * |
27 | * atanh(0) = 0 |
28 | * atanh(+1) = +INF |
29 | * atanh(-1) = -INF |
30 | * atanh(x) = NaN if |x| > 1, or if x is a NaN or INF |
31 | * |
32 | */ |
33 | |
34 | /* Offsets for data table __svml_satanh_data_internal_avx512 and |
35 | __svml_satanh_data_internal_avx512_al64. Ordered by use in the |
36 | function. On cold-starts this might help the prefetcher. Possibly |
37 | a better idea is to interleave start/end so that the prefetcher is |
   less likely to detect a stream and pull irrelevant lines into
39 | cache. */ |
40 | |
41 | /* Offset into __svml_satanh_data_internal_avx512. 4-byte aligned as |
42 | the memory is broadcast to {1to16}. */ |
43 | #define AbsMask 0 |
44 | |
45 | /* Offset into __svml_satanh_data_internal_avx512_al64. The full value |
46 | is used here. */ |
47 | #define One 0 |
48 | #define AddB5 64 |
49 | #define RcpBitMask 128 |
50 | #define Log_tbl_L_lo 192 |
51 | #define Log_tbl_L_hi 256 |
52 | #define Log_tbl_H_lo 320 |
53 | #define Log_tbl_H_hi 384 |
54 | #define L2H 448 |
55 | #define L2L 512 |
56 | #define poly_coeff3 576 |
57 | #define poly_coeff2 640 |
58 | #define poly_coeff1 704 |
59 | |
60 | #include <sysdep.h> |
61 | |
62 | #define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal_avx512_al64) |
63 | |
64 | .section .text.evex512, "ax" , @progbits |
65 | ENTRY(_ZGVeN16v_atanhf_skx) |
66 | vandps AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6 |
67 | vmovups ATANHF_DATA(One)(%rip), %zmm4 |
68 | |
69 | /* 1+y */ |
70 | vaddps {rn-sae}, %zmm4, %zmm6, %zmm9 |
71 | |
72 | /* 1-y */ |
73 | vsubps {rn-sae}, %zmm6, %zmm4, %zmm8 |
74 | |
75 | /* round reciprocals to 1+5b mantissas */ |
76 | vmovups ATANHF_DATA(AddB5)(%rip), %zmm14 |
77 | vmovups ATANHF_DATA(RcpBitMask)(%rip), %zmm1 |
78 | |
79 | /* RcpP ~ 1/Yp */ |
80 | vrcp14ps %zmm9, %zmm12 |
81 | |
82 | /* RcpM ~ 1/Ym */ |
83 | vrcp14ps %zmm8, %zmm13 |
84 | |
85 | /* Yp_high */ |
86 | vsubps {rn-sae}, %zmm4, %zmm9, %zmm2 |
87 | |
88 | /* -Ym_high */ |
89 | vsubps {rn-sae}, %zmm4, %zmm8, %zmm5 |
90 | |
91 | |
92 | /* input outside (-1, 1) ? */ |
93 | vpaddd %zmm14, %zmm12, %zmm15 |
94 | vpaddd %zmm14, %zmm13, %zmm12 |
95 | |
96 | /* Yp_low */ |
97 | vsubps {rn-sae}, %zmm2, %zmm6, %zmm3 |
98 | vandps %zmm1, %zmm15, %zmm7 |
99 | vandps %zmm1, %zmm12, %zmm12 |
100 | |
101 | /* Ym_low */ |
102 | vaddps {rn-sae}, %zmm5, %zmm6, %zmm5 |
103 | |
104 | /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */ |
105 | vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9 |
106 | |
107 | /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */ |
108 | vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8 |
109 | |
110 | vmovups ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10 |
111 | vmovups ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13 |
112 | |
113 | /* exponents */ |
114 | vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9 |
115 | vgetexpps {sae}, %zmm7, %zmm15 |
116 | |
117 | |
118 | /* Table lookups */ |
119 | vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8 |
120 | vgetexpps {sae}, %zmm12, %zmm14 |
121 | |
122 | |
123 | /* Prepare table index */ |
124 | vpsrld $18, %zmm7, %zmm3 |
125 | vpsrld $18, %zmm12, %zmm2 |
126 | vmovups ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11 |
127 | vmovups ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7 |
128 | /* Km-Kp */ |
129 | |
130 | vmovaps %zmm3, %zmm5 |
131 | vpermi2ps %zmm13, %zmm10, %zmm3 |
132 | vpermt2ps %zmm13, %zmm2, %zmm10 |
133 | vpermi2ps %zmm7, %zmm11, %zmm5 |
134 | vpermt2ps %zmm7, %zmm2, %zmm11 |
135 | vsubps {rn-sae}, %zmm15, %zmm14, %zmm1 |
136 | vsubps {rn-sae}, %zmm3, %zmm10, %zmm7 |
137 | |
138 | /* K*L2H + Th */ |
139 | vmovups ATANHF_DATA(L2H)(%rip), %zmm2 |
140 | |
141 | /* K*L2L + Tl */ |
142 | vmovups ATANHF_DATA(L2L)(%rip), %zmm3 |
143 | |
144 | /* table values */ |
145 | vsubps {rn-sae}, %zmm5, %zmm11, %zmm5 |
146 | vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5 |
147 | vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1 |
148 | /* polynomials */ |
149 | vmovups ATANHF_DATA(poly_coeff3)(%rip), %zmm7 |
150 | vmovups ATANHF_DATA(poly_coeff2)(%rip), %zmm10 |
151 | vmovaps %zmm10, %zmm14 |
152 | vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10 |
153 | vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14 |
154 | vmovups ATANHF_DATA(poly_coeff1)(%rip), %zmm12 |
155 | vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10 |
156 | vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14 |
157 | vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10 |
158 | vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14 |
159 | |
160 | /* (K*L2L + Tl) + Rp*PolyP */ |
161 | vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10 |
162 | |
163 | /* zmm12 = zmm12 & (zmm4 | zmm0). */ |
164 | vpternlogq $0xe0, %zmm0, %zmm4, %zmm12 |
165 | |
166 | /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */ |
167 | vfnmadd213ps {rn-sae}, %zmm5, %zmm8, %zmm14 |
168 | vaddps {rn-sae}, %zmm14, %zmm10, %zmm8 |
169 | |
170 | vcmpps $21, {sae}, %zmm4, %zmm6, %k0 |
171 | kmovw %k0, %edx |
172 | testl %edx, %edx |
173 | |
174 | /* Go to special inputs processing branch */ |
175 | jne L(SPECIAL_VALUES_BRANCH) |
176 | # LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12 |
177 | vmulps {rn-sae}, %zmm12, %zmm8, %zmm0 |
178 | |
179 | /* No register to restore on fast path. */ |
180 | ret |
181 | |
182 | /* Cold case. edx has 1s where there was a special value that |
183 | needs to be handled by a atanhf call. Optimize for code size |
184 | more so than speed here. */ |
185 | L(SPECIAL_VALUES_BRANCH): |
186 | # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12 |
187 | /* Use r13 to save/restore the stack. This allows us to use rbp as |
188 | callee save register saving code size. */ |
189 | pushq %r13 |
190 | cfi_adjust_cfa_offset(8) |
191 | cfi_offset(r13, -16) |
192 | /* Need to callee save registers to preserve state across tanhf calls. |
193 | */ |
194 | pushq %rbx |
195 | cfi_adjust_cfa_offset(8) |
196 | cfi_offset(rbx, -24) |
197 | pushq %rbp |
198 | cfi_adjust_cfa_offset(8) |
199 | cfi_offset(rbp, -32) |
200 | movq %rsp, %r13 |
201 | cfi_def_cfa_register(r13) |
202 | |
203 | /* Align stack and make room for 2x zmm vectors. */ |
204 | andq $-64, %rsp |
205 | addq $-128, %rsp |
206 | vmulps {rn-sae}, %zmm12, %zmm8, %zmm1 |
207 | vmovaps %zmm1, (%rsp) |
208 | vmovaps %zmm0, 64(%rsp) |
209 | vzeroupper |
210 | |
211 | /* edx has 1s where there was a special value that needs to be handled |
212 | by a atanhf call. */ |
213 | movl %edx, %ebx |
214 | L(SPECIAL_VALUES_LOOP): |
215 | # LOE rbx rbp r12 r13 r14 r15 |
216 | /* use rbp as index for special value that is saved across calls to |
217 | atanhf. We technically don't need a callee save register here as offset |
218 | to rsp is always [0, 56] so we can restore rsp by realigning to 64. |
219 | Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions |
220 | in the loop. Realigning also costs more code size. */ |
221 | xorl %ebp, %ebp |
222 | tzcntl %ebx, %ebp |
223 | |
224 | /* Scalar math function call to process special input. */ |
225 | vmovss 64(%rsp, %rbp, 4), %xmm0 |
226 | call atanhf@PLT |
227 | |
228 | /* No good way to avoid the store-forwarding fault this will cause on |
229 | return. `lfence` avoids the SF fault but at greater cost as it |
230 | serialized stack/callee save restoration. */ |
231 | vmovss %xmm0, (%rsp, %rbp, 4) |
232 | |
233 | blsrl %ebx, %ebx |
234 | jnz L(SPECIAL_VALUES_LOOP) |
235 | # LOE r12 r13 r14 r15 |
236 | |
237 | /* All results have been written to (%rsp). */ |
238 | vmovaps (%rsp), %zmm0 |
239 | /* Restore rsp. */ |
240 | movq %r13, %rsp |
241 | cfi_def_cfa_register(rsp) |
242 | /* Restore callee save registers. */ |
243 | popq %rbp |
244 | cfi_adjust_cfa_offset(-8) |
245 | cfi_restore(rbp) |
246 | popq %rbx |
247 | cfi_adjust_cfa_offset(-8) |
248 | cfi_restore(rbp) |
249 | popq %r13 |
250 | cfi_adjust_cfa_offset(-8) |
251 | cfi_restore(r13) |
252 | ret |
253 | END(_ZGVeN16v_atanhf_skx) |
254 | |
	.section .rodata, "a"
	.align	4
#ifdef __svml_satanh_data_internal_avx512_typedef
/* C-style description of the table layout below.  Guarded out of the
   actual build; kept as layout documentation only.  */
typedef unsigned int VUINT32;
typedef struct{
	__declspec(align(4)) VUINT32 AbsMask[1][1];
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 AddB5[16][1];
	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
	__declspec(align(64)) VUINT32 L2H[16][1];
	__declspec(align(64)) VUINT32 L2L[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_satanh_data_internal_avx512;
#endif
__svml_satanh_data_internal_avx512:
	/* Leave this at front so we can potentially save space due to
	   smaller alignment constraint.  */
	.align	4
	/* AbsMask: clears the sign bit; broadcast with {1to16} in code.  */
	.long	0x7fffffff
	.align	64
__svml_satanh_data_internal_avx512_al64:
	/* One = 1.0f broadcast across 16 lanes.  */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* AddB5: rounding bias added to vrcp14ps output before masking,
	   to round reciprocals to 1+5b mantissas.  */
	.align	64
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	/* RcpBitMask: keeps sign/exponent plus top 5 mantissa bits.  */
	.align	64
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	/* Log_tbl_L_lo: low parts of the log table, entries 0-15
	   (selected with vpermi2ps/vpermt2ps against Log_tbl_L_hi).  */
	.align	64
	.long	0x00000000
	.long	0x3726c39e
	.long	0x38a30c01
	.long	0x37528ae5
	.long	0x38e0edc5
	.long	0xb8ab41f8
	.long	0xb7cf8f58
	.long	0x3896a73d
	.long	0xb5838656
	.long	0x380c36af
	.long	0xb8235454
	.long	0x3862bae1
	.long	0x38c5e10e
	.long	0x38dedfac
	.long	0x38ebfb5e
	.long	0xb8e63c9f
	/* Log_tbl_L_hi: low parts of the log table, entries 16-31.  */
	.align	64
	.long	0xb85c1340
	.long	0x38777bcd
	.long	0xb6038656
	.long	0x37d40984
	.long	0xb8b85028
	.long	0xb8ad5a5a
	.long	0x3865c84a
	.long	0x38c3d2f5
	.long	0x383ebce1
	.long	0xb8a1ed76
	.long	0xb7a332c4
	.long	0xb779654f
	.long	0xb8602f73
	.long	0x38f85db0
	.long	0x37b4996f
	.long	0xb8bfb3ca
	/* Log_tbl_H_lo: high parts of the log table, entries 0-15.  */
	.align	64
	.long	0x00000000
	.long	0x3cfc0000
	.long	0x3d780000
	.long	0x3db78000
	.long	0x3df10000
	.long	0x3e14c000
	.long	0x3e300000
	.long	0x3e4a8000
	.long	0x3e648000
	.long	0x3e7dc000
	.long	0x3e8b4000
	.long	0x3e974000
	.long	0x3ea30000
	.long	0x3eae8000
	.long	0x3eb9c000
	.long	0x3ec4e000
	/* Log_tbl_H_hi: high parts of the log table, entries 16-31.  */
	.align	64
	.long	0x3ecfa000
	.long	0x3eda2000
	.long	0x3ee48000
	.long	0x3eeea000
	.long	0x3ef8a000
	.long	0x3f013000
	.long	0x3f05f000
	.long	0x3f0aa000
	.long	0x3f0f4000
	.long	0x3f13d000
	.long	0x3f184000
	.long	0x3f1ca000
	.long	0x3f20f000
	.long	0x3f252000
	.long	0x3f295000
	.long	0x3f2d7000
	/* L2H = log(2)_high (~0.693359f).  */
	.align	64
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	/* L2L = log(2)_low (correction so L2H+L2L ~ log(2) to extra
	   precision).  */
	.align	64
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	/* poly_coeff3 ~ -0.25f: degree-3 coefficient of the log(1+r)
	   polynomial.  */
	.align	64
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	/* poly_coeff2 ~ 1/3: degree-2 coefficient.  */
	.align	64
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	/* poly_coeff1 = -0.5f: degree-1 coefficient; also reused in the
	   code to build the final copysign(0.5, x) scale.  */
	.align	64
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.align	64
	.type	__svml_satanh_data_internal_avx512_al64, @object
	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
	.type	__svml_satanh_data_internal_avx512, @object
	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
407 | |