1/* Function acoshf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute acosh(x) as log(x + sqrt(x*x - 1))
23 * using RSQRT instructions for starting the
24 * square root approximation, and small table lookups for log
25 * that map to AVX-512 permute instructions
26 *
27 * Special cases:
28 *
29 * acosh(NaN) = quiet NaN, and raise invalid exception
30 * acosh(-INF) = NaN
31 * acosh(+INF) = +INF
32 * acosh(x) = NaN if x < 1
33 * acosh(1) = +0
34 *
35 */
36
37/* Offsets for data table __svml_sacosh_data_internal_avx512
38 */
39#define Log_tbl_H 0
40#define Log_tbl_L 128
41#define One 256
42#define SmallThreshold 320
43#define Threshold 384
44#define LargeThreshold 448
45#define ca1 512
46#define c2s 576
47#define c1s 640
48#define AddB5 704
49#define RcpBitMask 768
50#define OneEighth 832
51#define Four 896
52#define poly_coeff3 960
53#define poly_coeff2 1024
54#define poly_coeff1 1088
55#define L2H 1152
56#define L2L 1216
57
58#include <sysdep.h>
59
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_acoshf_skx)
	/* Build a 64-byte-aligned frame with 192 bytes of scratch space;
	   the special-values path below spills the input and result
	   vectors (and r12-r14) into it.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	One+__svml_sacosh_data_internal_avx512(%rip), %zmm1

	/*
	 * sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s
	 * poly_s = c1+c2*Eh
	 */
	vmovups	c2s+__svml_sacosh_data_internal_avx512(%rip), %zmm13
	vmovups	c1s+__svml_sacosh_data_internal_avx512(%rip), %zmm15

	/* polynomial computation for small inputs */
	vmovups	ca1+__svml_sacosh_data_internal_avx512(%rip), %zmm9

	/* very large inputs ? */
	vmovups	Threshold+__svml_sacosh_data_internal_avx512(%rip), %zmm10

	/* out of range inputs? */
	vmovups	LargeThreshold+__svml_sacosh_data_internal_avx512(%rip), %zmm11

	/* not a very small input ? */
	vmovups	SmallThreshold+__svml_sacosh_data_internal_avx512(%rip), %zmm6
	/* Keep the original argument in zmm8; the special-values path
	   spills it and re-reads individual lanes from the stack.  */
	vmovaps	%zmm0, %zmm8

	/* x^2 - 1 */
	vmovaps	%zmm1, %zmm7
	vfmsub231ps {rn-sae}, %zmm8, %zmm8, %zmm7
	/* k2: x >= Threshold (VCMPPS predicate 21 = NLT_US) -> "very
	   large input" lanes, handled by the scaled-argument fixups.  */
	vcmpps	$21, {sae}, %zmm10, %zmm8, %k2
	/* k0: x > LargeThreshold (predicate 22 = NLE_US; unordered, so
	   NaN lanes are caught too) -> out-of-range inputs.  */
	vcmpps	$22, {sae}, %zmm11, %zmm8, %k0
	/* k1: x <= 1.0 (predicate 18 = LE_OS) -> at/below the domain
	   edge; these lanes go to the scalar fallback.  */
	vcmpps	$18, {sae}, %zmm1, %zmm8, %k1
	/* R0 ~ 1/sqrt(x^2-1), 14-bit relative accuracy.  */
	vrsqrt14ps %zmm7, %zmm12
	/* k3: (x^2-1) >= SmallThreshold -> lanes that take the full
	   log() path; the complement keeps the small-input polynomial.  */
	vcmpps	$21, {sae}, %zmm6, %zmm7, %k3
	vmulps	{rn-sae}, %zmm9, %zmm7, %zmm4

	/* Sh ~sqrt(-1+x^2) */
	vmulps	{rn-sae}, %zmm12, %zmm7, %zmm5

	/* Sh+x */
	vaddps	{rn-sae}, %zmm8, %zmm5, %zmm9

	/* (Yh*R0)_low */
	vmovaps	%zmm7, %zmm0
	/* Merge the out-of-range masks.  korw sets ZF iff the result is
	   zero; no instruction between here and the jne below writes
	   EFLAGS, so that branch tests (k0 | k1) != 0.  */
	korw	%k0, %k1, %k0

	/* rel. error term: Eh=1-Sh*R0 */
	vmovaps	%zmm1, %zmm14
	vfmsub213ps {rn-sae}, %zmm5, %zmm12, %zmm0
	vfnmadd231ps {rn-sae}, %zmm5, %zmm12, %zmm14

	/* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
	vfnmadd231ps {rn-sae}, %zmm0, %zmm12, %zmm14

	/* Sh*Eh */
	vmulps	{rn-sae}, %zmm14, %zmm5, %zmm3
	vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm15

	/* Sl + Sh*Eh*poly_s */
	vfmadd213ps {rn-sae}, %zmm0, %zmm15, %zmm3

	/* Shh */
	vsubps	{rn-sae}, %zmm8, %zmm9, %zmm15

	/* polynomial computation for small inputs */
	vaddps	{rn-sae}, %zmm3, %zmm5, %zmm0

	/* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
	vaddps	{rn-sae}, %zmm3, %zmm9, %zmm2

	/* Shl */
	vsubps	{rn-sae}, %zmm15, %zmm5, %zmm10
	vfmadd231ps {rn-sae}, %zmm0, %zmm4, %zmm0

	/* fixup for very large inputs */
	vmovups	OneEighth+__svml_sacosh_data_internal_avx512(%rip), %zmm4

	/* Sl_high */
	vsubps	{rn-sae}, %zmm9, %zmm2, %zmm5

	/* polynomial */
	vmovups	poly_coeff3+__svml_sacosh_data_internal_avx512(%rip), %zmm9
	/* Very large lanes (k2): take log() of x/8 instead; the
	   exponent term is adjusted below to compensate.  */
	vmulps	{rn-sae}, %zmm4, %zmm8, %zmm2{%k2}

	/* -K*L2L + Tl */
	vmovups	L2L+__svml_sacosh_data_internal_avx512(%rip), %zmm4

	/* Sl_l */
	vsubps	{rn-sae}, %zmm5, %zmm3, %zmm3
	/* Rcp ~ 1/Xin, 14-bit relative accuracy.  */
	vrcp14ps %zmm2, %zmm11
	vmovups	Log_tbl_L+__svml_sacosh_data_internal_avx512(%rip), %zmm5

	/* Xin_low */
	vaddps	{rn-sae}, %zmm10, %zmm3, %zmm13

	/* round reciprocal to 1+4b mantissas */
	vpaddd	AddB5+__svml_sacosh_data_internal_avx512(%rip), %zmm11, %zmm12
	vmovups	poly_coeff1+__svml_sacosh_data_internal_avx512(%rip), %zmm10
	vandps	RcpBitMask+__svml_sacosh_data_internal_avx512(%rip), %zmm12, %zmm14

	/* fixup for very large inputs: drop the Xin_low correction.  */
	vxorps	%zmm13, %zmm13, %zmm13{%k2}

	/* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
	vfmsub231ps {rn-sae}, %zmm14, %zmm2, %zmm1

	/* exponents */
	vgetexpps {sae}, %zmm14, %zmm12
	vmovups	Four+__svml_sacosh_data_internal_avx512(%rip), %zmm2

	/* Prepare table index: top mantissa bits of the rounded
	   reciprocal select one of 32 log-table entries.  */
	vpsrld	$18, %zmm14, %zmm3
	vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm1
	vmovups	poly_coeff2+__svml_sacosh_data_internal_avx512(%rip), %zmm13

	/* Table lookups: vpermt2ps indexes across the 64-byte pair of
	   table halves with the 5-bit index in zmm3.  */
	vmovups	__svml_sacosh_data_internal_avx512(%rip), %zmm14
	/* exponent fixup for the very large (k2, x/8-scaled) lanes.  */
	vsubps	{rn-sae}, %zmm2, %zmm12, %zmm12{%k2}
	vpermt2ps Log_tbl_L+64+__svml_sacosh_data_internal_avx512(%rip), %zmm3, %zmm5
	vpermt2ps Log_tbl_H+64+__svml_sacosh_data_internal_avx512(%rip), %zmm3, %zmm14

	/* R^2 */
	vmulps	{rn-sae}, %zmm1, %zmm1, %zmm11

	/* -K*L2H + Th */
	vmovups	L2H+__svml_sacosh_data_internal_avx512(%rip), %zmm2
	vfmadd231ps {rn-sae}, %zmm1, %zmm9, %zmm13
	vfnmadd231ps {rn-sae}, %zmm12, %zmm2, %zmm14
	vfnmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm12
	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm13

	/* Tl + R^2*Poly */
	vfmadd213ps {rn-sae}, %zmm12, %zmm11, %zmm13

	/* R+Tl + R^2*Poly */
	vaddps	{rn-sae}, %zmm1, %zmm13, %zmm1
	/* Merge the log() result into zmm0 for k3 lanes only; lanes with
	   tiny x^2-1 keep the small-input polynomial already in zmm0.  */
	vaddps	{rn-sae}, %zmm1, %zmm14, %zmm0{%k3}

	/* Go to special inputs processing branch
	   (ZF still reflects the korw above: taken iff any lane had
	   x <= 1, x > FLT_MAX, or x = NaN).  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 k0 zmm0 zmm8

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill original args and the fast-path results; lanes flagged
	   in k0 are recomputed by the scalar acoshf below.  */
	vmovups	%zmm8, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 k0 zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax k0

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* r12d = current lane index, starting at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* r13d = 16-bit range mask of special lanes (from k0).  */
	kmovd	%k0, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the range mask: is this lane special?  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All 16 lanes processed: restore callee-saved registers and
	   reload the (now patched) result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* Load the offending lane from the spilled input, call the
	   scalar acoshf (which also raises the right FP exceptions /
	   sets errno), and patch its result into the spilled output.  */
	movl	%r12d, %r14d
	vmovss	64(%rsp, %r14, 4), %xmm0
	call	acoshf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_acoshf_skx)
303
	.section .rodata, "a"
	.align 64

#ifdef __svml_sacosh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl_H[32][1];
	__declspec(align(64)) VUINT32 Log_tbl_L[32][1];
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 SmallThreshold[16][1];
	__declspec(align(64)) VUINT32 Threshold[16][1];
	__declspec(align(64)) VUINT32 LargeThreshold[16][1];
	__declspec(align(64)) VUINT32 ca1[16][1];
	__declspec(align(64)) VUINT32 c2s[16][1];
	__declspec(align(64)) VUINT32 c1s[16][1];
	__declspec(align(64)) VUINT32 AddB5[16][1];
	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
	__declspec(align(64)) VUINT32 OneEighth[16][1];
	__declspec(align(64)) VUINT32 Four[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
	__declspec(align(64)) VUINT32 L2H[16][1];
	__declspec(align(64)) VUINT32 L2L[16][1];
} __svml_sacosh_data_internal_avx512;
#endif
__svml_sacosh_data_internal_avx512:
	/* Log_tbl_H: high parts of log() table, 32 entries selected by
	   the 5-bit index derived from the rounded reciprocal.  */
	.long	0x00000000
	.long	0xbcfc0000
	.long	0xbd788000
	.long	0xbdb78000
	.long	0xbdf14000
	.long	0xbe14a000
	.long	0xbe300000
	.long	0xbe4aa000
	.long	0xbe648000
	.long	0xbe7dc000
	.long	0xbe8b4000
	.long	0xbe974000
	.long	0xbea31000
	.long	0xbeae9000
	.long	0xbeb9d000
	.long	0xbec4d000
	.long	0xbecfa000
	.long	0xbeda2000
	.long	0xbee48000
	.long	0xbeeea000
	.long	0xbef89000
	.long	0xbf012800
	.long	0xbf05f000
	.long	0xbf0aa800
	.long	0xbf0f4000
	.long	0xbf13c800
	.long	0xbf184000
	.long	0xbf1ca000
	.long	0xbf20f000
	.long	0xbf252800
	.long	0xbf295000
	.long	0xbf2d6800
	/* Log_tbl_L: low (correction) parts of the log() table.  */
	.align	64
	.long	0x80000000
	.long	0xb726c39e
	.long	0x3839e7fe
	.long	0xb7528ae5
	.long	0x377891d5
	.long	0xb8297c10
	.long	0x37cf8f58
	.long	0x3852b186
	.long	0x35838656
	.long	0xb80c36af
	.long	0x38235454
	.long	0xb862bae1
	.long	0x37e87bc7
	.long	0x37848150
	.long	0x37202511
	.long	0xb74e1b05
	.long	0x385c1340
	.long	0xb8777bcd
	.long	0x36038656
	.long	0xb7d40984
	.long	0xb80f5faf
	.long	0xb8254b4c
	.long	0xb865c84a
	.long	0x37f0b42d
	.long	0xb83ebce1
	.long	0xb83c2513
	.long	0x37a332c4
	.long	0x3779654f
	.long	0x38602f73
	.long	0x367449f8
	.long	0xb7b4996f
	.long	0xb800986b
	/* One = 1.0f */
	.align	64
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* SmallThreshold = 2^-12 */
	.align	64
	.long	0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000, 0x39800000
	/* Threshold = 2^63 */
	.align	64
	.long	0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000, 0x5f000000
	/* LargeThreshold = FLT_MAX */
	.align	64
	.long	0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
	/* ca1 ~ -0.16666 (small-input polynomial coefficient) */
	.align	64
	.long	0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE, 0xbe2AA5DE
	/* c2s = 0.375 (sqrt correction polynomial) */
	.align	64
	.long	0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000, 0x3ec00000
	/* c1s = 0.5 (sqrt correction polynomial) */
	.align	64
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
	/* AddB5: rounding increment used before masking the reciprocal
	   mantissa (see "round reciprocal to 1+4b mantissas").  */
	.align	64
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
	/* RcpBitMask: keeps sign, exponent and top mantissa bits of the
	   rounded reciprocal.  */
	.align	64
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	/* OneEighth = 0.125 (scales very large arguments) */
	.align	64
	.long	0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000, 0x3e000000
	/* Four = 4.0 (exponent fixup for the scaled large-input path) */
	.align	64
	.long	0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000, 0x40800000
	/* poly_coeff3 ~ -0.25 (log() polynomial) */
	.align	64
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	/* poly_coeff2 ~ 0.33337 (log() polynomial) */
	.align	64
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	/* poly_coeff1 = -0.5 (log() polynomial) */
	.align	64
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	/* L2H = log(2)_high ~ 0.6931152 */
	.align	64
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	/* L2L = log(2)_low (residual of log(2) - L2H) */
	.align	64
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.align	64
	.type	__svml_sacosh_data_internal_avx512, @object
	.size	__svml_sacosh_data_internal_avx512, .-__svml_sacosh_data_internal_avx512
449

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S