/* Function acoshf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute acosh(x) as log(x + sqrt(x*x - 1))
 *
 *   Special cases:
 *
 *   acosh(NaN) = quiet NaN, and raise invalid exception
 *   acosh(-INF) = NaN
 *   acosh(+INF) = +INF
 *   acosh(x) = NaN if x < 1
 *   acosh(1) = +0
 *
 */
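
/*
 * For reference, a minimal scalar C sketch of the same formula and
 * special-case handling (illustrative only; acoshf_ref is a hypothetical
 * name and this code is not part of the glibc build):
 *
 *   #include <math.h>
 *
 *   static float
 *   acoshf_ref (float x)
 *   {
 *     if (isnan (x) || x < 1.0f)  // acosh(NaN) = NaN, acosh(x < 1) = NaN
 *       return NAN;               // (also covers acosh(-INF) = NaN)
 *     if (isinf (x))
 *       return INFINITY;          // acosh(+INF) = +INF
 *     return logf (x + sqrtf (x * x - 1.0f));  // acosh(1) = log(1) = +0
 *   }
 */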

/* Offsets for data table __svml_sacosh_data_internal
 */
#define sOne				0
#define sPoly				32
#define iBrkValue			288
#define iOffExpoMask			320
#define sBigThreshold			352
#define sC2				384
#define sC3				416
#define sHalf				448
#define sLargestFinite			480
#define sThirtyOne			512
#define sTopMask8			544
#define XScale				576
#define sLn2				608

#include <sysdep.h>

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_acoshf_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$96, %rsp

	/* Load constants, always including One = 1 */
	vmovups	sOne+__svml_sacosh_data_internal(%rip), %ymm2

	/* Finally, express Y + W = U * V accurately where Y has <= 8 bits */
	vmovups	sTopMask8+__svml_sacosh_data_internal(%rip), %ymm9

	/*
	 * Now 1 / (1 + d)
	 * = 1 / (1 + (sqrt(1 - e) - 1))
	 * = 1 / sqrt(1 - e)
	 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
	 * So compute the first three nonconstant terms of that, so that
	 * we have a relative correction (1 + Corr) to apply to S etc.
	 * C1 = 1/2
	 * C2 = 3/8
	 * C3 = 5/16
	 */
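
	/*
	 * As a minimal C sketch (hypothetical helper, for exposition only),
	 * the three-term correction computed by the FMA chain below is
	 * Horner's rule on e:
	 *
	 *   static float
	 *   rsqrt_corr (float e)   // |e| ~ 2^-8
	 *   {
	 *     const float C1 = 0.5f, C2 = 0.375f, C3 = 0.3125f;  // 1/2, 3/8, 5/16
	 *     return e * (C1 + e * (C2 + e * C3));  // Corr
	 *   }
	 */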
	vmovups	sC3+__svml_sacosh_data_internal(%rip), %ymm14
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm2, %ymm7
	vfmsub231ps %ymm3, %ymm3, %ymm7

	/*
	 * Check that 1 < X < +inf; otherwise go to the callout function.
	 * We need the callout for X = 1 to avoid division by zero below.
	 * This test ensures that callout handles NaN and either infinity.
	 */
	vcmpnle_uqps sLargestFinite+__svml_sacosh_data_internal(%rip), %ymm3, %ymm4
	vcmpngt_uqps %ymm2, %ymm3, %ymm5

	/*
	 * The following computation can go wrong for very large X, e.g.
	 * the X^2 - 1 = U * V can overflow.  But for large X we have
	 * acosh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
	 * we can just later stick X back into the log and tweak up the exponent.
	 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
	 * to stay in the safe range for the later log computation.
	 * Compute a flag now telling us when to do this.
	 */
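
	/*
	 * Worked form of that exponent tweak, assuming X >= 2^30 so the
	 * approximation error 1/(4 * X^2) is far below SP precision:
	 *
	 *   acosh(X) =~ log(2 * X)
	 *            = log(X * 2^-30) + 31 * ln(2)
	 *
	 * which is why XScale = 2^-30 and sThirtyOne = 31 appear below.
	 */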
	vcmplt_oqps sBigThreshold+__svml_sacosh_data_internal(%rip), %ymm3, %ymm1
	vandps	%ymm9, %ymm7, %ymm10

	/*
	 * Compute R = 1/sqrt(Y + W) * (1 + d)
	 * Force R to <= 8 significant bits.
	 * This means that R * Y and R^2 * Y are exactly representable.
	 */
	vrsqrtps %ymm10, %ymm8
	vsubps	%ymm10, %ymm7, %ymm11
	vandps	%ymm9, %ymm8, %ymm12
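
	/*
	 * Why the masking makes those products exact: sTopMask8 keeps the
	 * top 16 bits of each word (sign, 8 exponent bits, 7 explicit
	 * mantissa bits), i.e. 8 significant bits.  R and Y each carry
	 * <= 8 significant bits, so R * Y carries <= 16 and R^2 * Y <= 24,
	 * all within the 24-bit SP significand.
	 */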

	/*
	 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
	 * and T = (W/sqrt(Y + W)) * (1 + d)
	 * so that S + T = sqrt(Y + W) * (1 + d)
	 * S is exact, and the rounding error in T is OK.
	 */
	vmulps	%ymm12, %ymm10, %ymm15
	vmulps	%ymm11, %ymm12, %ymm0

	/* Now multiplex to the case X = 2^-30 * input, Xl = 0 in the "big" case. */
	vmulps	XScale+__svml_sacosh_data_internal(%rip), %ymm3, %ymm11

	/*
	 * Compute e = -(2 * d + d^2)
	 * The first FMR is exact, and the rounding error in the other is acceptable
	 * since d and e are ~ 2^-8
	 */
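
	/*
	 * Derivation, from the definitions above: with R = (1 + d)/sqrt(Y + W),
	 * S = R * Y and T = R * W,
	 *
	 *   e = 1 - R*S - R*T = 1 - R^2 * (Y + W) = 1 - (1 + d)^2 = -(2*d + d^2)
	 */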
	vmovaps	%ymm2, %ymm13
	vfnmadd231ps %ymm15, %ymm12, %ymm13
	vfnmadd231ps %ymm0, %ymm12, %ymm13
	vfmadd213ps sC2+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
	vfmadd213ps sHalf+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
	vmulps	%ymm14, %ymm13, %ymm7
	vorps	%ymm5, %ymm4, %ymm6

	/*
	 * For low-accuracy versions, the computation can be done
	 * just as U + ((S + T) + (S + T) * Corr)
	 */
	vaddps	%ymm0, %ymm15, %ymm5

	/* sU is needed later on */
	vsubps	%ymm2, %ymm3, %ymm4
	vfmadd213ps %ymm5, %ymm7, %ymm5
	vmovmskps %ymm6, %edx
	vaddps	%ymm5, %ymm4, %ymm6
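
	/*
	 * The sum just formed feeds log1p via the identity (a restatement
	 * of the algorithm description above, with U = X - 1):
	 *
	 *   acosh(X) = log(X + sqrt(X^2 - 1))
	 *            = log1p((X - 1) + sqrt(X^2 - 1))
	 *            = log1p(U + (S + T) * (1 + Corr))
	 */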

	/*
	 * Now resume the main code.
	 * reduction: compute r, n
	 */
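
	/*
	 * A minimal C sketch of this reduction (hypothetical helper, for
	 * exposition; it mirrors the vpsubd/vpand/vpsrad/vpaddd sequence
	 * below, splitting x > 0 into x = 2^n * r with r in [2/3, 4/3)):
	 *
	 *   #include <stdint.h>
	 *   #include <string.h>
	 *
	 *   static float
	 *   log_reduce (float x, int32_t *n)
	 *   {
	 *     const int32_t brk = 0x3f2aaaab;   // iBrkValue, bits of ~2/3
	 *     int32_t m, r;
	 *     memcpy (&m, &x, sizeof m);        // reinterpret as integer
	 *     m -= brk;
	 *     *n = m >> 23;                     // arithmetic shift: exponent n
	 *     r = (m & 0x007fffff) + brk;       // iOffExpoMask keeps mantissa
	 *     float out;
	 *     memcpy (&out, &r, sizeof out);
	 *     return out;                       // reduced argument r
	 *   }
	 */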
	vmovups	iBrkValue+__svml_sacosh_data_internal(%rip), %ymm4

	/*
	 * Now we feed into the log1p code, using H in place of _VARG1 and
	 * also adding L into Xl.
	 * compute 1+x as high, low parts
	 */
	vmaxps	%ymm6, %ymm2, %ymm8
	vminps	%ymm6, %ymm2, %ymm9
	vaddps	%ymm9, %ymm8, %ymm12
	vblendvps %ymm1, %ymm12, %ymm11, %ymm14
	vsubps	%ymm12, %ymm8, %ymm10
	vpsubd	%ymm4, %ymm14, %ymm15
	vaddps	%ymm10, %ymm9, %ymm13
	vpand	iOffExpoMask+__svml_sacosh_data_internal(%rip), %ymm15, %ymm14
	vpsrad	$23, %ymm15, %ymm15
	vpaddd	%ymm4, %ymm14, %ymm8
	vpslld	$23, %ymm15, %ymm5
	vmovups	sPoly+224+__svml_sacosh_data_internal(%rip), %ymm4
	vcvtdq2ps %ymm15, %ymm0
	vpsubd	%ymm5, %ymm2, %ymm7

	/* polynomial evaluation */
	vsubps	%ymm2, %ymm8, %ymm2

	/* Add 31 to the exponent in the "large" case to get log(2 * input) */
	vaddps	sThirtyOne+__svml_sacosh_data_internal(%rip), %ymm0, %ymm5
	vandps	%ymm1, %ymm13, %ymm6
	vmulps	%ymm7, %ymm6, %ymm9
	vblendvps %ymm1, %ymm0, %ymm5, %ymm0
	vaddps	%ymm2, %ymm9, %ymm2
	vfmadd213ps sPoly+192+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+160+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+128+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+96+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+64+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+32+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vmulps	%ymm4, %ymm2, %ymm6
	vfmadd213ps %ymm2, %ymm2, %ymm6

	/* final reconstruction */
	vfmadd132ps sLn2+__svml_sacosh_data_internal(%rip), %ymm6, %ymm0
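
	/*
	 * Scalar C sketch of the evaluation and reconstruction above
	 * (hypothetical helper, for exposition; constants are the sPoly and
	 * sLn2 values from the table below, shown rounded):
	 *
	 *   static float
	 *   log1p_eval (float r, float n)  // r = reduced arg - 1, n = exponent
	 *   {
	 *     static const float p[8] =    // P0 .. P7
	 *       { -0.5f, 0.33333266f, -0.25004238f, 0.2000792f,
	 *         -0.16472326f, 0.14042309f, -0.1512218f, 0.13820238f };
	 *     float poly = p[7];
	 *     for (int i = 6; i >= 0; i--) // the vfmadd213ps chain
	 *       poly = poly * r + p[i];
	 *     poly = poly * r * r + r;     // vmulps + final vfmadd213ps
	 *     return n * 0.69314718f + poly;  // sLn2 * n + poly
	 *   }
	 */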
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%ymm3, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0
	call	acoshf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 64(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
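
	/*
	 * In scalar C terms, the callout machinery above is roughly
	 * (hypothetical names, for exposition): for each of the 8 lanes
	 * whose range-mask bit is set, redo the computation with the
	 * scalar libm routine and patch the saved result vector:
	 *
	 *   for (int i = 0; i < 8; i++)   // %r12d walks the lanes
	 *     if (mask & (1 << i))        // btl %r12d, %r13d
	 *       out[i] = acoshf (in[i]);  // call acoshf@PLT
	 */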
END(_ZGVdN8v_acoshf_avx2)

	.section .rodata, "a"
	.align	32

#ifdef __svml_sacosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 sOne[8][1];
	__declspec(align(32)) VUINT32 sPoly[8][8][1];
	__declspec(align(32)) VUINT32 iBrkValue[8][1];
	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
	__declspec(align(32)) VUINT32 sBigThreshold[8][1];
	__declspec(align(32)) VUINT32 sC2[8][1];
	__declspec(align(32)) VUINT32 sC3[8][1];
	__declspec(align(32)) VUINT32 sHalf[8][1];
	__declspec(align(32)) VUINT32 sLargestFinite[8][1];
	__declspec(align(32)) VUINT32 sThirtyOne[8][1];
	__declspec(align(32)) VUINT32 sTopMask8[8][1];
	__declspec(align(32)) VUINT32 XScale[8][1];
	__declspec(align(32)) VUINT32 sLn2[8][1];
} __svml_sacosh_data_internal;
#endif
__svml_sacosh_data_internal:
	/* sOne = SP 1.0 */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sPoly[] = SP polynomial */
	.align	32
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	/* iBrkValue = SP 2/3 */
	.align	32
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align	32
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sBigThreshold = SP 2^30 */
	.align	32
	.long	0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
	/* sC2 = SP 3/8 */
	.align	32
	.long	0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
	/* sC3 = SP 5/16 */
	.align	32
	.long	0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
	/* sHalf = SP 1/2 */
	.align	32
	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
	/* sLargestFinite = SP FLT_MAX */
	.align	32
	.long	0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
	/* sThirtyOne = SP 31.0 */
	.align	32
	.long	0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
	/* sTopMask8 = keep top 16 bits (8 significant bits) */
	.align	32
	.long	0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
	/* XScale = SP 2^-30 */
	.align	32
	.long	0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000
	/* sLn2 = SP ln(2) */
	.align	32
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.align	32
	.type	__svml_sacosh_data_internal, @object
	.size	__svml_sacosh_data_internal, .-__svml_sacosh_data_internal
