1/* Function acoshf vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute acosh(x) as log(x + sqrt(x*x - 1))
23 *
24 * Special cases:
25 *
26 * acosh(NaN) = quiet NaN, and raise invalid exception
27 * acosh(-INF) = NaN
28 * acosh(+INF) = +INF
29 * acosh(x) = NaN if x < 1
30 * acosh(1) = +0
31 *
32 */
33
34/* Offsets for data table __svml_sacosh_data_internal
35 */
36#define sOne 0
37#define sPoly 16
38#define iBrkValue 144
39#define iOffExpoMask 160
40#define sBigThreshold 176
41#define sC2 192
42#define sC3 208
43#define sHalf 224
44#define sLargestFinite 240
45#define sThirtyOne 256
46#define sTopMask8 272
47#define XScale 288
48#define sLn2 304
49
50#include <sysdep.h>
51
52 .section .text.sse4, "ax", @progbits
53ENTRY(_ZGVbN4v_acoshf_sse4)
54 subq $72, %rsp
55 cfi_def_cfa_offset(80)
56
57 /* Compute U = X - 1 and V = X + 1, naively first. */
58 movaps %xmm0, %xmm12
59
60 /* Load constants, always including One = 1 */
61 movups sOne+__svml_sacosh_data_internal(%rip), %xmm2
62
63 /*
64 * Check that 1 < X < +inf; otherwise go to the callout function.
65 * We need the callout for X = 1 to avoid division by zero below.
66 * This test ensures that callout handles NaN and either infinity.
67 */
68 movaps %xmm0, %xmm4
69 movaps %xmm2, %xmm9
70
71 /*
72 * Compute e = -(2 * d + d^2)
73 * The first FMR is exact, and the rounding error in the other is acceptable
74 * since d and e are ~ 2^-8
75 */
76 movaps %xmm2, %xmm10
77
78 /* Finally, express Y + W = U * V accurately where Y has <= 8 bits */
79 movups sTopMask8+__svml_sacosh_data_internal(%rip), %xmm5
80
81 /*
82 * Now we feed into the log1p code, using H in place of _VARG1 and
83 * also adding L into Xl.
84 * compute 1+x as high, low parts
85 */
86 movaps %xmm2, %xmm13
87 movaps %xmm5, %xmm11
88 movaps %xmm2, %xmm3
89
90 /*
91 * Now 1 / (1 + d)
92 * = 1 / (1 + (sqrt(1 - e) - 1))
93 * = 1 / sqrt(1 - e)
94 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
95 * So compute the first three nonconstant terms of that, so that
96 * we have a relative correction (1 + Corr) to apply to S etc.
97 * C1 = 1/2
98 * C2 = 3/8
99 * C3 = 5/16
100 */
101 movups sC3+__svml_sacosh_data_internal(%rip), %xmm8
102
103 /*
104 * The following computation can go wrong for very large X, e.g.
105 * the X^2 - 1 = U * V can overflow. But for large X we have
106 * acosh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
107 * we can just later stick X back into the log and tweak up the exponent.
108 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
109 * to stay in the safe range for the later log computation.
110 * Compute a flag now telling us when to do this.
111 */
112 movaps %xmm0, %xmm1
113 cmpnleps sLargestFinite+__svml_sacosh_data_internal(%rip), %xmm4
114 cmpltps sBigThreshold+__svml_sacosh_data_internal(%rip), %xmm1
115 cmpnltps %xmm0, %xmm3
116 subps %xmm2, %xmm12
117 addps %xmm0, %xmm9
118
119 /* For low-accuracy versions, naivety is harmless */
120 mulps %xmm12, %xmm9
121 orps %xmm3, %xmm4
122 movmskps %xmm4, %edx
123 andps %xmm9, %xmm11
124 movaps %xmm1, %xmm3
125
126 /*
127 * Compute R = 1/sqrt(Y + W) * (1 + d)
128 * Force R to <= 8 significant bits.
129 * This means that R * Y and R^2 * Y are exactly representable.
130 */
131 rsqrtps %xmm11, %xmm7
132 subps %xmm11, %xmm9
133 andps %xmm5, %xmm7
134 movaps %xmm2, %xmm4
135
136 /*
137 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
138 * and T = (W/sqrt(Y + W)) * (1 + d)
139 * so that S + T = sqrt(Y + W) * (1 + d)
140 * S is exact, and the rounding error in T is OK.
141 */
142 mulps %xmm7, %xmm11
143 movaps %xmm7, %xmm6
144 mulps %xmm7, %xmm9
145 mulps %xmm11, %xmm6
146 mulps %xmm9, %xmm7
147
148 /*
149 * For low-accuracy versions, the computation can be done
150 * just as U + ((S + T) + (S + T) * Corr)
151 */
152 addps %xmm9, %xmm11
153 subps %xmm6, %xmm10
154 movaps %xmm2, %xmm9
155 subps %xmm7, %xmm10
156 mulps %xmm10, %xmm8
157
158 /* Now multiplex to the case X = 2^-30 * input, Xl = 0 in the "big" case. */
159 movups XScale+__svml_sacosh_data_internal(%rip), %xmm14
160 mulps %xmm0, %xmm14
161 addps sC2+__svml_sacosh_data_internal(%rip), %xmm8
162 mulps %xmm10, %xmm8
163 andnps %xmm14, %xmm3
164
165 /*
166 * Now resume the main code.
167 * reduction: compute r, n
168 */
169 movdqu iBrkValue+__svml_sacosh_data_internal(%rip), %xmm14
170 movdqu iOffExpoMask+__svml_sacosh_data_internal(%rip), %xmm5
171
172 /* Add 31 to the exponent in the "large" case to get log(2 * input) */
173 movups sThirtyOne+__svml_sacosh_data_internal(%rip), %xmm6
174 addps sHalf+__svml_sacosh_data_internal(%rip), %xmm8
175 mulps %xmm8, %xmm10
176 movaps %xmm1, %xmm8
177 mulps %xmm11, %xmm10
178 addps %xmm10, %xmm11
179 addps %xmm11, %xmm12
180 maxps %xmm12, %xmm13
181 minps %xmm12, %xmm9
182 movaps %xmm13, %xmm15
183 addps %xmm9, %xmm15
184 subps %xmm15, %xmm13
185 andps %xmm1, %xmm15
186 orps %xmm15, %xmm3
187 addps %xmm13, %xmm9
188 psubd %xmm14, %xmm3
189 andps %xmm1, %xmm9
190 pand %xmm3, %xmm5
191 psrad $23, %xmm3
192 cvtdq2ps %xmm3, %xmm7
193 pslld $23, %xmm3
194 paddd %xmm14, %xmm5
195 psubd %xmm3, %xmm4
196
197 /* polynomial evaluation */
198 subps %xmm2, %xmm5
199 mulps %xmm4, %xmm9
200 addps %xmm7, %xmm6
201 movups sPoly+112+__svml_sacosh_data_internal(%rip), %xmm2
202 andnps %xmm6, %xmm8
203 andps %xmm1, %xmm7
204 addps %xmm5, %xmm9
205 mulps %xmm9, %xmm2
206 orps %xmm7, %xmm8
207
208 /* final reconstruction */
209 mulps sLn2+__svml_sacosh_data_internal(%rip), %xmm8
210 addps sPoly+96+__svml_sacosh_data_internal(%rip), %xmm2
211 mulps %xmm9, %xmm2
212 addps sPoly+80+__svml_sacosh_data_internal(%rip), %xmm2
213 mulps %xmm9, %xmm2
214 addps sPoly+64+__svml_sacosh_data_internal(%rip), %xmm2
215 mulps %xmm9, %xmm2
216 addps sPoly+48+__svml_sacosh_data_internal(%rip), %xmm2
217 mulps %xmm9, %xmm2
218 addps sPoly+32+__svml_sacosh_data_internal(%rip), %xmm2
219 mulps %xmm9, %xmm2
220 addps sPoly+16+__svml_sacosh_data_internal(%rip), %xmm2
221 mulps %xmm9, %xmm2
222 addps sPoly+__svml_sacosh_data_internal(%rip), %xmm2
223 mulps %xmm9, %xmm2
224 mulps %xmm9, %xmm2
225 addps %xmm2, %xmm9
226 addps %xmm8, %xmm9
227 testl %edx, %edx
228
229 /* Go to special inputs processing branch */
230 jne L(SPECIAL_VALUES_BRANCH)
231 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm9
232
233 /* Restore registers
234 * and exit the function
235 */
236
237L(EXIT):
238 movaps %xmm9, %xmm0
239 addq $72, %rsp
240 cfi_def_cfa_offset(8)
241 ret
242 cfi_def_cfa_offset(80)
243
244 /* Branch to process
245 * special inputs
246 */
247
248L(SPECIAL_VALUES_BRANCH):
249 movups %xmm0, 32(%rsp)
250 movups %xmm9, 48(%rsp)
251 # LOE rbx rbp r12 r13 r14 r15 edx
252
253 xorl %eax, %eax
254 movq %r12, 16(%rsp)
255 cfi_offset(12, -64)
256 movl %eax, %r12d
257 movq %r13, 8(%rsp)
258 cfi_offset(13, -72)
259 movl %edx, %r13d
260 movq %r14, (%rsp)
261 cfi_offset(14, -80)
262 # LOE rbx rbp r15 r12d r13d
263
264 /* Range mask
265 * bits check
266 */
267
268L(RANGEMASK_CHECK):
269 btl %r12d, %r13d
270
271 /* Call scalar math function */
272 jc L(SCALAR_MATH_CALL)
273 # LOE rbx rbp r15 r12d r13d
274
275 /* Special inputs
276 * processing loop
277 */
278
279L(SPECIAL_VALUES_LOOP):
280 incl %r12d
281 cmpl $4, %r12d
282
283 /* Check bits in range mask */
284 jl L(RANGEMASK_CHECK)
285 # LOE rbx rbp r15 r12d r13d
286
287 movq 16(%rsp), %r12
288 cfi_restore(12)
289 movq 8(%rsp), %r13
290 cfi_restore(13)
291 movq (%rsp), %r14
292 cfi_restore(14)
293 movups 48(%rsp), %xmm9
294
295 /* Go to exit */
296 jmp L(EXIT)
297 cfi_offset(12, -64)
298 cfi_offset(13, -72)
299 cfi_offset(14, -80)
300 # LOE rbx rbp r12 r13 r14 r15 xmm9
301
302 /* Scalar math function call
303 * to process special input
304 */
305
306L(SCALAR_MATH_CALL):
307 movl %r12d, %r14d
308 movss 32(%rsp, %r14, 4), %xmm0
309 call acoshf@PLT
310 # LOE rbx rbp r14 r15 r12d r13d xmm0
311
312 movss %xmm0, 48(%rsp, %r14, 4)
313
314 /* Process special inputs in loop */
315 jmp L(SPECIAL_VALUES_LOOP)
316 # LOE rbx rbp r15 r12d r13d
317END(_ZGVbN4v_acoshf_sse4)
318
319 .section .rodata, "a"
320 .align 16
321
322#ifdef __svml_sacosh_data_internal_typedef
323typedef unsigned int VUINT32;
324typedef struct {
325 __declspec(align(16)) VUINT32 sOne[4][1];
326 __declspec(align(16)) VUINT32 sPoly[8][4][1];
327 __declspec(align(16)) VUINT32 iBrkValue[4][1];
328 __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
329 __declspec(align(16)) VUINT32 sBigThreshold[4][1];
330 __declspec(align(16)) VUINT32 sC2[4][1];
331 __declspec(align(16)) VUINT32 sC3[4][1];
332 __declspec(align(16)) VUINT32 sHalf[4][1];
333 __declspec(align(16)) VUINT32 sLargestFinite[4][1];
334 __declspec(align(16)) VUINT32 sThirtyOne[4][1];
335 __declspec(align(16)) VUINT32 sTopMask8[4][1];
336 __declspec(align(16)) VUINT32 XScale[4][1];
337 __declspec(align(16)) VUINT32 sLn2[4][1];
338} __svml_sacosh_data_internal;
339#endif
340__svml_sacosh_data_internal:
341 /* sOne = SP 1.0 */
342 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
343 /* sPoly[] = SP polynomial */
344 .align 16
345 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
346 .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
347 .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
348 .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
349 .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
350 .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
351 .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
352 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
353 /* iBrkValue = SP 2/3 */
354 .align 16
355 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
356 /* iOffExpoMask = SP significand mask */
357 .align 16
358 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
359 /* sBigThreshold */
360 .align 16
361 .long 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
362 /* sC2 */
363 .align 16
364 .long 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
365 /* sC3 */
366 .align 16
367 .long 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
368 /* sHalf */
369 .align 16
370 .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
371 /* sLargestFinite */
372 .align 16
373 .long 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
374 /* sThirtyOne */
375 .align 16
376 .long 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
377 /* sTopMask8 */
378 .align 16
379 .long 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
380 /* XScale */
381 .align 16
382 .long 0x30800000, 0x30800000, 0x30800000, 0x30800000
383 /* sLn2 = SP ln(2) */
384 .align 16
385 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
386 .align 16
387 .type __svml_sacosh_data_internal, @object
388 .size __svml_sacosh_data_internal, .-__svml_sacosh_data_internal
389

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf4_core_sse4.S