1 | /* Function acosh vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * Compute acosh(x) as log(x + sqrt(x*x - 1)) |
23 | * using RSQRT instructions for starting the |
24 | * square root approximation, and small table lookups for log |
25 | * that map to AVX-512 permute instructions |
26 | * |
27 | * Special cases: |
28 | * |
29 | * acosh(NaN) = quiet NaN, and raise invalid exception |
30 | * acosh(-INF) = NaN |
31 | * acosh(+INF) = +INF |
32 | * acosh(x) = NaN if x < 1 |
33 | * acosh(1) = +0 |
34 | * |
35 | */ |
36 | |
/* Offsets for data table __svml_dacosh_data_internal_avx512
   All offsets are in bytes from the start of the table.  The two log
   lookup tables (Log_tbl_H/Log_tbl_L) are 16 doubles (128 bytes) each;
   every other entry is a single double broadcast across one 64-byte
   zmm-sized slot, so consecutive offsets differ by 64.  */
#define Log_tbl_H 0
#define Log_tbl_L 128
#define One 256
#define SmallThreshold 320
#define Threshold 384
#define LargeThreshold 448
#define ca2 512
#define ca1 576
#define c4s 640
#define c3s 704
#define c2s 768
#define c1s 832
#define AddB5 896
#define RcpBitMask 960
#define OneEighth 1024
#define Four 1088
#define poly_coeff9 1152
#define poly_coeff8 1216
#define poly_coeff7 1280
#define poly_coeff6 1344
#define poly_coeff5 1408
#define poly_coeff4 1472
#define poly_coeff3 1536
#define poly_coeff2 1600
#define poly_coeff1 1664
#define L2H 1728
#define L2L 1792
66 | |
67 | #include <sysdep.h> |
68 | |
69 | .section .text.evex512, "ax" , @progbits |
ENTRY(_ZGVeN8v_acosh_skx)
	/* Build a 64-byte-aligned frame with 192 bytes of scratch:
	   64(%rsp) holds the saved input vector and 128(%rsp) the
	   result vector for the scalar special-case fallback loop.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	One+__svml_dacosh_data_internal_avx512(%rip), %zmm5

	/* polynomial computation for small inputs */
	vmovups	ca2+__svml_dacosh_data_internal_avx512(%rip), %zmm13
	vmovups	ca1+__svml_dacosh_data_internal_avx512(%rip), %zmm14

	/*
	 * sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s
	 * poly_s = c1+c2*Eh+c3*Eh^2
	 */
	vmovups	c4s+__svml_dacosh_data_internal_avx512(%rip), %zmm1
	vmovups	c2s+__svml_dacosh_data_internal_avx512(%rip), %zmm2
	vmovups	c1s+__svml_dacosh_data_internal_avx512(%rip), %zmm6

	/* very large inputs ? */
	vmovups	Threshold+__svml_dacosh_data_internal_avx512(%rip), %zmm15

	/* out of range inputs? */
	vmovups	LargeThreshold+__svml_dacosh_data_internal_avx512(%rip), %zmm3

	/* not a very small input ? */
	vmovups	SmallThreshold+__svml_dacosh_data_internal_avx512(%rip), %zmm10
	vmovaps	%zmm0, %zmm12

	/* x^2 - 1 */
	vmovaps	%zmm5, %zmm11
	vfmsub231pd {rn-sae}, %zmm12, %zmm12, %zmm11

	/* k2: x >= Threshold (very large); k0: x > LargeThreshold or
	   unordered; k1: x < 1 (domain error) -- predicates 21/22/18.  */
	vcmppd	$21, {sae}, %zmm15, %zmm12, %k2
	vcmppd	$22, {sae}, %zmm3, %zmm12, %k0
	vcmppd	$18, {sae}, %zmm5, %zmm12, %k1
	vrsqrt14pd %zmm11, %zmm4
	vcmppd	$21, {sae}, %zmm10, %zmm11, %k3
	vfmadd231pd {rn-sae}, %zmm11, %zmm13, %zmm14
	vmovups	c3s+__svml_dacosh_data_internal_avx512(%rip), %zmm13

	/* Sh ~sqrt(-1+x^2) */
	vmulpd	{rn-sae}, %zmm4, %zmm11, %zmm9
	vmulpd	{rn-sae}, %zmm11, %zmm14, %zmm8

	/* Sh+x */
	vaddpd	{rn-sae}, %zmm12, %zmm9, %zmm15

	/* Shh */
	vsubpd	{rn-sae}, %zmm12, %zmm15, %zmm14

	/* (Yh*R0)_low */
	vmovaps	%zmm11, %zmm0

	/* Merge domain-error and out-of-range lanes into k0; this is
	   the special-value mask tested before the branch below.  */
	korw	%k0, %k1, %k0

	/* rel. error term: Eh=1-Sh*R0 */
	vmovaps	%zmm5, %zmm7
	vfmsub213pd {rn-sae}, %zmm9, %zmm4, %zmm0
	vfnmadd231pd {rn-sae}, %zmm9, %zmm4, %zmm7

	/* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
	vfnmadd231pd {rn-sae}, %zmm0, %zmm4, %zmm7

	/* Shl */
	vsubpd	{rn-sae}, %zmm14, %zmm9, %zmm4
	vmovups	poly_coeff7+__svml_dacosh_data_internal_avx512(%rip), %zmm14
	vfmadd231pd {rn-sae}, %zmm7, %zmm1, %zmm13
	vfmadd213pd {rn-sae}, %zmm2, %zmm7, %zmm13
	vfmadd213pd {rn-sae}, %zmm6, %zmm7, %zmm13

	/* Sh*Eh */
	vmulpd	{rn-sae}, %zmm7, %zmm9, %zmm7

	/* Sl + Sh*Eh*poly_s */
	vfmadd213pd {rn-sae}, %zmm0, %zmm13, %zmm7

	/* polynomials */
	vmovups	poly_coeff9+__svml_dacosh_data_internal_avx512(%rip), %zmm13

	/* polynomial computation for small inputs */
	vaddpd	{rn-sae}, %zmm7, %zmm9, %zmm0

	/* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
	vaddpd	{rn-sae}, %zmm7, %zmm15, %zmm6
	vfmadd231pd {rn-sae}, %zmm0, %zmm8, %zmm0

	/* fixup for very large inputs */
	vmovups	OneEighth+__svml_dacosh_data_internal_avx512(%rip), %zmm8

	/* Sl_high */
	vsubpd	{rn-sae}, %zmm15, %zmm6, %zmm9
	vmovups	poly_coeff6+__svml_dacosh_data_internal_avx512(%rip), %zmm15
	vmulpd	{rn-sae}, %zmm8, %zmm12, %zmm6{%k2}

	/* Sl_l */
	vsubpd	{rn-sae}, %zmm9, %zmm7, %zmm3
	vrcp14pd %zmm6, %zmm1

	/* Xin_low */
	vaddpd	{rn-sae}, %zmm4, %zmm3, %zmm7

	/* Table lookups */
	vmovups	__svml_dacosh_data_internal_avx512(%rip), %zmm3

	/* round reciprocal to 1+4b mantissas */
	vpaddq	AddB5+__svml_dacosh_data_internal_avx512(%rip), %zmm1, %zmm2

	/* fixup for very large inputs */
	vxorpd	%zmm7, %zmm7, %zmm7{%k2}
	vmovups	poly_coeff8+__svml_dacosh_data_internal_avx512(%rip), %zmm1
	vandpd	RcpBitMask+__svml_dacosh_data_internal_avx512(%rip), %zmm2, %zmm8
	vmovups	Log_tbl_L+__svml_dacosh_data_internal_avx512(%rip), %zmm2

	/* Prepare table index */
	vpsrlq	$48, %zmm8, %zmm9

	/* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
	vfmsub231pd {rn-sae}, %zmm8, %zmm6, %zmm5

	/* exponents */
	vgetexppd {sae}, %zmm8, %zmm4
	vmovups	Four+__svml_dacosh_data_internal_avx512(%rip), %zmm6
	vpermt2pd Log_tbl_H+64+__svml_dacosh_data_internal_avx512(%rip), %zmm9, %zmm3
	vpermt2pd Log_tbl_L+64+__svml_dacosh_data_internal_avx512(%rip), %zmm9, %zmm2
	vsubpd	{rn-sae}, %zmm6, %zmm4, %zmm4{%k2}
	vfmadd231pd {rn-sae}, %zmm8, %zmm7, %zmm5
	vmovups	poly_coeff5+__svml_dacosh_data_internal_avx512(%rip), %zmm6
	vmovups	poly_coeff4+__svml_dacosh_data_internal_avx512(%rip), %zmm7

	/* -K*L2H + Th */
	vmovups	L2H+__svml_dacosh_data_internal_avx512(%rip), %zmm8

	/* -K*L2L + Tl */
	vmovups	L2L+__svml_dacosh_data_internal_avx512(%rip), %zmm9
	vfmadd231pd {rn-sae}, %zmm5, %zmm13, %zmm1
	vmovups	poly_coeff2+__svml_dacosh_data_internal_avx512(%rip), %zmm13
	vfnmadd231pd {rn-sae}, %zmm4, %zmm8, %zmm3
	vfnmadd213pd {rn-sae}, %zmm2, %zmm9, %zmm4
	vfmadd213pd {rn-sae}, %zmm14, %zmm5, %zmm1
	vmovups	poly_coeff3+__svml_dacosh_data_internal_avx512(%rip), %zmm2
	vmovups	poly_coeff1+__svml_dacosh_data_internal_avx512(%rip), %zmm14
	vfmadd213pd {rn-sae}, %zmm15, %zmm5, %zmm1

	/* R^2 */
	vmulpd	{rn-sae}, %zmm5, %zmm5, %zmm15
	vfmadd213pd {rn-sae}, %zmm6, %zmm5, %zmm1
	vfmadd213pd {rn-sae}, %zmm7, %zmm5, %zmm1
	vfmadd213pd {rn-sae}, %zmm2, %zmm5, %zmm1
	vfmadd213pd {rn-sae}, %zmm13, %zmm5, %zmm1
	vfmadd213pd {rn-sae}, %zmm14, %zmm5, %zmm1

	/* Tl + R^2*Poly */
	vfmadd213pd {rn-sae}, %zmm4, %zmm15, %zmm1

	/* R+Tl + R^2*Poly */
	vaddpd	{rn-sae}, %zmm5, %zmm1, %zmm5
	vaddpd	{rn-sae}, %zmm5, %zmm3, %zmm0{%k3}

	/* Test the special-value mask: kortestw sets ZF iff k0 is
	   all-zero, so jne branches exactly when at least one lane
	   needs scalar processing.  (Without this test, jne would
	   consume stale flags from the prologue's subq.)  */
	kortestw %k0, %k0

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 k0 zmm0 zmm12

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill input (zmm12) and fast-path result (zmm0) so the
	   scalar loop can patch individual lanes in memory.  */
	vmovups	%zmm12, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 k0 zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax k0

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	kmovd	%k0, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* r12d = current lane index, r13d = special-lane bitmask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	acosh@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_acosh_skx)
330 | |
	.section .rodata, "a"
	.align	64

#ifdef __svml_dacosh_data_internal_avx512_typedef
/* Reference-only C view of the table layout below; never compiled
   (the guard macro is not defined in this translation unit).  Field
   order and sizes must match the byte-offset #defines at the top of
   the file.  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl_H[16][2];
	__declspec(align(64)) VUINT32 Log_tbl_L[16][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 SmallThreshold[8][2];
	__declspec(align(64)) VUINT32 Threshold[8][2];
	__declspec(align(64)) VUINT32 LargeThreshold[8][2];
	__declspec(align(64)) VUINT32 ca2[8][2];
	__declspec(align(64)) VUINT32 ca1[8][2];
	__declspec(align(64)) VUINT32 c4s[8][2];
	__declspec(align(64)) VUINT32 c3s[8][2];
	__declspec(align(64)) VUINT32 c2s[8][2];
	__declspec(align(64)) VUINT32 c1s[8][2];
	__declspec(align(64)) VUINT32 AddB5[8][2];
	__declspec(align(64)) VUINT32 RcpBitMask[8][2];
	__declspec(align(64)) VUINT32 OneEighth[8][2];
	__declspec(align(64)) VUINT32 Four[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
} __svml_dacosh_data_internal_avx512;
#endif
/* IEEE-754 double-precision bit patterns.  Each broadcast constant
   occupies one 64-byte slot; Log_tbl_H/L are 16-entry lookup tables
   indexed via vpermt2pd.  */
__svml_dacosh_data_internal_avx512:
	/* Log_tbl_H */
	.quad	0x0000000000000000
	.quad	0xbfaf0a30c0120000
	.quad	0xbfbe27076e2b0000
	.quad	0xbfc5ff3070a78000
	.quad	0xbfcc8ff7c79a8000
	.quad	0xbfd1675cababc000
	.quad	0xbfd4618bc21c4000
	.quad	0xbfd739d7f6bbc000
	.quad	0xbfd9f323ecbf8000
	.quad	0xbfdc8ff7c79a8000
	.quad	0xbfdf128f5faf0000
	.quad	0xbfe0be72e4252000
	.quad	0xbfe1e85f5e704000
	.quad	0xbfe307d7334f2000
	.quad	0xbfe41d8fe8468000
	.quad	0xbfe52a2d265bc000
	/* Log_tbl_L */
	.align	64
	.quad	0x0000000000000000
	.quad	0x3d53ab33d066d1d2
	.quad	0x3d2a342c2af0003c
	.quad	0xbd43d3c873e20a07
	.quad	0xbd4a21ac25d81ef3
	.quad	0x3d59f1fc63382a8f
	.quad	0xbd5ec27d0b7b37b3
	.quad	0xbd50069ce24c53fb
	.quad	0xbd584bf2b68d766f
	.quad	0xbd5a21ac25d81ef3
	.quad	0xbd3bb2cd720ec44c
	.quad	0xbd55056d312f7668
	.quad	0xbd1a07bd8b34be7c
	.quad	0x3d5e83c094debc15
	.quad	0x3d5aa33736867a17
	.quad	0xbd46abb9df22bc57
	/* One */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* SmallThreshold */
	.align	64
	.quad	0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000, 0x3ef0000000000000
	/* Threshold */
	.align	64
	.quad	0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000
	/* LargeThreshold */
	.align	64
	.quad	0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff
	/* ca2 */
	.align	64
	.quad	0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7
	/* ca1 */
	.align	64
	.quad	0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e
	/* c4s */
	.align	64
	.quad	0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612
	/* c3s */
	.align	64
	.quad	0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000
	/* c2s */
	.align	64
	.quad	0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000
	/* c1s */
	.align	64
	.quad	0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
	/* AddB5 */
	.align	64
	.quad	0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
	/* RcpBitMask */
	.align	64
	.quad	0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
	/* OneEighth */
	.align	64
	.quad	0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000
	/* Four */
	.align	64
	.quad	0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000
	/* poly_coeff9 */
	.align	64
	.quad	0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368
	/* poly_coeff8 */
	.align	64
	.quad	0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778
	/* poly_coeff7 */
	.align	64
	.quad	0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9
	/* poly_coeff6 */
	.align	64
	.quad	0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1
	/* poly_coeff5 */
	.align	64
	.quad	0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736
	/* poly_coeff4 */
	.align	64
	.quad	0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af
	/* poly_coeff3 */
	.align	64
	.quad	0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65
	/* poly_coeff2 */
	.align	64
	.quad	0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1
	/* poly_coeff1 */
	.align	64
	.quad	0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000
	/* L2H = log(2)_high */
	.align	64
	.quad	0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
	/* L2L = log(2)_low */
	.align	64
	.quad	0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
	.align	64
	.type	__svml_dacosh_data_internal_avx512, @object
	.size	__svml_dacosh_data_internal_avx512, .-__svml_dacosh_data_internal_avx512
480 | |