/* Function asinh vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1)),
 *   using RSQRT instructions to start the
 *   square root approximation, and small table lookups for log
 *   that map to AVX-512 permute instructions.
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *   asinh(0)   = that 0
 *
 */
34 | |
/* Byte offsets into the data table __svml_dasinh_data_internal_avx512.
   Each field is one 64-byte zmm-wide vector of 8 doubles, except the two
   log lookup tables, which are 16 doubles (128 bytes) each.  */
#define Log_tbl_H	0
#define Log_tbl_L	128
#define One	256
#define AbsMask	320
#define SmallThreshold	384
#define Threshold	448
#define LargeThreshold	512
#define ca2	576
#define ca1	640
#define c4s	704
#define c3s	768
#define c2s	832
#define c1s	896
#define AddB5	960
#define RcpBitMask	1024
#define OneEighth	1088
#define Four	1152
#define poly_coeff9	1216
#define poly_coeff8	1280
#define poly_coeff7	1344
#define poly_coeff6	1408
#define poly_coeff5	1472
#define poly_coeff4	1536
#define poly_coeff3	1600
#define poly_coeff2	1664
#define poly_coeff1	1728
#define L2H	1792
#define L2L	1856

#include <sysdep.h>
67 | |
/* double _ZGVeN8v_asinh_skx(double x[8])  -- vector ABI:
   In:  zmm0 = 8 double-precision inputs.
   Out: zmm0 = asinh of each lane.
   Lanes flagged by the LargeThreshold compare (NaN/huge inputs) are
   reprocessed one at a time through the scalar asinh() fallback.  */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_asinh_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* 64-byte-align the stack and reserve 192 bytes: two zmm save
	   slots (input at 64(%rsp), result at 128(%rsp)) plus GPR spill
	   space at 0..16(%rsp) for the special-values path.  */
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovaps	%zmm0, %zmm3

	/* x^2 */
	vmulpd	{rn-sae}, %zmm3, %zmm3, %zmm14
	vmovups	One+__svml_dasinh_data_internal_avx512(%rip), %zmm9

	/* polynomial computation for small inputs */
	vmovups	ca2+__svml_dasinh_data_internal_avx512(%rip), %zmm10
	vmovups	ca1+__svml_dasinh_data_internal_avx512(%rip), %zmm11

	/* not a very small input ? */
	vmovups	SmallThreshold+__svml_dasinh_data_internal_avx512(%rip), %zmm0

	/* A=max(x^2, 1); */
	vmaxpd	{sae}, %zmm14, %zmm9, %zmm4

	/* B=min(x^2, 1); */
	vminpd	{sae}, %zmm14, %zmm9, %zmm5
	vfmadd231pd {rn-sae}, %zmm14, %zmm10, %zmm11

	/* 1+x^2 */
	vaddpd	{rn-sae}, %zmm9, %zmm14, %zmm8

	/* |input| */
	vandpd	AbsMask+__svml_dasinh_data_internal_avx512(%rip), %zmm3, %zmm1
	vrsqrt14pd %zmm8, %zmm6
	/* k2: lanes with |x| > SmallThreshold (predicate 21 = NLE)
	   take the full log path; others keep the small polynomial.  */
	vcmppd	$21, {sae}, %zmm0, %zmm1, %k2

	/* B_high */
	vsubpd	{rn-sae}, %zmm4, %zmm8, %zmm7

	/* sign bit */
	vxorpd	%zmm3, %zmm1, %zmm2
	vmulpd	{rn-sae}, %zmm14, %zmm11, %zmm4

	/* B_low */
	vsubpd	{rn-sae}, %zmm7, %zmm5, %zmm13
	vmovups	c2s+__svml_dasinh_data_internal_avx512(%rip), %zmm5
	vmovups	c1s+__svml_dasinh_data_internal_avx512(%rip), %zmm7

	/* polynomial computation for small inputs */
	vfmadd213pd {rn-sae}, %zmm1, %zmm1, %zmm4

	/* (x^2)_low */
	vmovaps	%zmm3, %zmm15
	vfmsub213pd {rn-sae}, %zmm14, %zmm3, %zmm15

	/* Sh ~sqrt(1+x^2) */
	vmulpd	{rn-sae}, %zmm6, %zmm8, %zmm14

	/* Yl = (x^2)_low + B_low */
	vaddpd	{rn-sae}, %zmm15, %zmm13, %zmm13

	/* very large inputs ? */
	vmovups	Threshold+__svml_dasinh_data_internal_avx512(%rip), %zmm15

	/* (Yh*R0)_low */
	vfmsub213pd {rn-sae}, %zmm14, %zmm6, %zmm8
	/* k1: lanes with |x| > Threshold use the very-large-input
	   fixups below (scale by 1/8, adjust exponent by 4 = log2(16)).  */
	vcmppd	$21, {sae}, %zmm15, %zmm1, %k1

	/* Sl = (Yh*R0)_low+(R0*Yl) */
	vfmadd213pd {rn-sae}, %zmm8, %zmm6, %zmm13
	vmovups	LargeThreshold+__svml_dasinh_data_internal_avx512(%rip), %zmm8

	/* rel. error term: Eh=1-Sh*R0 */
	vmovaps	%zmm9, %zmm12
	vfnmadd231pd {rn-sae}, %zmm14, %zmm6, %zmm12
	/* k0: special-value mask (predicate 22 = NLT, catches NaN and
	   |x| > LargeThreshold); nonzero sends us to the scalar loop.  */
	vcmppd	$22, {sae}, %zmm8, %zmm1, %k0

	/* rel. error term: Eh=(1-Sh*R0)-Sl*R0 */
	vfnmadd231pd {rn-sae}, %zmm13, %zmm6, %zmm12

	/*
	 * sqrt(1+x^2) ~ Sh + Sl + Sh*Eh*poly_s
	 * poly_s = c1+c2*Eh+c3*Eh^2
	 */
	vmovups	c4s+__svml_dasinh_data_internal_avx512(%rip), %zmm6
	vmovups	c3s+__svml_dasinh_data_internal_avx512(%rip), %zmm8

	/* Sh*Eh */
	vmulpd	{rn-sae}, %zmm12, %zmm14, %zmm11
	vfmadd231pd {rn-sae}, %zmm12, %zmm6, %zmm8

	/* Sh+x */
	vaddpd	{rn-sae}, %zmm1, %zmm14, %zmm6
	kmovw	%k0, %edx
	vfmadd213pd {rn-sae}, %zmm5, %zmm12, %zmm8
	vfmadd213pd {rn-sae}, %zmm7, %zmm12, %zmm8

	/* Xh */
	vsubpd	{rn-sae}, %zmm14, %zmm6, %zmm12

	/* Sl + Sh*Eh*poly_s */
	vfmadd213pd {rn-sae}, %zmm13, %zmm8, %zmm11

	/* fixup for very large inputs */
	vmovups	OneEighth+__svml_dasinh_data_internal_avx512(%rip), %zmm8

	/* Xl */
	vsubpd	{rn-sae}, %zmm12, %zmm1, %zmm12

	/* Xin0+Sl+Sh*Eh*poly_s ~ x+sqrt(1+x^2) */
	vaddpd	{rn-sae}, %zmm11, %zmm6, %zmm10

	/* Sl_high */
	vsubpd	{rn-sae}, %zmm6, %zmm10, %zmm5
	vmulpd	{rn-sae}, %zmm8, %zmm1, %zmm10{%k1}

	/* Table lookups */
	vmovups	__svml_dasinh_data_internal_avx512(%rip), %zmm6

	/* Sl_l */
	vsubpd	{rn-sae}, %zmm5, %zmm11, %zmm7
	vrcp14pd %zmm10, %zmm13

	/* Xin_low */
	vaddpd	{rn-sae}, %zmm12, %zmm7, %zmm14
	vmovups	Log_tbl_L+__svml_dasinh_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff6+__svml_dasinh_data_internal_avx512(%rip), %zmm12

	/* round reciprocal to 1+4b mantissas */
	vpaddq	AddB5+__svml_dasinh_data_internal_avx512(%rip), %zmm13, %zmm11

	/* fixup for very large inputs */
	vxorpd	%zmm14, %zmm14, %zmm14{%k1}
	vmovups	poly_coeff5+__svml_dasinh_data_internal_avx512(%rip), %zmm13
	vandpd	RcpBitMask+__svml_dasinh_data_internal_avx512(%rip), %zmm11, %zmm15
	vmovups	poly_coeff7+__svml_dasinh_data_internal_avx512(%rip), %zmm11

	/* Prepare table index */
	vpsrlq	$48, %zmm15, %zmm5

	/* reduced argument for log(): (Rcp*Xin-1)+Rcp*Xin_low */
	vfmsub231pd {rn-sae}, %zmm15, %zmm10, %zmm9

	/* exponents */
	vgetexppd {sae}, %zmm15, %zmm8
	vmovups	Four+__svml_dasinh_data_internal_avx512(%rip), %zmm10
	vpermt2pd Log_tbl_H+64+__svml_dasinh_data_internal_avx512(%rip), %zmm5, %zmm6
	vpermt2pd Log_tbl_L+64+__svml_dasinh_data_internal_avx512(%rip), %zmm5, %zmm7
	vsubpd	{rn-sae}, %zmm10, %zmm8, %zmm8{%k1}
	vfmadd231pd {rn-sae}, %zmm15, %zmm14, %zmm9

	/* polynomials */
	vmovups	poly_coeff9+__svml_dasinh_data_internal_avx512(%rip), %zmm10
	vmovups	poly_coeff8+__svml_dasinh_data_internal_avx512(%rip), %zmm5
	vmovups	poly_coeff4+__svml_dasinh_data_internal_avx512(%rip), %zmm14

	/* -K*L2H + Th */
	vmovups	L2H+__svml_dasinh_data_internal_avx512(%rip), %zmm15
	vfmadd231pd {rn-sae}, %zmm9, %zmm10, %zmm5

	/* -K*L2L + Tl */
	vmovups	L2L+__svml_dasinh_data_internal_avx512(%rip), %zmm10
	vfnmadd231pd {rn-sae}, %zmm8, %zmm15, %zmm6
	vfmadd213pd {rn-sae}, %zmm11, %zmm9, %zmm5
	vfnmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm8
	vmovups	poly_coeff3+__svml_dasinh_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff1+__svml_dasinh_data_internal_avx512(%rip), %zmm10

	/* R^2 */
	vmulpd	{rn-sae}, %zmm9, %zmm9, %zmm11
	/* Horner evaluation of the degree-9 log polynomial in R.  */
	vfmadd213pd {rn-sae}, %zmm12, %zmm9, %zmm5
	vfmadd213pd {rn-sae}, %zmm13, %zmm9, %zmm5
	vfmadd213pd {rn-sae}, %zmm14, %zmm9, %zmm5
	vfmadd213pd {rn-sae}, %zmm7, %zmm9, %zmm5
	vmovups	poly_coeff2+__svml_dasinh_data_internal_avx512(%rip), %zmm7
	vfmadd213pd {rn-sae}, %zmm7, %zmm9, %zmm5
	vfmadd213pd {rn-sae}, %zmm10, %zmm9, %zmm5

	/* Tl + R^2*Poly */
	vfmadd213pd {rn-sae}, %zmm8, %zmm11, %zmm5

	/* R+Tl + R^2*Poly */
	vaddpd	{rn-sae}, %zmm9, %zmm5, %zmm9
	/* Merge the log-path result over the small-input polynomial
	   result (zmm4) only in the k2 lanes, then restore the sign.  */
	vaddpd	{rn-sae}, %zmm9, %zmm6, %zmm4{%k2}
	vxorpd	%zmm2, %zmm4, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the original inputs and the vector results so the
	   scalar fallback can patch individual lanes in memory.  */
	vmovups	%zmm3, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* r13d = special-lane mask, r12d = current lane index.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	/* Reload the (lane-patched) results into zmm0.  */
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	asinh@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_asinh_skx)
356 | |
.section .rodata, "a"
	.align	64

#ifdef __svml_dasinh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl_H[16][2];
	__declspec(align(64)) VUINT32 Log_tbl_L[16][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 AbsMask[8][2];
	__declspec(align(64)) VUINT32 SmallThreshold[8][2];
	__declspec(align(64)) VUINT32 Threshold[8][2];
	__declspec(align(64)) VUINT32 LargeThreshold[8][2];
	__declspec(align(64)) VUINT32 ca2[8][2];
	__declspec(align(64)) VUINT32 ca1[8][2];
	__declspec(align(64)) VUINT32 c4s[8][2];
	__declspec(align(64)) VUINT32 c3s[8][2];
	__declspec(align(64)) VUINT32 c2s[8][2];
	__declspec(align(64)) VUINT32 c1s[8][2];
	__declspec(align(64)) VUINT32 AddB5[8][2];
	__declspec(align(64)) VUINT32 RcpBitMask[8][2];
	__declspec(align(64)) VUINT32 OneEighth[8][2];
	__declspec(align(64)) VUINT32 Four[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
} __svml_dasinh_data_internal_avx512;
#endif
__svml_dasinh_data_internal_avx512:
	/* Log_tbl_H */
	.quad	0x0000000000000000
	.quad	0xbfaf0a30c0120000
	.quad	0xbfbe27076e2b0000
	.quad	0xbfc5ff3070a78000
	.quad	0xbfcc8ff7c79a8000
	.quad	0xbfd1675cababc000
	.quad	0xbfd4618bc21c4000
	.quad	0xbfd739d7f6bbc000
	.quad	0xbfd9f323ecbf8000
	.quad	0xbfdc8ff7c79a8000
	.quad	0xbfdf128f5faf0000
	.quad	0xbfe0be72e4252000
	.quad	0xbfe1e85f5e704000
	.quad	0xbfe307d7334f2000
	.quad	0xbfe41d8fe8468000
	.quad	0xbfe52a2d265bc000
	/* Log_tbl_L */
	.align	64
	.quad	0x0000000000000000
	.quad	0x3d53ab33d066d1d2
	.quad	0x3d2a342c2af0003c
	.quad	0xbd43d3c873e20a07
	.quad	0xbd4a21ac25d81ef3
	.quad	0x3d59f1fc63382a8f
	.quad	0xbd5ec27d0b7b37b3
	.quad	0xbd50069ce24c53fb
	.quad	0xbd584bf2b68d766f
	.quad	0xbd5a21ac25d81ef3
	.quad	0xbd3bb2cd720ec44c
	.quad	0xbd55056d312f7668
	.quad	0xbd1a07bd8b34be7c
	.quad	0x3d5e83c094debc15
	.quad	0x3d5aa33736867a17
	.quad	0xbd46abb9df22bc57
	/* One */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* AbsMask */
	.align	64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* SmallThreshold */
	.align	64
	.quad	0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000, 0x3f70000000000000
	/* Threshold */
	.align	64
	.quad	0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000, 0x5fe0000000000000
	/* LargeThreshold */
	.align	64
	.quad	0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff, 0x7fefffffffffffff
	/* ca2 */
	.align	64
	.quad	0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7, 0x3fb333220eaf02e7
	/* ca1 */
	.align	64
	.quad	0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e, 0xbfc5555555521e7e
	/* c4s */
	.align	64
	.quad	0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612, 0x3fd1800001943612
	/* c3s */
	.align	64
	.quad	0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000, 0x3fd40000013b0000
	/* c2s */
	.align	64
	.quad	0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000, 0x3fd8000000000000
	/* c1s */
	.align	64
	.quad	0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
	/* AddB5 */
	.align	64
	.quad	0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
	/* RcpBitMask */
	.align	64
	.quad	0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
	/* OneEighth */
	.align	64
	.quad	0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000, 0x3fc0000000000000
	/* Four */
	.align	64
	.quad	0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000, 0x4010000000000000
	/* poly_coeff9 */
	.align	64
	.quad	0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368, 0xbfb9a9b040214368
	/* poly_coeff8 */
	.align	64
	.quad	0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778, 0x3fbc80666e249778
	/* poly_coeff7 */
	.align	64
	.quad	0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9, 0xbfbffffb8a054bc9
	/* poly_coeff6 */
	.align	64
	.quad	0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1, 0x3fc24922f71256f1
	/* poly_coeff5 */
	.align	64
	.quad	0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736, 0xbfc55555559ba736
	/* poly_coeff4 */
	.align	64
	.quad	0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af, 0x3fc9999999be77af
	/* poly_coeff3 */
	.align	64
	.quad	0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65, 0xbfcffffffffffc65
	/* poly_coeff2 */
	.align	64
	.quad	0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1, 0x3fd55555555554c1
	/* poly_coeff1 */
	.align	64
	.quad	0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000, 0xbfe0000000000000
	/* L2H = log(2)_high */
	.align	64
	.quad	0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
	/* L2L = log(2)_low */
	.align	64
	.quad	0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
	.align	64
	.type	__svml_dasinh_data_internal_avx512, @object
	.size	__svml_dasinh_data_internal_avx512, .-__svml_dasinh_data_internal_avx512
510 | |