1/* Function asin vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
25 *
26 */
27
28/* Offsets for data table __svml_dasin_data_internal
   Each value is a byte offset from the start of the table.  Consecutive
   entries are 64 bytes apart, which matches the eight .quad values stored
   per entry (one copy of the constant for each lane of a zmm register).
   sqrt_coeff_1 and poly_coeff_1 address the FIRST row of their arrays in
   memory; in the table those rows carry the trailing comments sqrt_coeff4
   and poly_coeff12, i.e. the macro index and the row comment count in
   opposite directions.
29 */
30#define AbsMask 0
31#define OneHalf 64
32#define SmallNorm 128
33#define One 192
34#define Two 256
35#define sqrt_coeff_1 320
36#define sqrt_coeff_2 384
37#define sqrt_coeff_3 448
38#define sqrt_coeff_4 512
39#define poly_coeff_1 576
40#define poly_coeff_2 640
41#define poly_coeff_3 704
42#define poly_coeff_4 768
43#define poly_coeff_5 832
44#define poly_coeff_6 896
45#define poly_coeff_7 960
46#define poly_coeff_8 1024
47#define poly_coeff_9 1088
48#define poly_coeff_10 1152
49#define poly_coeff_11 1216
50#define poly_coeff_12 1280
51#define Pi2H 1344
52
53#include <sysdep.h>
54
/* _ZGVeN8v_asin_skx: asin of the eight packed doubles in %zmm0; the
   eight results are returned in %zmm0.

   Main path (all lanes, branch-free).  Notation used in the comments:
     a  = |x|
     Y  = 0.5 - 0.5*a
     r  = vrsqrt14pd approximation of 1/sqrt(Y), forced to 0 where
          Y < SmallNorm
     Sh = 2*Y*r
     E  = 2*Y*r*r - 2
     S  = Sh*E*Q(E) - Sh, with Q built from sqrt_coeff_1..4
     t  = min(a*a, Y)
     u  = t*t
     P  = polynomial in t built from poly_coeff_1..12
     R  = S in lanes where !(a < 0.5), a in the other lanes
   result = (R + R*t*P [+ Pi2H in the !(a < 0.5) lanes]) ^ signbit(x)

   Fallback path: every lane for which 1.0 < a compares true is
   recomputed by calling the scalar asin and its result overwrites
   that lane (see SPECIAL_VALUES_BRANCH).

   Stack frame after the prologue (%rsp is 64-byte aligned, 192 bytes):
       0(%rsp)  saved %r14
       8(%rsp)  saved %r13
      16(%rsp)  saved %r12
      64(%rsp)  copy of the eight input doubles
     128(%rsp)  the eight result doubles
   The save slots and the two vector spills are only written on the
   fallback path.  */
55 .section .text.evex512, "ax", @progbits
56ENTRY(_ZGVeN8v_asin_skx)
 /* Prologue: %rbp keeps the incoming stack pointer so that the epilogue
    can undo the 64-byte alignment of %rsp.  */
57 pushq %rbp
58 cfi_def_cfa_offset(16)
59 movq %rsp, %rbp
60 cfi_def_cfa(6, 16)
61 cfi_offset(6, -16)
62 andq $-64, %rsp
63 subq $192, %rsp
64 vmovups OneHalf+__svml_dasin_data_internal(%rip), %zmm8 /* zmm8 = 0.5 */
65
66 /* S ~ -2*sqrt(Y) */
67 vmovups SmallNorm+__svml_dasin_data_internal(%rip), %zmm10 /* zmm10 = SmallNorm */
68 vmovups Two+__svml_dasin_data_internal(%rip), %zmm14 /* zmm14 = 2.0 */
69 vmovups sqrt_coeff_1+__svml_dasin_data_internal(%rip), %zmm15
70 vmovups sqrt_coeff_2+__svml_dasin_data_internal(%rip), %zmm2
71 vmovups sqrt_coeff_3+__svml_dasin_data_internal(%rip), %zmm1
72 vmovups One+__svml_dasin_data_internal(%rip), %zmm9 /* zmm9 = 1.0 */
73 vmovaps %zmm0, %zmm6 /* zmm6 = x; kept for the sign and for the fallback path */
74
75 /* x = |arg| */
 /* AbsMask sits at table offset 0, hence no offset macro in the operand.  */
76 vandpd __svml_dasin_data_internal(%rip), %zmm6, %zmm4 /* zmm4 = a */
77
78 /* Y = 0.5 - 0.5*x */
79 vmovaps %zmm8, %zmm11
80 vfnmadd231pd {rn-sae}, %zmm4, %zmm8, %zmm11 /* zmm11 = Y = 0.5 - 0.5*a */
81
82 /* x^2 */
83 vmulpd {rn-sae}, %zmm4, %zmm4, %zmm7 /* zmm7 = a*a */
84 vrsqrt14pd %zmm11, %zmm12 /* zmm12 = r ~ 1/sqrt(Y) */
85 vcmppd $17, {sae}, %zmm10, %zmm11, %k1 /* k1 = (Y < SmallNorm) */
86 vcmppd $21, {sae}, %zmm8, %zmm4, %k2 /* k2 = !(a < 0.5): the SelMask of the algorithm description */
87 vcmppd $17, {sae}, %zmm4, %zmm9, %k0 /* k0 = (1.0 < a): lanes for the scalar fallback */
88 vmovups poly_coeff_5+__svml_dasin_data_internal(%rip), %zmm10 /* zmm10 reused: SmallNorm is no longer needed */
89
90 /* polynomial */
 /* From here on the sqrt refinement chain (zmm0/zmm2/zmm7/zmm13/zmm14)
    and the polynomial chain (zmm1/zmm3/zmm7..zmm12) are interleaved.  */
91 vmovups poly_coeff_1+__svml_dasin_data_internal(%rip), %zmm8
92 vmovups poly_coeff_3+__svml_dasin_data_internal(%rip), %zmm9
93 vminpd {sae}, %zmm11, %zmm7, %zmm3 /* zmm3 = t = min(a*a, Y) */
 /* NOTE(review): presumably r is zeroed so that a huge or infinite
    vrsqrt14pd result for tiny Y cannot produce inf*0 below -- confirm.  */
94 vxorpd %zmm12, %zmm12, %zmm12{%k1} /* r = 0 in the k1 lanes */
95 vaddpd {rn-sae}, %zmm11, %zmm11, %zmm0 /* zmm0 = 2*Y */
96 vxorpd %zmm6, %zmm4, %zmm5 /* zmm5 = x ^ a = sign bit of x */
97 vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13 /* zmm13 = r*r */
98 vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7 /* zmm7 = Sh = 2*Y*r */
99 vmovups poly_coeff_7+__svml_dasin_data_internal(%rip), %zmm11
100 vmovups poly_coeff_4+__svml_dasin_data_internal(%rip), %zmm12
101 vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0 /* zmm0 = E = 2*Y*r*r - 2 */
102 vmovups sqrt_coeff_4+__svml_dasin_data_internal(%rip), %zmm13
103 vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12 /* zmm12 = poly_coeff_4 + t*poly_coeff_3 */
104 vmovups poly_coeff_11+__svml_dasin_data_internal(%rip), %zmm9
105 vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2 /* zmm2 = sqrt_coeff_2 + E*sqrt_coeff_1 */
106 vmovups poly_coeff_9+__svml_dasin_data_internal(%rip), %zmm15
107 vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14 /* zmm14 = Sh*E */
108 vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2 /* zmm2 = zmm2*E + sqrt_coeff_3 */
109 vmovups poly_coeff_2+__svml_dasin_data_internal(%rip), %zmm1
110 kmovw %k0, %edx /* edx = bit mask of fallback lanes */
111 vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2 /* zmm2 = Q(E) = zmm2*E + sqrt_coeff_4 */
112 vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1 /* zmm1 = poly_coeff_2 + t*poly_coeff_1 */
113 vmovups poly_coeff_10+__svml_dasin_data_internal(%rip), %zmm8
114 vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0 /* zmm0 = u = t*t (E is dead from here) */
115 vfmsub213pd {rn-sae}, %zmm7, %zmm14, %zmm2 /* zmm2 = S = Sh*E*Q(E) - Sh */
116 vmovups poly_coeff_6+__svml_dasin_data_internal(%rip), %zmm7
117 vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8 /* zmm8 = poly_coeff_10 + t*poly_coeff_9 */
118 vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1 /* zmm1 = zmm1*u + zmm12 */
119 vblendmpd %zmm2, %zmm4, %zmm2{%k2} /* zmm2 = R = k2 ? S : a */
120 vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7 /* zmm7 = poly_coeff_6 + t*poly_coeff_5 */
121 vmovups poly_coeff_8+__svml_dasin_data_internal(%rip), %zmm10
122 vmovups Pi2H+__svml_dasin_data_internal(%rip), %zmm4 /* zmm4 = Pi2H (a is dead from here) */
123 vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 /* zmm10 = poly_coeff_8 + t*poly_coeff_7 */
124 vmovups poly_coeff_12+__svml_dasin_data_internal(%rip), %zmm11
125 vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7 /* zmm7 = zmm7*u + zmm10 */
126 vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11 /* zmm11 = poly_coeff_12 + t*poly_coeff_11 */
127 vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10 /* zmm10 = u*u */
128 vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1 /* zmm1 = zmm1*(u*u) + zmm7 */
129 vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1 /* zmm1 = zmm1*u + zmm8 */
130 vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1 /* zmm1 = P = zmm1*u + zmm11 */
131 vmulpd {rn-sae}, %zmm3, %zmm1, %zmm3 /* zmm3 = t*P */
132 vfmadd213pd {rn-sae}, %zmm2, %zmm2, %zmm3 /* zmm3 = R + R*t*P */
133 vaddpd {rn-sae}, %zmm4, %zmm3, %zmm3{%k2} /* add Pi2H in the k2 lanes only */
134 vxorpd %zmm5, %zmm3, %zmm0 /* zmm0 = result with the sign bit of x applied */
135 testl %edx, %edx /* any lane with 1.0 < a ? */
136
137 /* Go to special inputs processing branch */
138 jne L(SPECIAL_VALUES_BRANCH)
139 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
140
141 /* Restore registers
142 * and exit the function
143 */
144
145L(EXIT):
146 movq %rbp, %rsp
147 popq %rbp
148 cfi_def_cfa(7, 8)
149 cfi_restore(6)
150 ret
 /* The code below is reached only by jumps taken while the frame is
    still set up, so the unwind state is switched back to the
    rbp-based CFA.  */
151 cfi_def_cfa(6, 16)
152 cfi_offset(6, -16)
153
154 /* Branch to process
155 * special inputs
156 */
157
158L(SPECIAL_VALUES_BRANCH):
 /* Spill the original arguments and the vector results so single
    lanes can be read and patched through memory.  */
159 vmovups %zmm6, 64(%rsp)
160 vmovups %zmm0, 128(%rsp)
161 # LOE rbx r12 r13 r14 r15 edx zmm0
162
163 xorl %eax, %eax /* eax = 0: first lane index */
164 # LOE rbx r12 r13 r14 r15 eax edx
165
166 vzeroupper
 /* r12d = current lane index, r13d = mask of lanes to redo, r14 = lane
    index used for addressing.  Their previous values are saved in the
    frame first, and the loop state lives in them so that it survives the
    call to asin.  */
167 movq %r12, 16(%rsp)
168 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
169 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
170 movl %eax, %r12d
171 movq %r13, 8(%rsp)
172 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
173 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
174 movl %edx, %r13d
175 movq %r14, (%rsp)
176 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
177 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
178 # LOE rbx r15 r12d r13d
179
180 /* Range mask
181 * bits check
182 */
183
184L(RANGEMASK_CHECK):
185 btl %r12d, %r13d /* CF = mask bit of the current lane */
186
187 /* Call scalar math function */
188 jc L(SCALAR_MATH_CALL)
189 # LOE rbx r15 r12d r13d
190
191 /* Special inputs
192 * processing loop
193 */
194
195L(SPECIAL_VALUES_LOOP):
196 incl %r12d
197 cmpl $8, %r12d /* eight lanes per vector */
198
199 /* Check bits in range mask */
200 jl L(RANGEMASK_CHECK)
201 # LOE rbx r15 r12d r13d
202
 /* All lanes visited: restore the saved registers and reload the
    patched result vector before taking the common exit.  */
203 movq 16(%rsp), %r12
204 cfi_restore(12)
205 movq 8(%rsp), %r13
206 cfi_restore(13)
207 movq (%rsp), %r14
208 cfi_restore(14)
209 vmovups 128(%rsp), %zmm0
210
211 /* Go to exit */
212 jmp L(EXIT)
 /* SCALAR_MATH_CALL below runs with r12/r13/r14 still saved in the
    frame, so their save-slot expressions are declared again here.  */
213 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
214 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
215 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
216 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
217 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
218 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
219 # LOE rbx r12 r13 r14 r15 zmm0
220
221 /* Scalar math function call
222 * to process special input
223 */
224
225L(SCALAR_MATH_CALL):
226 movl %r12d, %r14d /* 32-bit move zero-extends the lane index into r14 */
227 vmovsd 64(%rsp, %r14, 8), %xmm0 /* xmm0 = original argument of this lane */
228 call asin@PLT
229 # LOE rbx r14 r15 r12d r13d xmm0
230
231 vmovsd %xmm0, 128(%rsp, %r14, 8) /* overwrite this lane of the result vector */
232
233 /* Process special inputs in loop */
234 jmp L(SPECIAL_VALUES_LOOP)
235 # LOE rbx r15 r12d r13d
236END(_ZGVeN8v_asin_skx)
237
/* Constant table for _ZGVeN8v_asin_skx.
   Every entry is one constant repeated in eight consecutive quadwords
   (64 bytes, one copy per lane), and the table is 64-byte aligned.  The
   entries are addressed through the byte-offset macros defined near the
   top of this file (AbsMask = 0 ... Pi2H = 1344), so the order and size
   of the rows below must stay in step with those macros.
   Inside the two coefficient arrays the trailing row comments
   (sqrt_coeffN, poly_coeffN) count in the opposite direction to the
   offset macros: the first row in memory is the one the *_1 macro
   addresses.  Each row comment names the macro that loads it.  */
238 .section .rodata, "a"
239 .align 64
240
 /* C-style description of the layout below.  It is only seen by the
    preprocessor when __svml_dasin_data_internal_typedef is defined;
    nothing in the visible source defines it.  */
241#ifdef __svml_dasin_data_internal_typedef
242typedef unsigned int VUINT32;
243typedef struct {
244 __declspec(align(64)) VUINT32 AbsMask[8][2];
245 __declspec(align(64)) VUINT32 OneHalf[8][2];
246 __declspec(align(64)) VUINT32 SmallNorm[8][2];
247 __declspec(align(64)) VUINT32 One[8][2];
248 __declspec(align(64)) VUINT32 Two[8][2];
249 __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
250 __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
251 __declspec(align(64)) VUINT32 Pi2H[8][2];
252} __svml_dasin_data_internal;
253#endif
254__svml_dasin_data_internal:
255 /* AbsMask */
 /* All bits set except the sign bit: ANDing with it yields |x|.  */
256 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
257 /* OneHalf */
 /* 0.5 */
258 .align 64
259 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
260 /* SmallNorm */
 /* 2^-255; compared against Y = 0.5 - 0.5*|x| in the function.  */
261 .align 64
262 .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
263 /* One */
 /* 1.0 */
264 .align 64
265 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
266 /* Two */
 /* 2.0 */
267 .align 64
268 .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
269 /* sqrt_coeff[4] */
270 .align 64
271 .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4; loaded via offset macro sqrt_coeff_1 */
272 .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3; loaded via offset macro sqrt_coeff_2 */
273 .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2; loaded via offset macro sqrt_coeff_3 */
274 .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1; loaded via offset macro sqrt_coeff_4 */
275 /* poly_coeff[12] */
276 .align 64
277 .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12; loaded via offset macro poly_coeff_1 */
278 .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11; loaded via offset macro poly_coeff_2 */
279 .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10; loaded via offset macro poly_coeff_3 */
280 .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9; loaded via offset macro poly_coeff_4 */
281 .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8; loaded via offset macro poly_coeff_5 */
282 .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7; loaded via offset macro poly_coeff_6 */
283 .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6; loaded via offset macro poly_coeff_7 */
284 .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5; loaded via offset macro poly_coeff_8 */
285 .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4; loaded via offset macro poly_coeff_9 */
286 .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3; loaded via offset macro poly_coeff_10 */
287 .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2; loaded via offset macro poly_coeff_11 */
288 .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1; loaded via offset macro poly_coeff_12 */
289 /* Pi2H */
 /* Added to the result in the lanes selected by the |x| >= 0.5 mask.  */
290 .align 64
291 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
292 .align 64
293 .type __svml_dasin_data_internal, @object
294 .size __svml_dasin_data_internal, .-__svml_dasin_data_internal
295

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S