/* Function asinf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
 *
 *
 */
28
/* Byte offsets of each 64-byte (one-ZMM) row in the data table
   __svml_sasin_data_internal defined at the bottom of this file.  */
#define AbsMask		0
#define OneHalf		64
#define SmallNorm	128
#define One		192
#define Two		256
#define sqrt_coeff_1	320
#define sqrt_coeff_2	384
#define poly_coeff_1	448
#define poly_coeff_2	512
#define poly_coeff_3	576
#define poly_coeff_4	640
#define poly_coeff_5	704
#define Pi2H		768

#include <sysdep.h>
46
47 .section .text.evex512, "ax", @progbits
48ENTRY(_ZGVeN16v_asinf_skx)
49 pushq %rbp
50 cfi_def_cfa_offset(16)
51 movq %rsp, %rbp
52 cfi_def_cfa(6, 16)
53 cfi_offset(6, -16)
54 andq $-64, %rsp
55 subq $192, %rsp
56 vmovups __svml_sasin_data_internal(%rip), %zmm4
57 vmovups OneHalf+__svml_sasin_data_internal(%rip), %zmm6
58
59 /* SQ ~ -2*sqrt(Y) */
60 vmovups SmallNorm+__svml_sasin_data_internal(%rip), %zmm8
61 vmovups Two+__svml_sasin_data_internal(%rip), %zmm12
62 vmovups sqrt_coeff_1+__svml_sasin_data_internal(%rip), %zmm13
63 vmovups One+__svml_sasin_data_internal(%rip), %zmm7
64 vmovaps %zmm0, %zmm3
65
66 /* x = |arg| */
67 vandps %zmm3, %zmm4, %zmm2
68 vandnps %zmm3, %zmm4, %zmm1
69
70 /* x^2 */
71 vmulps {rn-sae}, %zmm2, %zmm2, %zmm5
72 vcmpps $17, {sae}, %zmm2, %zmm7, %k0
73 vcmpps $21, {sae}, %zmm6, %zmm2, %k2
74 vmovups poly_coeff_2+__svml_sasin_data_internal(%rip), %zmm7
75 kmovw %k0, %edx
76
77 /* Y = 0.5 - 0.5*x */
78 vmovaps %zmm6, %zmm9
79 vfnmadd231ps {rn-sae}, %zmm2, %zmm6, %zmm9
80 vmovups poly_coeff_5+__svml_sasin_data_internal(%rip), %zmm6
81 vrsqrt14ps %zmm9, %zmm10
82 vcmpps $17, {sae}, %zmm8, %zmm9, %k1
83 vminps {sae}, %zmm9, %zmm5, %zmm0
84 vmovups sqrt_coeff_2+__svml_sasin_data_internal(%rip), %zmm8
85 vmovups poly_coeff_4+__svml_sasin_data_internal(%rip), %zmm5
86 vxorps %zmm10, %zmm10, %zmm10{%k1}
87 vaddps {rn-sae}, %zmm9, %zmm9, %zmm14
88 vmulps {rn-sae}, %zmm10, %zmm10, %zmm11
89 vmulps {rn-sae}, %zmm10, %zmm14, %zmm4
90 vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
91 vmulps {rn-sae}, %zmm14, %zmm4, %zmm15
92 vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm8
93 vmovups poly_coeff_3+__svml_sasin_data_internal(%rip), %zmm14
94
95 /* polynomial */
96 vmovups poly_coeff_1+__svml_sasin_data_internal(%rip), %zmm13
97 vfmsub213ps {rn-sae}, %zmm4, %zmm15, %zmm8
98 vfmadd231ps {rn-sae}, %zmm0, %zmm14, %zmm5
99 vfmadd231ps {rn-sae}, %zmm0, %zmm13, %zmm7
100 vmulps {rn-sae}, %zmm0, %zmm0, %zmm15
101 vblendmps %zmm8, %zmm2, %zmm2{%k2}
102 vfmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm7
103 vfmadd213ps {rn-sae}, %zmm6, %zmm0, %zmm7
104 vmulps {rn-sae}, %zmm0, %zmm7, %zmm9
105 vmovups Pi2H+__svml_sasin_data_internal(%rip), %zmm0
106 vfmadd213ps {rn-sae}, %zmm2, %zmm2, %zmm9
107 vaddps {rn-sae}, %zmm0, %zmm9, %zmm9{%k2}
108 vxorps %zmm1, %zmm9, %zmm0
109 testl %edx, %edx
110
111 /* Go to special inputs processing branch */
112 jne L(SPECIAL_VALUES_BRANCH)
113 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
114
115 /* Restore registers
116 * and exit the function
117 */
118
119L(EXIT):
120 movq %rbp, %rsp
121 popq %rbp
122 cfi_def_cfa(7, 8)
123 cfi_restore(6)
124 ret
125 cfi_def_cfa(6, 16)
126 cfi_offset(6, -16)
127
128 /* Branch to process
129 * special inputs
130 */
131
132L(SPECIAL_VALUES_BRANCH):
133 vmovups %zmm3, 64(%rsp)
134 vmovups %zmm0, 128(%rsp)
135 # LOE rbx r12 r13 r14 r15 edx zmm0
136
137 xorl %eax, %eax
138 # LOE rbx r12 r13 r14 r15 eax edx
139
140 vzeroupper
141 movq %r12, 16(%rsp)
142 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
143 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
144 movl %eax, %r12d
145 movq %r13, 8(%rsp)
146 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
147 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
148 movl %edx, %r13d
149 movq %r14, (%rsp)
150 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
151 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
152 # LOE rbx r15 r12d r13d
153
154 /* Range mask
155 * bits check
156 */
157
158L(RANGEMASK_CHECK):
159 btl %r12d, %r13d
160
161 /* Call scalar math function */
162 jc L(SCALAR_MATH_CALL)
163 # LOE rbx r15 r12d r13d
164
165 /* Special inputs
166 * processing loop
167 */
168
169L(SPECIAL_VALUES_LOOP):
170 incl %r12d
171 cmpl $16, %r12d
172
173 /* Check bits in range mask */
174 jl L(RANGEMASK_CHECK)
175 # LOE rbx r15 r12d r13d
176
177 movq 16(%rsp), %r12
178 cfi_restore(12)
179 movq 8(%rsp), %r13
180 cfi_restore(13)
181 movq (%rsp), %r14
182 cfi_restore(14)
183 vmovups 128(%rsp), %zmm0
184
185 /* Go to exit */
186 jmp L(EXIT)
187 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
188 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
189 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
190 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
191 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
192 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
193 # LOE rbx r12 r13 r14 r15 zmm0
194
195 /* Scalar math function call
196 * to process special input
197 */
198
199L(SCALAR_MATH_CALL):
200 movl %r12d, %r14d
201 vmovss 64(%rsp, %r14, 4), %xmm0
202 call asinf@PLT
203 # LOE rbx r14 r15 r12d r13d xmm0
204
205 vmovss %xmm0, 128(%rsp, %r14, 4)
206
207 /* Process special inputs in loop */
208 jmp L(SPECIAL_VALUES_LOOP)
209 # LOE rbx r15 r12d r13d
210END(_ZGVeN16v_asinf_skx)
211
212 .section .rodata, "a"
213 .align 64
214
215#ifdef __svml_sasin_data_internal_typedef
216typedef unsigned int VUINT32;
217typedef struct {
218 __declspec(align(64)) VUINT32 AbsMask[16][1];
219 __declspec(align(64)) VUINT32 OneHalf[16][1];
220 __declspec(align(64)) VUINT32 SmallNorm[16][1];
221 __declspec(align(64)) VUINT32 One[16][1];
222 __declspec(align(64)) VUINT32 Two[16][1];
223 __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
224 __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
225 __declspec(align(64)) VUINT32 Pi2H[16][1];
226} __svml_sasin_data_internal;
227#endif
228__svml_sasin_data_internal:
229 /* AbsMask */
230 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
231 /* OneHalf */
232 .align 64
233 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
234 /* SmallNorm */
235 .align 64
236 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
237 /* One */
238 .align 64
239 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
240 /* Two */
241 .align 64
242 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
243 /* sqrt_coeff[2] */
244 .align 64
245 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
246 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
247 /* poly_coeff[5] */
248 .align 64
249 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
250 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
251 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
252 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
253 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
254 /* Pi2H */
255 .align 64
256 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
257 .align 64
258 .type __svml_sasin_data_internal, @object
259 .size __svml_sasin_data_internal, .-__svml_sasin_data_internal
source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S