/* Function asinf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
 *
 *
 */
28
29/* Offsets for data table __svml_sasin_data_internal
30 */
31#define AbsMask 0
32#define OneHalf 32
33#define SmallNorm 64
34#define One 96
35#define Two 128
36#define sqrt_coeff 160
37#define poly_coeff 224
38#define Pi2H 384
39
40#include <sysdep.h>
41
42 .section .text.avx2, "ax", @progbits
43ENTRY(_ZGVdN8v_asinf_avx2)
44 pushq %rbp
45 cfi_def_cfa_offset(16)
46 movq %rsp, %rbp
47 cfi_def_cfa(6, 16)
48 cfi_offset(6, -16)
49 andq $-32, %rsp
50 subq $96, %rsp
51 vmovups __svml_sasin_data_internal(%rip), %ymm5
52 vmovups OneHalf+__svml_sasin_data_internal(%rip), %ymm9
53 vmovups One+__svml_sasin_data_internal(%rip), %ymm6
54 vmovaps %ymm0, %ymm4
55
56 /* x = |arg| */
57 vandps %ymm4, %ymm5, %ymm3
58
59 /* Y = 0.5 - 0.5*x */
60 vmovaps %ymm9, %ymm12
61 vfnmadd231ps %ymm3, %ymm9, %ymm12
62
63 /* x^2 */
64 vmulps %ymm3, %ymm3, %ymm7
65 vcmplt_oqps %ymm3, %ymm6, %ymm8
66
67 /* SQ ~ -2*sqrt(Y) */
68 vcmplt_oqps SmallNorm+__svml_sasin_data_internal(%rip), %ymm12, %ymm10
69 vminps %ymm12, %ymm7, %ymm1
70 vaddps %ymm12, %ymm12, %ymm15
71 vcmpnlt_uqps %ymm9, %ymm3, %ymm0
72 vrsqrtps %ymm12, %ymm11
73 vmovups poly_coeff+64+__svml_sasin_data_internal(%rip), %ymm7
74 vmulps %ymm1, %ymm1, %ymm6
75 vmovups sqrt_coeff+__svml_sasin_data_internal(%rip), %ymm9
76 vfmadd213ps poly_coeff+96+__svml_sasin_data_internal(%rip), %ymm1, %ymm7
77 vmovmskps %ymm8, %edx
78
79 /* polynomial */
80 vmovups poly_coeff+__svml_sasin_data_internal(%rip), %ymm8
81 vandnps %ymm11, %ymm10, %ymm13
82 vmulps %ymm13, %ymm13, %ymm14
83 vfmadd213ps poly_coeff+32+__svml_sasin_data_internal(%rip), %ymm1, %ymm8
84 vandnps %ymm4, %ymm5, %ymm2
85 vmulps %ymm15, %ymm13, %ymm5
86 vfmsub213ps Two+__svml_sasin_data_internal(%rip), %ymm14, %ymm15
87 vfmadd213ps %ymm7, %ymm6, %ymm8
88 vfmadd213ps sqrt_coeff+32+__svml_sasin_data_internal(%rip), %ymm15, %ymm9
89 vmulps %ymm15, %ymm5, %ymm15
90 vfmadd213ps poly_coeff+128+__svml_sasin_data_internal(%rip), %ymm1, %ymm8
91 vfmsub213ps %ymm5, %ymm15, %ymm9
92 vmulps %ymm8, %ymm1, %ymm1
93 vblendvps %ymm0, %ymm9, %ymm3, %ymm3
94 vfmadd213ps %ymm3, %ymm3, %ymm1
95 vandps Pi2H+__svml_sasin_data_internal(%rip), %ymm0, %ymm0
96 vaddps %ymm1, %ymm0, %ymm10
97 vxorps %ymm2, %ymm10, %ymm0
98 testl %edx, %edx
99
100 /* Go to special inputs processing branch */
101 jne L(SPECIAL_VALUES_BRANCH)
102 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm4
103
104 /* Restore registers
105 * and exit the function
106 */
107
108L(EXIT):
109 movq %rbp, %rsp
110 popq %rbp
111 cfi_def_cfa(7, 8)
112 cfi_restore(6)
113 ret
114 cfi_def_cfa(6, 16)
115 cfi_offset(6, -16)
116
117 /* Branch to process
118 * special inputs
119 */
120
121L(SPECIAL_VALUES_BRANCH):
122 vmovups %ymm4, 32(%rsp)
123 vmovups %ymm0, 64(%rsp)
124 # LOE rbx r12 r13 r14 r15 edx ymm0
125
126 xorl %eax, %eax
127 # LOE rbx r12 r13 r14 r15 eax edx
128
129 vzeroupper
130 movq %r12, 16(%rsp)
131 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
132 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
133 movl %eax, %r12d
134 movq %r13, 8(%rsp)
135 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
136 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
137 movl %edx, %r13d
138 movq %r14, (%rsp)
139 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
140 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
141 # LOE rbx r15 r12d r13d
142
143 /* Range mask
144 * bits check
145 */
146
147L(RANGEMASK_CHECK):
148 btl %r12d, %r13d
149
150 /* Call scalar math function */
151 jc L(SCALAR_MATH_CALL)
152 # LOE rbx r15 r12d r13d
153
154 /* Special inputs
155 * processing loop
156 */
157
158L(SPECIAL_VALUES_LOOP):
159 incl %r12d
160 cmpl $8, %r12d
161
162 /* Check bits in range mask */
163 jl L(RANGEMASK_CHECK)
164 # LOE rbx r15 r12d r13d
165
166 movq 16(%rsp), %r12
167 cfi_restore(12)
168 movq 8(%rsp), %r13
169 cfi_restore(13)
170 movq (%rsp), %r14
171 cfi_restore(14)
172 vmovups 64(%rsp), %ymm0
173
174 /* Go to exit */
175 jmp L(EXIT)
176 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
177 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
178 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
179 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
180 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
181 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
182 # LOE rbx r12 r13 r14 r15 ymm0
183
184 /* Scalar math function call
185 * to process special input
186 */
187
188L(SCALAR_MATH_CALL):
189 movl %r12d, %r14d
190 vmovss 32(%rsp, %r14, 4), %xmm0
191 call asinf@PLT
192 # LOE rbx r14 r15 r12d r13d xmm0
193
194 vmovss %xmm0, 64(%rsp, %r14, 4)
195
196 /* Process special inputs in loop */
197 jmp L(SPECIAL_VALUES_LOOP)
198 # LOE rbx r15 r12d r13d
199END(_ZGVdN8v_asinf_avx2)
200
201 .section .rodata, "a"
202 .align 32
203
204#ifdef __svml_sasin_data_internal_typedef
205typedef unsigned int VUINT32;
206typedef struct {
207 __declspec(align(32)) VUINT32 AbsMask[8][1];
208 __declspec(align(32)) VUINT32 OneHalf[8][1];
209 __declspec(align(32)) VUINT32 SmallNorm[8][1];
210 __declspec(align(32)) VUINT32 One[8][1];
211 __declspec(align(32)) VUINT32 Two[8][1];
212 __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1];
213 __declspec(align(32)) VUINT32 poly_coeff[5][8][1];
214 __declspec(align(32)) VUINT32 Pi2H[8][1];
215} __svml_sasin_data_internal;
216#endif
217__svml_sasin_data_internal:
218 /* AbsMask */
219 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
220 /* OneHalf */
221 .align 32
222 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
223 /* SmallNorm */
224 .align 32
225 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
226 /* One */
227 .align 32
228 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
229 /* Two */
230 .align 32
231 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
232 /* sqrt_coeff[2] */
233 .align 32
234 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
235 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
236 /* poly_coeff[5] */
237 .align 32
238 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
239 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
240 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
241 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
242 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
243 /* Pi2H */
244 .align 32
245 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
246 .align 32
247 .type __svml_sasin_data_internal, @object
248 .size __svml_sasin_data_internal, .-__svml_sasin_data_internal
249

/* Source: glibc sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S.  */