/* Function acosf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
 *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
 *
 *
 */
29
/* Offsets for data table __svml_sacos_data_internal.

   Every entry of the table is one 64-byte row (16 x 32-bit words), so
   each offset below is the row index multiplied by 64.

   The sqrt_coeff_N and poly_coeff_N macros number the rows in storage
   order.  The per-row tags written next to the table data count in the
   opposite direction: the row at sqrt_coeff_1 is tagged "sqrt_coeff2"
   and the row at poly_coeff_1 is tagged "poly_coeff5".  */
#define SgnBit				0
#define OneHalf				64
#define SmallNorm			128
#define MOne				192
#define Two				256
#define sqrt_coeff_1			320
#define sqrt_coeff_2			384
#define poly_coeff_1			448
#define poly_coeff_2			512
#define poly_coeff_3			576
#define poly_coeff_4			640
#define poly_coeff_5			704
#define Pi2H				768
#define PiH				832

#include <sysdep.h>
48
/* _ZGVeN16v_acosf_skx

   In:   %zmm0 = 16 packed single-precision arguments.
   Out:  %zmm0 = 16 packed single-precision results.

   Frame: %rbp is pushed and used as frame pointer; %rsp is rounded down
   to a multiple of 64 and 192 bytes are reserved:
	  0(%rsp)  saved %r14   (slow path only)
	  8(%rsp)  saved %r13   (slow path only)
	 16(%rsp)  saved %r12   (slow path only)
	 64(%rsp)  copy of the 16 input lanes   (slow path only)
	128(%rsp)  copy of the 16 result lanes  (slow path only)

   Fast path: every lane is computed with the branch-free sequence
   below.  Slow path: if any lane sets the special-input mask (%k0,
   copied to %edx), each flagged lane is recomputed by calling the
   scalar acosf and patched into the saved result vector.

   vcmpps immediates used below: $17 = LT_OQ, $21 = NLT_UQ,
   $22 = NLE_UQ.  */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_acosf_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	/* zmm5 = SgnBit (table offset 0), zmm6 = 0.5.  */
	vmovups	__svml_sacos_data_internal(%rip), %zmm5
	vmovups	OneHalf+__svml_sacos_data_internal(%rip), %zmm6

	/* SQ ~ 2*sqrt(Y) */
	vmovups	SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
	vmovups	MOne+__svml_sacos_data_internal(%rip), %zmm8
	vmovups	Two+__svml_sacos_data_internal(%rip), %zmm12
	vmovups	sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
	/* zmm4 keeps the untouched argument for the whole function.  */
	vmovaps	%zmm0, %zmm4

	/* x = -|arg| (zmm3); zmm2 = sign bit of arg.  */
	vorps	%zmm4, %zmm5, %zmm3
	vandps	%zmm4, %zmm5, %zmm2
	vmovups	sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0

	/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| (zmm6).  */
	vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6

	/* x^2 */
	vmulps	{rn-sae}, %zmm3, %zmm3, %zmm7
	vrsqrt14ps %zmm6, %zmm10
	/* k1: Y < SmallNorm; the rsqrt estimate is zeroed in those lanes
	   by the masked vxorps below.  */
	vcmpps	$17, {sae}, %zmm9, %zmm6, %k1
	/* k0: NOT (-1 <= x), i.e. |arg| > 1 or arg is NaN.  These lanes
	   are redone by the scalar routine.  */
	vcmpps	$22, {sae}, %zmm3, %zmm8, %k0
	vmovups	poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
	/* R2 = min(x^2, Y) (zmm1): polynomial argument for both branches.  */
	vminps	{sae}, %zmm6, %zmm7, %zmm1
	vmovups	poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
	vxorps	%zmm10, %zmm10, %zmm10{%k1}
	vaddps	{rn-sae}, %zmm6, %zmm6, %zmm14
	vmulps	{rn-sae}, %zmm1, %zmm1, %zmm8
	vmulps	{rn-sae}, %zmm10, %zmm10, %zmm11
	/* zmm5 = 2*Y * rsqrt(Y): first estimate of SQ.  */
	vmulps	{rn-sae}, %zmm10, %zmm14, %zmm5
	/* k4: NOT (R2 < Y), i.e. Y was the minimum: the sqrt branch
	   (SelMask in the algorithm description).  */
	vcmpps	$21, {sae}, %zmm6, %zmm1, %k4

	/* k2: arg < R2.  R2 is not negative, and for arg in (0, 1]
	   arg^2 <= arg, so this holds exactly for negative arg.  */
	vcmpps	$17, {sae}, %zmm1, %zmm4, %k2

	/* polynomial */
	vmovups	poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
	/* zmm14 = rsqrt^2 * 2*Y - 2: residual of the rsqrt estimate.  */
	vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
	vmovups	poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
	vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
	vmovups	poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
	vmovups	Pi2H+__svml_sacos_data_internal(%rip), %zmm12
	vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
	vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
	vmulps	{rn-sae}, %zmm14, %zmm5, %zmm15
	vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
	/* Additive constant (zmm12): 0 on the sqrt branch, Pi2H elsewhere.  */
	vxorps	%zmm12, %zmm12, %zmm12{%k4}
	/* zmm0 = SQ corrected by the residual polynomial.  */
	vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
	kmovw	%k0, %edx
	vmulps	{rn-sae}, %zmm1, %zmm11, %zmm13
	/* R = sqrt branch ? SQ : x, then given the sign opposite to arg on
	   the x branch / equal to arg's sign bit on the SQ branch by the
	   xor with zmm2.  */
	vblendmps %zmm0, %zmm3, %zmm0{%k4}
	vxorps	%zmm2, %zmm0, %zmm1
	/* k3: sqrt branch and negative arg; those lanes get PiH.  */
	kandw	%k4, %k2, %k3
	/* zmm13 = R + R * (R2 * Poly(R2)).  */
	vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
	vorps	PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3}
	vaddps	{rn-sae}, %zmm13, %zmm12, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm4

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill inputs and the vector results so single lanes can be read
	   and overwritten through memory.  */
	vmovups	%zmm4, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear the upper vector state before calling scalar code.  */
	vzeroupper
	/* r12d = lane index (starts at 0), r13d = special-lane bit mask,
	   r14 = scratch index; all three are callee-saved and therefore
	   survive the acosf calls.  */
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the mask: set when this lane needs the scalar
	   routine.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All 16 lanes visited: restore the saved registers and reload the
	   patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* The code below is reached only by the jump from
	   L(RANGEMASK_CHECK), where r12-r14 are still saved on the stack,
	   so the save-slot expressions are stated again here.  */
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends the lane index into r14 so it can be
	   used as a 64-bit index register.  */
	movl	%r12d, %r14d
	/* Load input lane r14 from the spilled argument vector.  */
	vmovss	64(%rsp, %r14, 4), %xmm0
	call	acosf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Overwrite result lane r14 in the spilled result vector.  */
	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_acosf_skx)
218
/* Constant table read by _ZGVeN16v_acosf_skx.

   Layout: 14 rows of 64 bytes.  Each row holds one 32-bit pattern
   replicated 16 times, so a full-width vector load from
   <offset>+__svml_sacos_data_internal yields the constant in every
   lane.  Row order must match the offset macros at the top of the
   file (row index * 64).  */
	.section .rodata, "a"
	.align	64

#ifdef __svml_sacos_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 SgnBit[16][1];
	__declspec(align(64)) VUINT32 OneHalf[16][1];
	__declspec(align(64)) VUINT32 SmallNorm[16][1];
	__declspec(align(64)) VUINT32 MOne[16][1];
	__declspec(align(64)) VUINT32 Two[16][1];
	__declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
	__declspec(align(64)) VUINT32 poly_coeff[5][16][1];
	__declspec(align(64)) VUINT32 Pi2H[16][1];
	__declspec(align(64)) VUINT32 PiH[16][1];
} __svml_sacos_data_internal;
#endif

	/* Emit one table row: the 32-bit pattern BITS, 16 times
	   (16 * 4 = 64 bytes).  Defining the macro emits no data.  */
	.macro	sacos_row bits
	.rept	16
	.long	\bits
	.endr
	.endm

__svml_sacos_data_internal:
	/* SgnBit */
	sacos_row 0x80000000
	/* OneHalf */
	.align	64
	sacos_row 0x3f000000
	/* SmallNorm */
	.align	64
	sacos_row 0x2f800000
	/* MOne */
	.align	64
	sacos_row 0xbf800000
	/* Two */
	.align	64
	sacos_row 0x40000000
	/* sqrt_coeff[2] */
	.align	64
	sacos_row 0xbdC00004	/* sqrt_coeff2 */
	sacos_row 0x3e800001	/* sqrt_coeff1 */
	/* poly_coeff[5] */
	.align	64
	sacos_row 0x3d2EDC07	/* poly_coeff5 */
	sacos_row 0x3CC32A6B	/* poly_coeff4 */
	sacos_row 0x3d3A9AB4	/* poly_coeff3 */
	sacos_row 0x3d997C12	/* poly_coeff2 */
	sacos_row 0x3e2AAAFF	/* poly_coeff1 */
	/* Pi2H */
	.align	64
	sacos_row 0x3fc90FDB
	/* PiH */
	.align	64
	sacos_row 0x40490FDB
	.align	64
	.type	__svml_sacos_data_internal, @object
	.size	__svml_sacos_data_internal, .-__svml_sacos_data_internal
271

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S  */