1/* Function acosf vectorized with AVX2.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
25 * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
26 *
27 *
28 */
29
30/* Byte offsets into the data table __svml_sacos_data_internal.
   Every table row is one 32-byte vector (8 x 32-bit lanes, each row
   aligned to 32 bytes), so single-row entries are 32 bytes apart.
   sqrt_coeff occupies 2 rows (160..223) and poly_coeff occupies
   5 rows (224..383); individual rows are addressed in the code as
   sqrt_coeff+32, poly_coeff+32, poly_coeff+64, ...
 */
32#define SgnBit 0
33#define OneHalf 32
34#define SmallNorm 64
35#define MOne 96
36#define Two 128
37#define sqrt_coeff 160
38#define poly_coeff 224
39#define Pi2H 384
40#define PiH 416
41
42#include <sysdep.h>
43
/* _ZGVdN8v_acosf_avx2: acosf over the 8 single-precision lanes of a
   256-bit vector.

   In:   %ymm0 = 8 packed float arguments.
   Out:  %ymm0 = 8 packed float results.

   The fast path is straight-line AVX2/FMA code.  Lanes for which
   !(-|x| >= -1.0) holds (i.e. |x| > 1 or NaN) are collected in a bit
   mask; if any bit is set, those lanes are recomputed one by one
   with the scalar acosf.

   Stack frame (after %rsp is aligned to 32 and 96 bytes are reserved):
     0(%rsp)   saved %r14        (special-values path only)
     8(%rsp)   saved %r13        (special-values path only)
     16(%rsp)  saved %r12        (special-values path only)
     32(%rsp)  copy of the 8 input lanes
     64(%rsp)  copy of the 8 result lanes
 */
44 .section .text.avx2, "ax", @progbits
45ENTRY(_ZGVdN8v_acosf_avx2)
 /* Frame-pointer prologue; %rbp keeps the unaligned %rsp so the
    epilogue can undo the 32-byte realignment below.  */
46 pushq %rbp
47 cfi_def_cfa_offset(16)
48 movq %rsp, %rbp
49 cfi_def_cfa(6, 16)
50 cfi_offset(6, -16)
51 andq $-32, %rsp
52 subq $96, %rsp
53
54 /*
55 * 2*sqrt(X) ~ Sh - Sl (to 24+ bits)
56 * SQ ~ 2*sqrt(X)
57 */
 /* ymm6 = SgnBit (table offset 0), ymm7 = 0.5, ymm5 = original x
    (kept live for the sign tests and the special-values path).  */
58 vmovups __svml_sacos_data_internal(%rip), %ymm6
59 vmovups OneHalf+__svml_sacos_data_internal(%rip), %ymm7
60 vmovaps %ymm0, %ymm5
61
62 /* x = -|arg| */
63 vorps %ymm5, %ymm6, %ymm4
64
65 /* Y = 0.5 + 0.5*(-x) */
 /* ymm7 = ymm7*ymm4 + ymm7 = 0.5 - 0.5*|arg|.  */
66 vfmadd231ps %ymm4, %ymm7, %ymm7
67
68 /* x^2 */
69 vmulps %ymm4, %ymm4, %ymm8
70
71 /* SQ ~ 2*sqrt(Y) */
 /* ymm0  = sqrt_coeff2.
    ymm9  = !(-|arg| >= -1.0): true for |arg| > 1 or NaN (special lanes).
    ymm10 = Y < SmallNorm; used below to zero the rsqrt estimate.
    ymm2  = min(x^2, Y): the squared polynomial argument R^2.
    ymm14 = 2*Y, ymm11 = rsqrt(Y) estimate.
    ymm1  = !(min(x^2, Y) < Y): the SelMask lanes that take the
            sqrt path.
    ymm7 is then reused for R^4.  */
72 vmovups sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
73 vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9
74 vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10
75 vminps %ymm7, %ymm8, %ymm2
76 vaddps %ymm7, %ymm7, %ymm14
77 vrsqrtps %ymm7, %ymm11
78 vmovups poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8
79 vcmpnlt_uqps %ymm7, %ymm2, %ymm1
80 vmulps %ymm2, %ymm2, %ymm7
 /* ymm8 = poly_coeff3*R^2 + poly_coeff2.  */
81 vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8
 /* edx = one bit per lane that needs the scalar fallback.  */
82 vmovmskps %ymm9, %edx
83
84 /* polynomial */
 /* ymm12 = rsqrt estimate, forced to 0 where Y < SmallNorm;
    ymm13 = ymm12^2; ymm9 = poly_coeff5*R^2 + poly_coeff4.  */
85 vmovups poly_coeff+__svml_sacos_data_internal(%rip), %ymm9
86 vandnps %ymm11, %ymm10, %ymm12
87 vmulps %ymm12, %ymm12, %ymm13
88 vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
89
90 /* X<X^2 iff X<0 */
 /* ymm10 = (arg < R^2): negative-argument mask.
    ymm9  = R^4*ymm9 + ymm8, then ymm9 = R^2*ymm9 + poly_coeff1.
    ymm3  = sign bit of arg (ymm6 still holds SgnBit here and is
            overwritten on the next instruction).
    ymm6  = Sh = 2*Y*rsqrt; ymm14 = E = 2*Y*rsqrt^2 - 2.
    ymm0  = E*sqrt_coeff2 + sqrt_coeff1, then SQ = Sh - (E*Sh)*ymm0.
    ymm14 is finally reused for R^2 * Poly(R^2).  */
91 vcmplt_oqps %ymm2, %ymm5, %ymm10
92 vfmadd213ps %ymm8, %ymm7, %ymm9
93 vandps %ymm5, %ymm6, %ymm3
94 vmulps %ymm14, %ymm12, %ymm6
95 vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14
96 vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
97 vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0
98 vmulps %ymm14, %ymm6, %ymm15
99 vmulps %ymm9, %ymm2, %ymm14
100 vfnmadd213ps %ymm6, %ymm15, %ymm0
 /* ymm0  = SelMask ? SQ : -|arg|.
    ymm2  = SelMask ? PiH : 0;  ymm12 = SelMask ? 0 : Pi2H.
    ymm1  = ymm0 with the sign of arg XORed in.
    ymm14 = ymm1 + ymm1*(R^2*Poly).
    ymm11 = PiH only where SelMask and arg < 0.
    Result = ymm14 + (ymm12 + ymm11).  */
101 vblendvps %ymm1, %ymm0, %ymm4, %ymm0
102 vandps PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
103 vandnps Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12
104 vxorps %ymm3, %ymm0, %ymm1
105 vfmadd213ps %ymm1, %ymm1, %ymm14
106 vandps %ymm10, %ymm2, %ymm11
107 vaddps %ymm12, %ymm11, %ymm13
108 vaddps %ymm14, %ymm13, %ymm0
 /* Any special lane recorded in edx?  */
109 testl %edx, %edx
110
111 /* Go to special inputs processing branch */
112 jne L(SPECIAL_VALUES_BRANCH)
113 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm5
114
115 /* Restore registers
116 * and exit the function
117 */
118
119L(EXIT):
 /* Drop the aligned frame by restoring the %rsp saved in %rbp.  */
120 movq %rbp, %rsp
121 popq %rbp
122 cfi_def_cfa(7, 8)
123 cfi_restore(6)
124 ret
 /* Re-establish the frame-based CFI for the code after the ret,
    which still runs with %rbp as frame pointer.  */
125 cfi_def_cfa(6, 16)
126 cfi_offset(6, -16)
127
128 /* Branch to process
129 * special inputs
130 */
131
132L(SPECIAL_VALUES_BRANCH):
 /* Spill the inputs and the fast-path results so single lanes can
    be read and patched through memory.  */
133 vmovups %ymm5, 32(%rsp)
134 vmovups %ymm0, 64(%rsp)
135 # LOE rbx r12 r13 r14 r15 edx ymm0
136
 /* eax = 0: initial lane index.  */
137 xorl %eax, %eax
138 # LOE rbx r12 r13 r14 r15 eax edx
139
 /* Clear the upper YMM halves before scalar code is called.
    r12/r13/r14 are saved because the loop below keeps the lane
    index (r12d), the lane mask (r13d) and the address index (r14)
    in them across the calls to acosf.  */
140 vzeroupper
141 movq %r12, 16(%rsp)
142 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
143 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
144 movl %eax, %r12d
145 movq %r13, 8(%rsp)
146 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
147 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
148 movl %edx, %r13d
149 movq %r14, (%rsp)
150 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
151 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
152 # LOE rbx r15 r12d r13d
153
154 /* Range mask
155 * bits check
156 */
157
158L(RANGEMASK_CHECK):
 /* CF = bit r12d of the lane mask: set if this lane is special.  */
159 btl %r12d, %r13d
160
161 /* Call scalar math function */
162 jc L(SCALAR_MATH_CALL)
163 # LOE rbx r15 r12d r13d
164
165 /* Special inputs
166 * processing loop
167 */
168
169L(SPECIAL_VALUES_LOOP):
 /* Advance to the next lane; stop after lane 7.  */
170 incl %r12d
171 cmpl $8, %r12d
172
173 /* Check bits in range mask */
174 jl L(RANGEMASK_CHECK)
175 # LOE rbx r15 r12d r13d
176
 /* All lanes handled: restore the callee-saved registers and
    reload the (patched) result vector.  */
177 movq 16(%rsp), %r12
178 cfi_restore(12)
179 movq 8(%rsp), %r13
180 cfi_restore(13)
181 movq (%rsp), %r14
182 cfi_restore(14)
183 vmovups 64(%rsp), %ymm0
184
185 /* Go to exit */
186 jmp L(EXIT)
187 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
188 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
189 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
190 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
191 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
192 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
193 # LOE rbx r12 r13 r14 r15 ymm0
194
195 /* Scalar math function call
196 * to process special input
197 */
198
199L(SCALAR_MATH_CALL):
 /* r14 = zero-extended lane index.  Load input lane r14 from the
    spilled input vector, call scalar acosf, and overwrite the same
    lane of the spilled result vector.  */
200 movl %r12d, %r14d
201 vmovss 32(%rsp, %r14, 4), %xmm0
202 call acosf@PLT
203 # LOE rbx r14 r15 r12d r13d xmm0
204
205 vmovss %xmm0, 64(%rsp, %r14, 4)
206
207 /* Process special inputs in loop */
208 jmp L(SPECIAL_VALUES_LOOP)
209 # LOE rbx r15 r12d r13d
210END(_ZGVdN8v_acosf_avx2)
211
/* Read-only constant table for the vector acosf code.  Every row is
   one value replicated across 8 x 32-bit lanes (one 256-bit vector)
   and is aligned to 32 bytes.  Values are IEEE-754 single-precision
   bit patterns.  The row order must match the byte offsets #defined
   at the top of this file.  */
212 .section .rodata, "a"
213 .align 32
214
 /* C-style description of the table layout.  It is only seen when
    __svml_sacos_data_internal_typedef is defined; nothing in this
    file defines it.  */
215#ifdef __svml_sacos_data_internal_typedef
216typedef unsigned int VUINT32;
217typedef struct {
218 __declspec(align(32)) VUINT32 SgnBit[8][1];
219 __declspec(align(32)) VUINT32 OneHalf[8][1];
220 __declspec(align(32)) VUINT32 SmallNorm[8][1];
221 __declspec(align(32)) VUINT32 MOne[8][1];
222 __declspec(align(32)) VUINT32 Two[8][1];
223 __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1];
224 __declspec(align(32)) VUINT32 poly_coeff[5][8][1];
225 __declspec(align(32)) VUINT32 Pi2H[8][1];
226 __declspec(align(32)) VUINT32 PiH[8][1];
227} __svml_sacos_data_internal;
228#endif
229__svml_sacos_data_internal:
230 /* SgnBit */
 /* Mask with only the float sign bit set.  */
231 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
232 /* OneHalf */
 /* 0.5f */
233 .align 32
234 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
235 /* SmallNorm */
 /* 2^-32; Y values below this get a zeroed rsqrt estimate.  */
236 .align 32
237 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
238 /* MOne */
 /* -1.0f; compared against -|x| to detect out-of-range/NaN lanes.  */
239 .align 32
240 .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
241 /* Two */
 /* 2.0f */
242 .align 32
243 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
244 /* sqrt_coeff[2] */
 /* Coefficients of the correction applied to the rsqrt-based
    2*sqrt(Y) estimate; row 0 is sqrt_coeff2, row 1 is sqrt_coeff1.  */
245 .align 32
246 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
247 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
248 /* poly_coeff[5] */
 /* Polynomial coefficients, stored highest order first
    (poly_coeff5 at row 0 down to poly_coeff1 at row 4).  */
249 .align 32
250 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
251 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
252 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
253 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
254 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
255 /* Pi2H */
 /* pi/2 rounded to float.  */
256 .align 32
257 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
258 /* PiH */
 /* pi rounded to float.  */
259 .align 32
260 .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
261 .align 32
262 .type __svml_sacos_data_internal, @object
263 .size __svml_sacos_data_internal, .-__svml_sacos_data_internal
264

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S */