1/* Function atan2f vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
22 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
23 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
24 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
25 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
27 *
28 *
29 */
30
/* Offsets for data table __svml_satan2_data_internal.
   Every entry is one 16-byte vector (4 x 32-bit lanes), so the
   offsets advance in steps of 16.  The names must stay in sync with
   the layout of __svml_satan2_data_internal at the end of this file.  */
#define sZERO				0	/* 0.0f broadcast */
#define sSIGN_MASK			16	/* 0x80000000 sign-bit mask */
#define sABS_MASK			32	/* 0x7FFFFFFF |x| mask */
#define sPIO2				48	/* Pi/2 */
#define sPI				64	/* Pi */
#define sPC8				80	/* polynomial coefficients, high to low */
#define sPC7				96
#define sPC6				112
#define sPC5				128
#define sPC4				144
#define sPC3				160
#define sPC2				176
#define sPC1				192
#define sPC0				208	/* 1.0f */
#define iCHK_WORK_SUB			224	/* integer bias for range check */
#define iCHK_WORK_CMP			240	/* integer bound for range check */
49
50#include <sysdep.h>
51
	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4vv_atan2f_sse4)
	/* 88 bytes of scratch: 32(%rsp)/48(%rsp)/64(%rsp) spill the two
	   input vectors and the result for the special-values path;
	   (%rsp)/8(%rsp)/16(%rsp) save r14/r13/r12 there.  */
	subq	$88, %rsp
	cfi_def_cfa_offset(96)
	movaps	%xmm0, %xmm12		/* xmm12 = y (kept live for special path) */

	/*
	 * #define NO_VECTOR_ZERO_ATAN2_ARGS
	 *  Declarations
	 * Variables
	 * Constants
	 *  The end of declarations
	 *  Implementation
	 * Arguments signs: xmm10 = |y|, xmm11 = |x|
	 */
	movups	sABS_MASK+__svml_satan2_data_internal(%rip), %xmm10
	movaps	%xmm1, %xmm13		/* xmm13 = x (kept live for special path) */
	movaps	%xmm10, %xmm11
	andps	%xmm12, %xmm10
	andps	%xmm13, %xmm11
	movaps	%xmm10, %xmm7
	cmpltps	%xmm11, %xmm7		/* xmm7 = mask(|y| < |x|) */

	/*
	 * 1) If y<x then a= y, b=x, PIO2=0
	 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
	 * Blend per-lane via xmm7 so that xmm0 = a and xmm6 = b below.
	 */
	movups	sSIGN_MASK+__svml_satan2_data_internal(%rip), %xmm6
	movaps	%xmm7, %xmm0
	orps	%xmm11, %xmm6		/* xmm6 = -|x| */
	movaps	%xmm10, %xmm4
	andnps	%xmm6, %xmm0
	movaps	%xmm7, %xmm6
	movaps	%xmm11, %xmm5
	andps	%xmm7, %xmm4
	andnps	%xmm10, %xmm6
	andps	%xmm7, %xmm5
	orps	%xmm4, %xmm0		/* xmm0 = a */
	orps	%xmm5, %xmm6		/* xmm6 = b */

	/* Division a/b. */
	divps	%xmm6, %xmm0

	/* Testing on working interval: compare the raw bit patterns of
	   |x| and |y| (biased by iCHK_WORK_SUB) against iCHK_WORK_CMP;
	   out-of-range lanes are collected in xmm1 for the callout.  */
	movdqu	iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %xmm14
	movaps	%xmm11, %xmm15
	movaps	%xmm10, %xmm3
	psubd	%xmm14, %xmm15
	psubd	%xmm14, %xmm3
	movdqa	%xmm15, %xmm1
	movdqu	iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %xmm2
	movdqa	%xmm3, %xmm14
	pcmpgtd	%xmm2, %xmm1
	pcmpeqd	%xmm2, %xmm15
	pcmpgtd	%xmm2, %xmm14
	pcmpeqd	%xmm2, %xmm3
	por	%xmm15, %xmm1
	por	%xmm3, %xmm14
	por	%xmm14, %xmm1		/* xmm1 = special-case lane mask */

	/* Polynomial in r = a/b: two interleaved coefficient chains
	   (even coefficients in xmm2, odd in xmm3) evaluated in powers
	   of r^4 (xmm15); xmm14 holds r^2.  */
	movaps	%xmm0, %xmm14
	mulps	%xmm0, %xmm14
	movaps	%xmm13, %xmm4		/* xmm4 = x, for the x<=0 test below */
	movmskps %xmm1, %ecx		/* ecx = special-case lane bits */
	movaps	%xmm14, %xmm15
	movaps	%xmm11, %xmm9
	mulps	%xmm14, %xmm15
	pxor	%xmm13, %xmm9		/* xmm9 = sign bit of x */
	movups	sPC8+__svml_satan2_data_internal(%rip), %xmm2
	movaps	%xmm10, %xmm8
	mulps	%xmm15, %xmm2
	pxor	%xmm12, %xmm8		/* xmm8 = sign bit of y */
	movups	sPC7+__svml_satan2_data_internal(%rip), %xmm3
	xorl	%edx, %edx		/* edx = zero-args callout mask (set in AUX) */
	mulps	%xmm15, %xmm3
	addps	sPC6+__svml_satan2_data_internal(%rip), %xmm2
	mulps	%xmm15, %xmm2
	addps	sPC5+__svml_satan2_data_internal(%rip), %xmm3
	mulps	%xmm15, %xmm3
	addps	sPC4+__svml_satan2_data_internal(%rip), %xmm2
	mulps	%xmm15, %xmm2
	addps	sPC3+__svml_satan2_data_internal(%rip), %xmm3
	mulps	%xmm15, %xmm3
	addps	sPC2+__svml_satan2_data_internal(%rip), %xmm2
	mulps	%xmm2, %xmm15
	addps	sPC1+__svml_satan2_data_internal(%rip), %xmm3
	mulps	%xmm3, %xmm14
	addps	sPC0+__svml_satan2_data_internal(%rip), %xmm15

	/* if x<0, sPI = Pi, else sPI = 0 */
	movups	__svml_satan2_data_internal(%rip), %xmm5	/* xmm5 = 0.0 */
	xorl	%eax, %eax
	andnps	sPIO2+__svml_satan2_data_internal(%rip), %xmm7	/* PIO2 only where |y|>=|x| */
	addps	%xmm14, %xmm15		/* merge the two polynomial chains */
	cmpleps	%xmm5, %xmm4		/* xmm4 = mask(x <= 0) */

	/* Reconstruction: res = sign-fixup(poly*r + PIO2 [+ Pi if x<0]). */
	mulps	%xmm15, %xmm0
	andps	sPI+__svml_satan2_data_internal(%rip), %xmm4
	addps	%xmm7, %xmm0
	orps	%xmm9, %xmm0		/* apply sign of x */
	addps	%xmm4, %xmm0
	orps	%xmm8, %xmm0		/* apply sign of y */

	/* Special branch for fast (vector) processing of zero arguments */
	testl	%ecx, %ecx

	/* Go to auxiliary branch */
	jne	L(AUX_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13

	/* Return from auxiliary branch
	 * for out of main path inputs
	 */

L(AUX_BRANCH_RETURN):
	/*
	 * Special branch for fast (vector) processing of zero arguments
	 * The end of implementation
	 */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	addq	$88, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* Re-establish the frame CFA for the code below the return,
	   which runs with the 88-byte frame still allocated.  */
	cfi_def_cfa_offset(96)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill y, x and the partial vector result so that the scalar
	   atan2f call can patch individual lanes in place.  */
	movups	%xmm12, 32(%rsp)
	movups	%xmm13, 48(%rsp)
	movups	%xmm0, 64(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0

	movq	%r12, 16(%rsp)
	cfi_offset(12, -80)
	movl	%eax, %r12d		/* r12d = current lane index (starts at 0) */
	movq	%r13, 8(%rsp)
	cfi_offset(13, -88)
	movl	%edx, %r13d		/* r13d = special-lane bit mask */
	movq	%r14, (%rsp)
	cfi_offset(14, -96)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d		/* CF = does lane r12d need the scalar call?  */

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d		/* 4 lanes in a single-precision SSE vector */

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	64(%rsp), %xmm0		/* reload fully-patched result vector */

	/* Go to exit */
	jmp	L(EXIT)
	/* CFI for the code below, where r12-r14 are still spilled.  */
	cfi_offset(12, -80)
	cfi_offset(13, -88)
	cfi_offset(14, -96)
	# LOE rbx rbp r12 r13 r14 r15 xmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d		/* r14 survives the PLT call (callee-saved) */
	movss	32(%rsp, %r14, 4), %xmm0	/* y lane */
	movss	48(%rsp, %r14, 4), %xmm1	/* x lane */
	call	atan2f@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movss	%xmm0, 64(%rsp, %r14, 4)	/* patch result lane */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	cfi_restore(12)
	cfi_restore(13)
	cfi_restore(14)
	# LOE rbx rbp r15 r12d r13d

	/* Auxiliary branch
	 * for out of main path inputs
	 */

L(AUX_BRANCH):
	/* Check if both X & Y are not NaNs: iXYnotNAN */
	movaps	%xmm13, %xmm3
	movaps	%xmm12, %xmm2
	cmpordps %xmm13, %xmm3
	cmpordps %xmm12, %xmm2

	/*
	 * Path for zero arguments (at least one of both)
	 * Check if both args are zeros (den. is zero)
	 */
	cmpeqps	%xmm5, %xmm6

	/* Check if at least one of X or Y is zero: iAXAYZERO */
	pcmpeqd	%xmm5, %xmm11
	pcmpeqd	%xmm5, %xmm10
	andps	%xmm2, %xmm3
	por	%xmm10, %xmm11

	/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
	andps	%xmm3, %xmm11

	/* Exclude from previous callout mask zero (and not NaN) arguments */
	movaps	%xmm11, %xmm10
	pandn	%xmm1, %xmm10

	/* Set sPIO2 to zero if den. is zero */
	movaps	%xmm6, %xmm1
	andnps	%xmm7, %xmm1
	andps	%xmm5, %xmm6
	orps	%xmm6, %xmm1

	/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
	pcmpgtd	%xmm13, %xmm5		/* mask(0 > x), integer compare on sign bit */
	orps	%xmm9, %xmm1
	andps	%xmm4, %xmm5

	/* Merge results from main and spec path */
	movaps	%xmm11, %xmm4
	addps	%xmm5, %xmm1

	/* Go to callout */
	movmskps %xmm10, %edx		/* edx = remaining lanes needing scalar atan2f */
	orps	%xmm8, %xmm1
	andnps	%xmm0, %xmm4
	andps	%xmm11, %xmm1
	movaps	%xmm4, %xmm0
	orps	%xmm1, %xmm0

	/* Return to main vector processing path */
	jmp	L(AUX_BRANCH_RETURN)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13
END(_ZGVbN4vv_atan2f_sse4)
324
	.section .rodata, "a"
	.align	16

#ifdef __svml_satan2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 sZERO[4][1];
	__declspec(align(16)) VUINT32 sSIGN_MASK[4][1];
	__declspec(align(16)) VUINT32 sABS_MASK[4][1];
	__declspec(align(16)) VUINT32 sPIO2[4][1];
	__declspec(align(16)) VUINT32 sPI[4][1];
	__declspec(align(16)) VUINT32 sPC8[4][1];
	__declspec(align(16)) VUINT32 sPC7[4][1];
	__declspec(align(16)) VUINT32 sPC6[4][1];
	__declspec(align(16)) VUINT32 sPC5[4][1];
	__declspec(align(16)) VUINT32 sPC4[4][1];
	__declspec(align(16)) VUINT32 sPC3[4][1];
	__declspec(align(16)) VUINT32 sPC2[4][1];
	__declspec(align(16)) VUINT32 sPC1[4][1];
	__declspec(align(16)) VUINT32 sPC0[4][1];
	__declspec(align(16)) VUINT32 iCHK_WORK_SUB[4][1];
	__declspec(align(16)) VUINT32 iCHK_WORK_CMP[4][1];
} __svml_satan2_data_internal;
#endif
/* Layout must match the sZERO..iCHK_WORK_CMP offset #defines above;
   comments name each entry by the offset macro the code uses.  */
__svml_satan2_data_internal:
	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000	// sZERO
	.align	16
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000	// sSIGN_MASK
	.align	16
	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF	// sABS_MASK
	.align	16
	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB	// sPIO2 = Pi/2
	.align	16
	.long	0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB	// sPI = Pi
	.align	16
	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0	// sPC8
	.align	16
	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631	// sPC7
	.align	16
	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384	// sPC6
	.align	16
	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629	// sPC5
	.align	16
	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474	// sPC4
	.align	16
	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8	// sPC3
	.align	16
	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F	// sPC2
	.align	16
	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49	// sPC1
	.align	16
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000	// sPC0 = 1.0f
	.align	16
	.long	0x81000000, 0x81000000, 0x81000000, 0x81000000	// iCHK_WORK_SUB
	.align	16
	.long	0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000	// iCHK_WORK_CMP
	.align	16
	.type	__svml_satan2_data_internal, @object
	.size	__svml_satan2_data_internal, .-__svml_satan2_data_internal
384

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f4_core_sse4.S