/* Function atan2f vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *    For   0.0     <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *    For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *    For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *    For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *    For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *    where atan(s) ~= s + s^3*Poly(s^2) on the interval |s| < 7.0/16.0;
 *    here Poly is the degree-7 polynomial in s^2 with coefficients
 *    sPC1..sPC8 below (sPC0 = 1.0 supplies the leading s term).
 */
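
/* For reference, a scalar C sketch of what each vector lane below
   computes (illustrative only, not part of the build; "poly" stands
   for the degree-8 polynomial in s^2 whose coefficients sPC0..sPC8
   are stored in the data table at the end of this file):

     #include <math.h>

     static float poly (float t);   // sPC0 + sPC1*t + ... + sPC8*t^8

     static float atan2f_sketch (float y, float x)
     {
       float ax = fabsf (x), ay = fabsf (y);
       // 1) |y| <  |x|: s = |y|/|x|, base = 0
       // 2) |y| >= |x|: s = |x|/|y|, base = Pi/2, atan part negated
       int swap = !(ay < ax);
       float s = swap ? ax / ay : ay / ax;
       float r = s * poly (s * s);            // ~ atan(s), s in [0, 1]
       r = swap ? (float) M_PI_2 - r : r;     // atan(|y|/|x|) in [0, Pi/2]
       if (x < 0.0f)
         r = (float) M_PI - r;                // reflect into (Pi/2, Pi]
       return copysignf (r, y);               // result sign follows y
     }
*/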

/* Offsets for data table __svml_satan2_data_internal.  */
#define sZERO 0
#define sSIGN_MASK 32
#define sABS_MASK 64
#define sPIO2 96
#define sPI 128
#define sPC8 160
#define sPC7 192
#define sPC6 224
#define sPC5 256
#define sPC4 288
#define sPC3 320
#define sPC2 352
#define sPC1 384
#define sPC0 416
#define iCHK_WORK_SUB 448
#define iCHK_WORK_CMP 480
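
/* Each name above is the byte offset of one 32-byte row (8 identical
   32-bit lanes) inside __svml_satan2_data_internal, so consecutive
   offsets step by 32.  Rows are addressed RIP-relative, e.g.

       vmovups sPC8+__svml_satan2_data_internal(%rip), %ymm14

   loads the eight copies of coefficient sPC8 (byte offset 160) into
   %ymm14.  */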

#include <sysdep.h>

        .section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8vv_atan2f_avx2)
        pushq   %rbp
        cfi_def_cfa_offset(16)
        movq    %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq    $-32, %rsp
        subq    $128, %rsp
        xorl    %edx, %edx

        /*
         * #define NO_VECTOR_ZERO_ATAN2_ARGS
         * Declarations
         * Variables
         * Constants
         * The end of declarations
         * Implementation
         * Arguments signs
         */
        vmovups sABS_MASK+__svml_satan2_data_internal(%rip), %ymm2

        /* Testing on working interval.  */
        vmovups iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %ymm15
        vmovups iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %ymm9

        /* If x < 0: sPI = Pi, else sPI = 0.  */
        vmovups __svml_satan2_data_internal(%rip), %ymm5
        vmovaps %ymm1, %ymm7
        vandps  %ymm2, %ymm7, %ymm13
        vandps  %ymm2, %ymm0, %ymm12
        vcmplt_oqps %ymm13, %ymm12, %ymm4
        vcmple_oqps %ymm5, %ymm7, %ymm6
        vpsubd  %ymm15, %ymm13, %ymm10
        vpsubd  %ymm15, %ymm12, %ymm8

        /*
         * 1) If |y| <  |x| then a = |y|,  b = |x|, PIO2 = 0.
         * 2) If |y| >= |x| then a = -|x|, b = |y|, PIO2 = Pi/2.
         */
        vorps   sSIGN_MASK+__svml_satan2_data_internal(%rip), %ymm13, %ymm3
        vblendvps %ymm4, %ymm12, %ymm3, %ymm14
        vblendvps %ymm4, %ymm13, %ymm12, %ymm3

        /* Division a/b.  */
        vdivps  %ymm3, %ymm14, %ymm11
        vpcmpgtd %ymm9, %ymm10, %ymm14
        vpcmpeqd %ymm9, %ymm10, %ymm15
        vpor    %ymm15, %ymm14, %ymm10
        vmovups sPC7+__svml_satan2_data_internal(%rip), %ymm15
        vpcmpgtd %ymm9, %ymm8, %ymm14
        vpcmpeqd %ymm9, %ymm8, %ymm8
        vpor    %ymm8, %ymm14, %ymm9
        vmovups sPC8+__svml_satan2_data_internal(%rip), %ymm14
        vpor    %ymm9, %ymm10, %ymm10
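
        /* The integer subtract/compare pairs above implement a branchless
           range check on the bits of |x| and |y|: lanes whose magnitude is
           very small (near the denormal range) or very large (near
           overflow, Inf or NaN) are flagged for the scalar callout.  A
           scalar C sketch of one lane, assuming f already has its sign bit
           cleared (illustrative only):

             #include <stdint.h>
             #include <string.h>

             static int out_of_working_range (float f)
             {
               uint32_t bits;
               memcpy (&bits, &f, sizeof bits);
               // Tiny inputs wrap around to large positive values after
               // the subtract; huge/non-finite ones land just below zero,
               // so one signed >= compare catches both ends.
               return (int32_t) (bits - 0x81000000u) >= (int32_t) 0xFC000000u;
             }
        */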

        /* Polynomial.  */
        vmulps  %ymm11, %ymm11, %ymm9
        vmulps  %ymm9, %ymm9, %ymm8
        vfmadd213ps sPC6+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
        vfmadd213ps sPC5+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
        vfmadd213ps sPC4+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
        vfmadd213ps sPC3+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
        vfmadd213ps sPC2+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
        vfmadd213ps sPC1+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
        vfmadd213ps sPC0+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
        vfmadd213ps %ymm14, %ymm9, %ymm15
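
        /* The two FMA chains above evaluate P(t) = sPC0 + sPC1*t + ... +
           sPC8*t^8 (t = s^2) as interleaved even/odd Horner chains in
           t^2 = s^4, halving the FMA dependency chain.  A scalar C sketch
           (illustrative only; PC0..PC8 name the table constants, and base
           is the 0-or-Pi/2 addend selected just below):

             float t  = s * s;
             float t2 = t * t;                                  // s^4
             float even = fmaf (fmaf (fmaf (fmaf (PC8, t2, PC6),
                                            t2, PC4), t2, PC2), t2, PC0);
             float odd  = fmaf (fmaf (fmaf (PC7, t2, PC5), t2, PC3), t2, PC1);
             float p    = fmaf (odd, t, even);                  // P(s^2)
             float r    = fmaf (p, s, base);                    // reconstruction
        */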
        vandnps sPIO2+__svml_satan2_data_internal(%rip), %ymm4, %ymm4

        /* Reconstruction.  */
        vfmadd213ps %ymm4, %ymm11, %ymm15
        vxorps  %ymm13, %ymm7, %ymm1
        vandps  sPI+__svml_satan2_data_internal(%rip), %ymm6, %ymm6
        vorps   %ymm1, %ymm15, %ymm11
        vaddps  %ymm11, %ymm6, %ymm8
        vmovmskps %ymm10, %eax
        vxorps  %ymm12, %ymm0, %ymm2
        vorps   %ymm2, %ymm8, %ymm9
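
        /* Worked example of the reconstruction: atan2f (-1.0f, -1.0f).
           |y| >= |x| selects a = -|x|, b = |y|, base = Pi/2, so the
           polynomial path gives Pi/2 + atan(-1) = Pi/4; ORing in sign(x)
           makes it -Pi/4, adding Pi (since x <= 0) gives 3*Pi/4, and
           ORing in sign(y) yields the final result -3*Pi/4.  */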

        /* Special branch for fast (vector) processing of zero arguments.  */
        testl   %eax, %eax

        /* Go to auxiliary branch.  */
        jne     L(AUX_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 ymm9 ymm10 ymm12 ymm13

        /* Return from auxiliary branch
         * for out of main path inputs.
         */

L(AUX_BRANCH_RETURN):
        /*
         * Special branch for fast (vector) processing of zero arguments
         * The end of implementation
         */
        testl   %edx, %edx

        /* Go to special inputs processing branch.  */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 ymm9

        /* Restore registers
         * and exit the function.
         */

L(EXIT):
        vmovaps %ymm9, %ymm0
        movq    %rbp, %rsp
        popq    %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)

        /* Branch to process
         * special inputs.
         */

L(SPECIAL_VALUES_BRANCH):
        vmovups %ymm0, 32(%rsp)
        vmovups %ymm7, 64(%rsp)
        vmovups %ymm9, 96(%rsp)
        # LOE rbx r12 r13 r14 r15 edx ymm9

        xorl    %eax, %eax
        # LOE rbx r12 r13 r14 r15 eax edx

        vzeroupper
        movq    %r12, 16(%rsp)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d

        /* Range mask
         * bits check.
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function.  */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

        /* Special inputs
         * processing loop.
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $8, %r12d

        /* Check bits in range mask.  */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        vmovups 96(%rsp), %ymm9

        /* Go to exit.  */
        jmp     L(EXIT)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 ymm9

        /* Scalar math function call
         * to process special input.
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        vmovss  32(%rsp, %r14, 4), %xmm0
        vmovss  64(%rsp, %r14, 4), %xmm1
        call    atan2f@PLT
        # LOE rbx r14 r15 r12d r13d xmm0

        vmovss  %xmm0, 96(%rsp, %r14, 4)

        /* Process special inputs in loop.  */
        jmp     L(SPECIAL_VALUES_LOOP)
        cfi_restore(12)
        cfi_restore(13)
        cfi_restore(14)
        # LOE rbx r15 r12d r13d
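
        /* Taken together, the three labels above behave like this scalar
           fallback loop over the 8-bit range mask (C sketch, illustrative
           only; y, x and res stand for the 32-byte stack slots written at
           L(SPECIAL_VALUES_BRANCH)):

             for (int i = 0; i < 8; i++)
               if ((mask >> i) & 1)              // btl: lane i was flagged
                 res[i] = atan2f (y[i], x[i]);   // scalar libm call
        */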

        /* Auxiliary branch
         * for out of main path inputs.
         */

L(AUX_BRANCH):
        /* Check if at least one of X or Y is zero: iAXAYZERO.  */
        vpcmpeqd %ymm5, %ymm13, %ymm13
        vpcmpeqd %ymm5, %ymm12, %ymm12

        /* Check that neither X nor Y is a NaN: iXYnotNAN.  */
        vcmpordps %ymm7, %ymm7, %ymm11
        vcmpordps %ymm0, %ymm0, %ymm14

        /*
         * Path for zero arguments (at least one of the two).
         * Check if both args are zeros (denominator is zero).
         */
        vcmpeqps %ymm5, %ymm3, %ymm3
        vpor    %ymm12, %ymm13, %ymm15

        /* Set sPIO2 to zero if the denominator is zero.  */
        vblendvps %ymm3, %ymm5, %ymm4, %ymm4
        vandps  %ymm14, %ymm11, %ymm8

        /* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN.  */
        vpand   %ymm8, %ymm15, %ymm8

        /* Res = sign(Y) * (X < 0 ? PIO2 + PI : PIO2).  */
        vpcmpgtd %ymm7, %ymm5, %ymm5
        vorps   %ymm1, %ymm4, %ymm1
        vandps  %ymm6, %ymm5, %ymm6
        vaddps  %ymm6, %ymm1, %ymm1

        /* Exclude zero (and not NaN) arguments from the previous callout mask.  */
        vpandn  %ymm10, %ymm8, %ymm10
        vorps   %ymm2, %ymm1, %ymm2

        /* Go to callout.  */
        vmovmskps %ymm10, %edx

        /* Merge results from main and special paths.  */
        vblendvps %ymm8, %ymm2, %ymm9, %ymm9

        /* Return to main vector processing path.  */
        jmp     L(AUX_BRANCH_RETURN)
        # LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 ymm9
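
        /* The zero-argument path above corresponds to the C99 atan2f
           special cases (result signs follow the sign of Y):

             atan2f (+-0, x > 0)  -> +-0
             atan2f (+-0, x < 0)  -> +-Pi
             atan2f (+-0, +0)     -> +-0
             atan2f (+-0, -0)     -> +-Pi
             atan2f (y != 0, +-0) -> +-Pi/2
        */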
END(_ZGVdN8vv_atan2f_avx2)

        .section .rodata, "a"
        .align 32

#ifdef __svml_satan2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(32)) VUINT32 sZERO[8][1];
        __declspec(align(32)) VUINT32 sSIGN_MASK[8][1];
        __declspec(align(32)) VUINT32 sABS_MASK[8][1];
        __declspec(align(32)) VUINT32 sPIO2[8][1];
        __declspec(align(32)) VUINT32 sPI[8][1];
        __declspec(align(32)) VUINT32 sPC8[8][1];
        __declspec(align(32)) VUINT32 sPC7[8][1];
        __declspec(align(32)) VUINT32 sPC6[8][1];
        __declspec(align(32)) VUINT32 sPC5[8][1];
        __declspec(align(32)) VUINT32 sPC4[8][1];
        __declspec(align(32)) VUINT32 sPC3[8][1];
        __declspec(align(32)) VUINT32 sPC2[8][1];
        __declspec(align(32)) VUINT32 sPC1[8][1];
        __declspec(align(32)) VUINT32 sPC0[8][1];
        __declspec(align(32)) VUINT32 iCHK_WORK_SUB[8][1];
        __declspec(align(32)) VUINT32 iCHK_WORK_CMP[8][1];
} __svml_satan2_data_internal;
#endif
__svml_satan2_data_internal:
        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO
        .align 32
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK
        .align 32
        .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK
        .align 32
        .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2
        .align 32
        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI
        .align 32
        .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sPC8
        .align 32
        .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sPC7
        .align 32
        .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sPC6
        .align 32
        .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sPC5
        .align 32
        .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sPC4
        .align 32
        .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sPC3
        .align 32
        .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sPC2
        .align 32
        .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sPC1
        .align 32
        .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sPC0
        .align 32
        .long 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB
        .align 32
        .long 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP
        .align 32
        .type __svml_satan2_data_internal, @object
        .size __svml_satan2_data_internal, .-__svml_satan2_data_internal
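
/* Decoded values of the key constants above (hex is the IEEE-754
   single-precision bit pattern):

     sPIO2 = 0x3FC90FDB ~  1.5707964  (Pi/2)
     sPI   = 0x40490FDB ~  3.1415927  (Pi)
     sPC0  = 0x3F800000 =  1.0
     sPC1  = 0xBEAAAA49 ~ -0.3333323  (~ -1/3)
     sPC2  = 0x3E4CB79F ~  0.1999131  (~ +1/5)
*/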