1 | /* Function atan2f vectorized with SSE4. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
22 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
23 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
24 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
25 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
26 | * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/0.16. |
27 | * |
28 | * |
29 | */ |
30 | |
/* Offsets into the data table __svml_satan2_data_internal.
   Each entry spans 16 bytes: one XMM vector of four identical
   32-bit lanes.  */
33 | #define sZERO 0 |
34 | #define sSIGN_MASK 16 |
35 | #define sABS_MASK 32 |
36 | #define sPIO2 48 |
37 | #define sPI 64 |
38 | #define sPC8 80 |
39 | #define sPC7 96 |
40 | #define sPC6 112 |
41 | #define sPC5 128 |
42 | #define sPC4 144 |
43 | #define sPC3 160 |
44 | #define sPC2 176 |
45 | #define sPC1 192 |
46 | #define sPC0 208 |
47 | #define iCHK_WORK_SUB 224 |
48 | #define iCHK_WORK_CMP 240 |
49 | |
50 | #include <sysdep.h> |
51 | |
	.section .text.sse4, "ax", @progbits
53 | ENTRY(_ZGVbN4vv_atan2f_sse4) |
54 | subq $88, %rsp |
55 | cfi_def_cfa_offset(96) |
56 | movaps %xmm0, %xmm12 |
57 | |
58 | /* |
59 | * #define NO_VECTOR_ZERO_ATAN2_ARGS |
60 | * Declarations |
61 | * Variables |
62 | * Constants |
63 | * The end of declarations |
64 | * Implementation |
65 | * Arguments signs |
66 | */ |
67 | movups sABS_MASK+__svml_satan2_data_internal(%rip), %xmm10 |
68 | movaps %xmm1, %xmm13 |
69 | movaps %xmm10, %xmm11 |
70 | andps %xmm12, %xmm10 |
71 | andps %xmm13, %xmm11 |
72 | movaps %xmm10, %xmm7 |
73 | cmpltps %xmm11, %xmm7 |
74 | |
75 | /* |
76 | * 1) If y<x then a= y, b=x, PIO2=0 |
77 | * 2) If y>x then a=-x, b=y, PIO2=Pi/2 |
78 | */ |
79 | movups sSIGN_MASK+__svml_satan2_data_internal(%rip), %xmm6 |
80 | movaps %xmm7, %xmm0 |
81 | orps %xmm11, %xmm6 |
82 | movaps %xmm10, %xmm4 |
83 | andnps %xmm6, %xmm0 |
84 | movaps %xmm7, %xmm6 |
85 | movaps %xmm11, %xmm5 |
86 | andps %xmm7, %xmm4 |
87 | andnps %xmm10, %xmm6 |
88 | andps %xmm7, %xmm5 |
89 | orps %xmm4, %xmm0 |
90 | orps %xmm5, %xmm6 |
91 | |
92 | /* Division a/b. */ |
93 | divps %xmm6, %xmm0 |
94 | |
	/* Test the arguments against the working interval.  */
96 | movdqu iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %xmm14 |
97 | movaps %xmm11, %xmm15 |
98 | movaps %xmm10, %xmm3 |
99 | psubd %xmm14, %xmm15 |
100 | psubd %xmm14, %xmm3 |
101 | movdqa %xmm15, %xmm1 |
102 | movdqu iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %xmm2 |
103 | movdqa %xmm3, %xmm14 |
104 | pcmpgtd %xmm2, %xmm1 |
105 | pcmpeqd %xmm2, %xmm15 |
106 | pcmpgtd %xmm2, %xmm14 |
107 | pcmpeqd %xmm2, %xmm3 |
108 | por %xmm15, %xmm1 |
109 | por %xmm3, %xmm14 |
110 | por %xmm14, %xmm1 |
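
	/* Per lane, the test above amounts to the following C, for
	   illustration only (the wrapping subtract lets one signed
	   compare flag both very small and very large magnitudes,
	   which are then sent to the scalar callout):

		#include <stdint.h>

		static int
		out_of_range (uint32_t abs_bits)  // bit pattern of |x| or |y|
		{
		  int32_t t = (int32_t) (abs_bits - 0x81000000u); // iCHK_WORK_SUB
		  return t >= (int32_t) 0xFC000000u;		  // iCHK_WORK_CMP
		}
	 */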
111 | |
112 | /* Polynomial. */ |
113 | movaps %xmm0, %xmm14 |
114 | mulps %xmm0, %xmm14 |
115 | movaps %xmm13, %xmm4 |
116 | movmskps %xmm1, %ecx |
117 | movaps %xmm14, %xmm15 |
118 | movaps %xmm11, %xmm9 |
119 | mulps %xmm14, %xmm15 |
120 | pxor %xmm13, %xmm9 |
121 | movups sPC8+__svml_satan2_data_internal(%rip), %xmm2 |
122 | movaps %xmm10, %xmm8 |
123 | mulps %xmm15, %xmm2 |
124 | pxor %xmm12, %xmm8 |
125 | movups sPC7+__svml_satan2_data_internal(%rip), %xmm3 |
126 | xorl %edx, %edx |
127 | mulps %xmm15, %xmm3 |
128 | addps sPC6+__svml_satan2_data_internal(%rip), %xmm2 |
129 | mulps %xmm15, %xmm2 |
130 | addps sPC5+__svml_satan2_data_internal(%rip), %xmm3 |
131 | mulps %xmm15, %xmm3 |
132 | addps sPC4+__svml_satan2_data_internal(%rip), %xmm2 |
133 | mulps %xmm15, %xmm2 |
134 | addps sPC3+__svml_satan2_data_internal(%rip), %xmm3 |
135 | mulps %xmm15, %xmm3 |
136 | addps sPC2+__svml_satan2_data_internal(%rip), %xmm2 |
137 | mulps %xmm2, %xmm15 |
138 | addps sPC1+__svml_satan2_data_internal(%rip), %xmm3 |
139 | mulps %xmm3, %xmm14 |
140 | addps sPC0+__svml_satan2_data_internal(%rip), %xmm15 |
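
	/* The two chains above (combined by the addps below) evaluate
	   P(t), t = s^2, as interleaved Horner streams in t2 = t*t to
	   shorten the dependency chain.  In C terms (illustrative;
	   sPCn denote the coefficient values from the table below):

		float t = s * s, t2 = t * t;
		float even = ((sPC8 * t2 + sPC6) * t2 + sPC4) * t2 + sPC2;
		float odd  = ((sPC7 * t2 + sPC5) * t2 + sPC3) * t2 + sPC1;
		float p = (even * t2 + sPC0) + odd * t;	// P(t) ~= atan(s)/s
	 */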
141 | |
	/* If x < 0: sPI = Pi, else sPI = 0 */
143 | movups __svml_satan2_data_internal(%rip), %xmm5 |
144 | xorl %eax, %eax |
145 | andnps sPIO2+__svml_satan2_data_internal(%rip), %xmm7 |
146 | addps %xmm14, %xmm15 |
147 | cmpleps %xmm5, %xmm4 |
148 | |
149 | /* Reconstruction. */ |
150 | mulps %xmm15, %xmm0 |
151 | andps sPI+__svml_satan2_data_internal(%rip), %xmm4 |
152 | addps %xmm7, %xmm0 |
153 | orps %xmm9, %xmm0 |
154 | addps %xmm4, %xmm0 |
155 | orps %xmm8, %xmm0 |
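
	/* One lane of the reconstruction above, in C terms (r0 =
	   PIO2 + q * P(q^2) is non-negative; zero denominators never
	   reach this point, they are patched on the auxiliary path):

		float r = (x < 0.0f) ? (float) M_PI - r0 : r0;
		return copysignf (r, y);
	 */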
156 | |
157 | /* Special branch for fast (vector) processing of zero arguments */ |
158 | testl %ecx, %ecx |
159 | |
160 | /* Go to auxiliary branch */ |
161 | jne L(AUX_BRANCH) |
162 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 |
163 | |
	/* Return from auxiliary branch
	 * for out-of-main-path inputs
	 */
167 | |
168 | L(AUX_BRANCH_RETURN): |
169 | /* |
170 | * Special branch for fast (vector) processing of zero arguments |
171 | * The end of implementation |
172 | */ |
173 | testl %edx, %edx |
174 | |
175 | /* Go to special inputs processing branch */ |
176 | jne L(SPECIAL_VALUES_BRANCH) |
177 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13 |
178 | |
179 | /* Restore registers |
180 | * and exit the function |
181 | */ |
182 | |
183 | L(EXIT): |
184 | addq $88, %rsp |
185 | cfi_def_cfa_offset(8) |
186 | ret |
187 | cfi_def_cfa_offset(96) |
188 | |
189 | /* Branch to process |
190 | * special inputs |
191 | */ |
192 | |
193 | L(SPECIAL_VALUES_BRANCH): |
194 | movups %xmm12, 32(%rsp) |
195 | movups %xmm13, 48(%rsp) |
196 | movups %xmm0, 64(%rsp) |
197 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 |
198 | |
199 | movq %r12, 16(%rsp) |
200 | cfi_offset(12, -80) |
201 | movl %eax, %r12d |
202 | movq %r13, 8(%rsp) |
203 | cfi_offset(13, -88) |
204 | movl %edx, %r13d |
205 | movq %r14, (%rsp) |
206 | cfi_offset(14, -96) |
207 | # LOE rbx rbp r15 r12d r13d |
208 | |
209 | /* Range mask |
210 | * bits check |
211 | */ |
212 | |
213 | L(RANGEMASK_CHECK): |
214 | btl %r12d, %r13d |
215 | |
216 | /* Call scalar math function */ |
217 | jc L(SCALAR_MATH_CALL) |
218 | # LOE rbx rbp r15 r12d r13d |
219 | |
220 | /* Special inputs |
221 | * processing loop |
222 | */ |
223 | |
224 | L(SPECIAL_VALUES_LOOP): |
225 | incl %r12d |
226 | cmpl $4, %r12d |
227 | |
228 | /* Check bits in range mask */ |
229 | jl L(RANGEMASK_CHECK) |
230 | # LOE rbx rbp r15 r12d r13d |
231 | |
232 | movq 16(%rsp), %r12 |
233 | cfi_restore(12) |
234 | movq 8(%rsp), %r13 |
235 | cfi_restore(13) |
236 | movq (%rsp), %r14 |
237 | cfi_restore(14) |
238 | movups 64(%rsp), %xmm0 |
239 | |
240 | /* Go to exit */ |
241 | jmp L(EXIT) |
242 | cfi_offset(12, -80) |
243 | cfi_offset(13, -88) |
244 | cfi_offset(14, -96) |
245 | # LOE rbx rbp r12 r13 r14 r15 xmm0 |
246 | |
247 | /* Scalar math function call |
248 | * to process special input |
249 | */ |
250 | |
251 | L(SCALAR_MATH_CALL): |
252 | movl %r12d, %r14d |
253 | movss 32(%rsp, %r14, 4), %xmm0 |
254 | movss 48(%rsp, %r14, 4), %xmm1 |
255 | call atan2f@PLT |
256 | # LOE rbx rbp r14 r15 r12d r13d xmm0 |
257 | |
258 | movss %xmm0, 64(%rsp, %r14, 4) |
259 | |
260 | /* Process special inputs in loop */ |
261 | jmp L(SPECIAL_VALUES_LOOP) |
262 | cfi_restore(12) |
263 | cfi_restore(13) |
264 | cfi_restore(14) |
265 | # LOE rbx rbp r15 r12d r13d |
266 | |
	/* Auxiliary branch
	 * for out-of-main-path inputs
	 */
270 | |
271 | L(AUX_BRANCH): |
272 | /* Check if both X & Y are not NaNs: iXYnotNAN */ |
273 | movaps %xmm13, %xmm3 |
274 | movaps %xmm12, %xmm2 |
275 | cmpordps %xmm13, %xmm3 |
276 | cmpordps %xmm12, %xmm2 |
277 | |
278 | /* |
279 | * Path for zero arguments (at least one of both) |
280 | * Check if both args are zeros (den. is zero) |
281 | */ |
282 | cmpeqps %xmm5, %xmm6 |
283 | |
	/* Check if at least one of X or Y is zero: iAXAYZERO */
285 | pcmpeqd %xmm5, %xmm11 |
286 | pcmpeqd %xmm5, %xmm10 |
287 | andps %xmm2, %xmm3 |
288 | por %xmm10, %xmm11 |
289 | |
	/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
291 | andps %xmm3, %xmm11 |
292 | |
	/* Exclude zero (and not NaN) arguments from the previous callout mask */
294 | movaps %xmm11, %xmm10 |
295 | pandn %xmm1, %xmm10 |
296 | |
	/* Set sPIO2 to zero if the denominator is zero */
298 | movaps %xmm6, %xmm1 |
299 | andnps %xmm7, %xmm1 |
300 | andps %xmm5, %xmm6 |
301 | orps %xmm6, %xmm1 |
302 | |
	/* Res = sign(Y) * ((X < 0) ? (PIO2 + PI) : PIO2) */
304 | pcmpgtd %xmm13, %xmm5 |
305 | orps %xmm9, %xmm1 |
306 | andps %xmm4, %xmm5 |
307 | |
308 | /* Merge results from main and spec path */ |
309 | movaps %xmm11, %xmm4 |
310 | addps %xmm5, %xmm1 |
311 | |
312 | /* Go to callout */ |
313 | movmskps %xmm10, %edx |
314 | orps %xmm8, %xmm1 |
315 | andnps %xmm0, %xmm4 |
316 | andps %xmm11, %xmm1 |
317 | movaps %xmm4, %xmm0 |
318 | orps %xmm1, %xmm0 |
319 | |
320 | /* Return to main vector processing path */ |
321 | jmp L(AUX_BRANCH_RETURN) |
322 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13 |
323 | END(_ZGVbN4vv_atan2f_sse4) |
324 | |
325 | .section .rodata, "a" |
326 | .align 16 |
327 | |
328 | #ifdef __svml_satan2_data_internal_typedef |
329 | typedef unsigned int VUINT32; |
330 | typedef struct { |
	__attribute__ ((aligned (16))) VUINT32 sZERO[4][1];
	__attribute__ ((aligned (16))) VUINT32 sSIGN_MASK[4][1];
	__attribute__ ((aligned (16))) VUINT32 sABS_MASK[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPIO2[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPI[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC8[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC7[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC6[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC5[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC4[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC3[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC2[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC1[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC0[4][1];
	__attribute__ ((aligned (16))) VUINT32 iCHK_WORK_SUB[4][1];
	__attribute__ ((aligned (16))) VUINT32 iCHK_WORK_CMP[4][1];
347 | } __svml_satan2_data_internal; |
348 | #endif |
349 | __svml_satan2_data_internal: |
350 | .long 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO |
351 | .align 16 |
352 | .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK |
353 | .align 16 |
354 | .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK |
355 | .align 16 |
356 | .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2 |
357 | .align 16 |
358 | .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI |
359 | .align 16 |
	.long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sPC8
	.align 16
	.long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sPC7
	.align 16
	.long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sPC6
	.align 16
	.long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sPC5
	.align 16
	.long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sPC4
	.align 16
	.long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sPC3
	.align 16
	.long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sPC2
	.align 16
	.long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sPC1
	.align 16
	.long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sPC0
377 | .align 16 |
378 | .long 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB |
379 | .align 16 |
380 | .long 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP |
381 | .align 16 |
382 | .type __svml_satan2_data_internal, @object |
383 | .size __svml_satan2_data_internal, .-__svml_satan2_data_internal |
384 | |