1 | /* Function atan2f vectorized with AVX2. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
22 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
23 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
24 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
25 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
26 | *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0. |
27 | * |
28 | * |
29 | */ |
30 | |
/* Offsets (in bytes) for data table __svml_satan2_data_internal.
   Each entry is one 256-bit (ymm-width) row of eight identical
   32-bit lanes, hence the stride of 32 between consecutive
   offsets.  The offsets must match the row order of the table
   emitted in .rodata below.  */
#define sZERO				0	/* 0.0f in every lane */
#define sSIGN_MASK			32	/* 0x80000000: float sign bit */
#define sABS_MASK			64	/* 0x7FFFFFFF: clears sign bit */
#define sPIO2				96	/* pi/2 as float */
#define sPI				128	/* pi as float */
#define sPC8				160	/* atan polynomial coefficients ... */
#define sPC7				192
#define sPC6				224
#define sPC5				256
#define sPC4				288
#define sPC3				320
#define sPC2				352
#define sPC1				384
#define sPC0				416	/* ... down to the constant term (1.0f) */
#define iCHK_WORK_SUB			448	/* integer bias for range check */
#define iCHK_WORK_CMP			480	/* integer bound for range check */
49 | |
50 | #include <sysdep.h> |
51 | |
/* _ZGVdN8vv_atan2f_avx2: 8-lane single-precision vector atan2.
   libmvec vector-function ABI: on entry ymm0 = y (8 floats),
   ymm1 = x (8 floats); on exit ymm0 = atan2f(y, x) per lane.
   Lanes outside the fast-path working range (and non-zero NaN/huge
   inputs) are redone one at a time via scalar atan2f@PLT.  */
	.section .text.avx2, "ax" , @progbits
ENTRY(_ZGVdN8vv_atan2f_avx2)
	/* Frame setup: align %rsp to 32 and reserve 128 bytes of
	   scratch used to spill ymm0/ymm7/ymm9 around the scalar
	   fallback calls.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$128, %rsp
	xorl	%edx, %edx		/* %edx = special-input lane mask, none yet */

	/*
	 * #define NO_VECTOR_ZERO_ATAN2_ARGS
	 *  Declarations
	 * Variables
	 * Constants
	 *  The end of declarations
	 *  Implementation
	 * Arguments signs
	 */
	vmovups	sABS_MASK+__svml_satan2_data_internal(%rip), %ymm2

	/* Testing on working interval (done on the integer bit
	   patterns of |x| and |y|, biased by iCHK_WORK_SUB).  */
	vmovups	iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %ymm15
	vmovups	iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %ymm9

	/* ymm5 = vector of 0.0 (sZERO row); used below both for the
	   x<=0 test — if x<0, sPI = Pi, else sPI = 0 — and for the
	   zero-argument tests in the auxiliary branch.  */
	vmovups	__svml_satan2_data_internal(%rip), %ymm5
	vmovaps	%ymm1, %ymm7		/* ymm7 = x (kept live for fallback) */
	vandps	%ymm2, %ymm7, %ymm13	/* ymm13 = |x| */
	vandps	%ymm2, %ymm0, %ymm12	/* ymm12 = |y| */
	vcmplt_oqps %ymm13, %ymm12, %ymm4	/* ymm4 = (|y| < |x|) mask */
	vcmple_oqps %ymm5, %ymm7, %ymm6		/* ymm6 = (x <= 0) mask */
	vpsubd	%ymm15, %ymm13, %ymm10
	vpsubd	%ymm15, %ymm12, %ymm8

	/*
	 * 1) If y<x then a= y, b=x, PIO2=0
	 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
	 */
	vorps	sSIGN_MASK+__svml_satan2_data_internal(%rip), %ymm13, %ymm3
	vblendvps %ymm4, %ymm12, %ymm3, %ymm14	/* ymm14 = a */
	vblendvps %ymm4, %ymm13, %ymm12, %ymm3	/* ymm3  = b */

	/* Division a/b.  Interleaved with it: finish the range check
	   on the biased |x| (ymm10) and |y| (ymm8) bit patterns;
	   ymm10 ends up as the out-of-range lane mask.  */
	vdivps	%ymm3, %ymm14, %ymm11
	vpcmpgtd %ymm9, %ymm10, %ymm14
	vpcmpeqd %ymm9, %ymm10, %ymm15
	vpor	%ymm15, %ymm14, %ymm10
	vmovups	sPC7+__svml_satan2_data_internal(%rip), %ymm15
	vpcmpgtd %ymm9, %ymm8, %ymm14
	vpcmpeqd %ymm9, %ymm8, %ymm8
	vpor	%ymm8, %ymm14, %ymm9
	vmovups	sPC8+__svml_satan2_data_internal(%rip), %ymm14
	vpor	%ymm9, %ymm10, %ymm10

	/* Polynomial, evaluated as two interleaved Estrin/odd-even
	   chains in r^4 (ymm8 = r^4, ymm9 = r^2, r = a/b in ymm11),
	   recombined by the final fma.  */
	vmulps	%ymm11, %ymm11, %ymm9
	vmulps	%ymm9, %ymm9, %ymm8
	vfmadd213ps sPC6+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps sPC5+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
	vfmadd213ps sPC4+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps sPC3+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
	vfmadd213ps sPC2+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps sPC1+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
	vfmadd213ps sPC0+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps %ymm14, %ymm9, %ymm15
	vandnps	sPIO2+__svml_satan2_data_internal(%rip), %ymm4, %ymm4	/* PIO2 where |y|>=|x| */

	/* Reconstruction: poly*r + PIO2, restore the sign of x into
	   the partial result, add Pi where x<0, then apply sign(y).  */
	vfmadd213ps %ymm4, %ymm11, %ymm15
	vxorps	%ymm13, %ymm7, %ymm1	/* ymm1 = sign bit of x */
	vandps	sPI+__svml_satan2_data_internal(%rip), %ymm6, %ymm6
	vorps	%ymm1, %ymm15, %ymm11
	vaddps	%ymm11, %ymm6, %ymm8
	vmovmskps %ymm10, %eax		/* %eax = out-of-range lane bits */
	vxorps	%ymm12, %ymm0, %ymm2	/* ymm2 = sign bit of y */
	vorps	%ymm2, %ymm8, %ymm9	/* ymm9 = full-vector result */

	/* Special branch for fast (vector) processing of zero arguments */
	testl	%eax, %eax

	/* Go to auxiliary branch */
	jne	L(AUX_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 ymm9 ymm10 ymm12 ymm13

	/* Return from auxiliary branch
	 * for out of main path inputs
	 */

L(AUX_BRANCH_RETURN):
	/*
	 * Special branch for fast (vector) processing of zero arguments
	 *  The end of implementation
	 * %edx was set by the auxiliary branch: non-zero iff some lane
	 * still needs the scalar fallback.
	 */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 ymm9

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps	%ymm9, %ymm0		/* return value */
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the frame CFI for the out-of-line code below.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs: spill y (ymm0), x (ymm7) and the vector
	 * result (ymm9) to the scratch area, then patch the flagged
	 * lanes one by one with scalar atan2f.
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%ymm0, 32(%rsp)		/* y lanes */
	vmovups	%ymm7, 64(%rsp)		/* x lanes */
	vmovups	%ymm9, 96(%rsp)		/* result lanes (patched in place) */
	# LOE rbx r12 r13 r14 r15 edx ymm9

	xorl	%eax, %eax		/* %eax = lane index 0 */
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper			/* avoid AVX->SSE penalty around the PLT call */
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d		/* %r12d = current lane index */
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d		/* %r13d = special-lane bit mask */
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check: is the current lane flagged?
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop over the 8 lanes
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes done: restore callee-saved registers and reload
	   the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	96(%rsp), %ymm9

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm9

	/* Scalar math function call
	 * to process special input: lane %r12d's y and x are reloaded
	 * from the spill area and the scalar result is stored back
	 * into the result slot for that lane.
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0	/* y[lane] */
	vmovss	64(%rsp, %r14, 4), %xmm1	/* x[lane] */
	call	atan2f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 96(%rsp, %r14, 4)	/* result[lane] */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	cfi_restore(12)
	cfi_restore(13)
	cfi_restore(14)
	# LOE rbx r15 r12d r13d

	/* Auxiliary branch
	 * for out of main path inputs: zero arguments are fixed up
	 * here with vector code; anything else remains flagged for
	 * the scalar fallback via %edx.
	 */

L(AUX_BRANCH):
	/* Check if at least one of X or Y is zero: iAXAYZERO */
	vpcmpeqd %ymm5, %ymm13, %ymm13	/* |x| == 0 */
	vpcmpeqd %ymm5, %ymm12, %ymm12	/* |y| == 0 */

	/* Check if both X & Y are not NaNs: iXYnotNAN */
	vcmpordps %ymm7, %ymm7, %ymm11
	vcmpordps %ymm0, %ymm0, %ymm14

	/*
	 * Path for zero arguments (at least one of both)
	 * Check if both args are zeros (den. is zero)
	 */
	vcmpeqps %ymm5, %ymm3, %ymm3
	vpor	%ymm12, %ymm13, %ymm15

	/* Set sPIO2 to zero if den. is zero */
	vblendvps %ymm3, %ymm5, %ymm4, %ymm4
	vandps	%ymm14, %ymm11, %ymm8

	/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
	vpand	%ymm8, %ymm15, %ymm8

	/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
	vpcmpgtd %ymm7, %ymm5, %ymm5	/* (0 > x) as signed ints, i.e. x < 0 */
	vorps	%ymm1, %ymm4, %ymm1
	vandps	%ymm6, %ymm5, %ymm6
	vaddps	%ymm6, %ymm1, %ymm1

	/* Exclude from previous callout mask zero (and not NaN) arguments */
	vpandn	%ymm10, %ymm8, %ymm10
	vorps	%ymm2, %ymm1, %ymm2

	/* Go to callout */
	vmovmskps %ymm10, %edx

	/* Merge results from main and spec path */
	vblendvps %ymm8, %ymm2, %ymm9, %ymm9

	/* Return to main vector processing path */
	jmp	L(AUX_BRANCH_RETURN)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 ymm9
END(_ZGVdN8vv_atan2f_avx2)
302 | |
	/* Constant table.  Row order and 32-byte stride must match the
	   #define offsets at the top of this file.  */
	.section .rodata, "a"
	.align	32

#ifdef __svml_satan2_data_internal_typedef
/* C-level description of the table layout (reference only; this
   branch is not assembled).  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 sZERO[8][1];
	__declspec(align(32)) VUINT32 sSIGN_MASK[8][1];
	__declspec(align(32)) VUINT32 sABS_MASK[8][1];
	__declspec(align(32)) VUINT32 sPIO2[8][1];
	__declspec(align(32)) VUINT32 sPI[8][1];
	__declspec(align(32)) VUINT32 sPC8[8][1];
	__declspec(align(32)) VUINT32 sPC7[8][1];
	__declspec(align(32)) VUINT32 sPC6[8][1];
	__declspec(align(32)) VUINT32 sPC5[8][1];
	__declspec(align(32)) VUINT32 sPC4[8][1];
	__declspec(align(32)) VUINT32 sPC3[8][1];
	__declspec(align(32)) VUINT32 sPC2[8][1];
	__declspec(align(32)) VUINT32 sPC1[8][1];
	__declspec(align(32)) VUINT32 sPC0[8][1];
	__declspec(align(32)) VUINT32 iCHK_WORK_SUB[8][1];
	__declspec(align(32)) VUINT32 iCHK_WORK_CMP[8][1];
} __svml_satan2_data_internal;
#endif
__svml_satan2_data_internal:
	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO
	.align	32
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK (float sign bit)
	.align	32
	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK
	.align	32
	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2 = pi/2 as float
	.align	32
	.long	0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI = pi as float
	.align	32
	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sPC8 (atan poly coeff, a.k.a. sA08)
	.align	32
	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sPC7 (sA07)
	.align	32
	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sPC6 (sA06)
	.align	32
	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sPC5 (sA05)
	.align	32
	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sPC4 (sA04)
	.align	32
	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sPC3 (sA03)
	.align	32
	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sPC2 (sA02)
	.align	32
	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sPC1 (sA01, ~ -1/3)
	.align	32
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sPC0 (sA00) = 1.0f
	.align	32
	.long	0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB
	.align	32
	.long	0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP
	.align	32
	.type	__svml_satan2_data_internal, @object
	.size	__svml_satan2_data_internal, .-__svml_satan2_data_internal
362 | |