1 | /* Function atan2f vectorized with AVX2. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
22 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
23 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
24 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
25 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
26 | *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0. |
27 | * |
28 | * |
29 | */ |
30 | |
/* Offsets (in bytes) for data table __svml_satan2_data_internal.
   Each entry is one 256-bit (ymm-width) row of eight identical
   32-bit lanes, hence the stride of 32 between consecutive
   offsets.  The offsets must match the row order of the table
   emitted in .rodata below.  */
#define sZERO				0	/* 0.0f in every lane */
#define sSIGN_MASK			32	/* 0x80000000: float sign bit */
#define sABS_MASK			64	/* 0x7FFFFFFF: clears sign bit */
#define sPIO2				96	/* pi/2 as float */
#define sPI				128	/* pi as float */
#define sPC8				160	/* atan polynomial coefficients ... */
#define sPC7				192
#define sPC6				224
#define sPC5				256
#define sPC4				288
#define sPC3				320
#define sPC2				352
#define sPC1				384
#define sPC0				416	/* ... down to the constant term (1.0f) */
#define iCHK_WORK_SUB			448	/* integer bias for range check */
#define iCHK_WORK_CMP			480	/* integer bound for range check */
49 | |
50 | #include <sysdep.h> |
51 | |
/* _ZGVdN8vv_atan2f_avx2: 8-lane single-precision vector atan2.
   libmvec vector-function ABI: on entry ymm0 = y (8 floats),
   ymm1 = x (8 floats); on exit ymm0 = atan2f(y, x) per lane.
   Lanes outside the fast-path working range (and non-zero NaN/huge
   inputs) are redone one at a time via scalar atan2f@PLT.  */
	.section .text.avx2, "ax" , @progbits
ENTRY(_ZGVdN8vv_atan2f_avx2)
	/* Frame setup: align %rsp to 32 and reserve 128 bytes of
	   scratch used to spill ymm0/ymm7/ymm9 around the scalar
	   fallback calls.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$128, %rsp
	xorl	%edx, %edx		/* %edx = special-input lane mask, none yet */

	/*
	 * #define NO_VECTOR_ZERO_ATAN2_ARGS
	 *  Declarations
	 * Variables
	 * Constants
	 *  The end of declarations
	 *  Implementation
	 * Arguments signs
	 */
	vmovups	sABS_MASK+__svml_satan2_data_internal(%rip), %ymm2

	/* Testing on working interval (done on the integer bit
	   patterns of |x| and |y|, biased by iCHK_WORK_SUB).  */
	vmovups	iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %ymm15
	vmovups	iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %ymm9

	/* ymm5 = vector of 0.0 (sZERO row); used below both for the
	   x<=0 test — if x<0, sPI = Pi, else sPI = 0 — and for the
	   zero-argument tests in the auxiliary branch.  */
	vmovups	__svml_satan2_data_internal(%rip), %ymm5
	vmovaps	%ymm1, %ymm7		/* ymm7 = x (kept live for fallback) */
	vandps	%ymm2, %ymm7, %ymm13	/* ymm13 = |x| */
	vandps	%ymm2, %ymm0, %ymm12	/* ymm12 = |y| */
	vcmplt_oqps %ymm13, %ymm12, %ymm4	/* ymm4 = (|y| < |x|) mask */
	vcmple_oqps %ymm5, %ymm7, %ymm6		/* ymm6 = (x <= 0) mask */
	vpsubd	%ymm15, %ymm13, %ymm10
	vpsubd	%ymm15, %ymm12, %ymm8

	/*
	 * 1) If y<x then a= y, b=x, PIO2=0
	 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
	 */
	vorps	sSIGN_MASK+__svml_satan2_data_internal(%rip), %ymm13, %ymm3
	vblendvps %ymm4, %ymm12, %ymm3, %ymm14	/* ymm14 = a */
	vblendvps %ymm4, %ymm13, %ymm12, %ymm3	/* ymm3  = b */

	/* Division a/b.  Interleaved with it: finish the range check
	   on the biased |x| (ymm10) and |y| (ymm8) bit patterns;
	   ymm10 ends up as the out-of-range lane mask.  */
	vdivps	%ymm3, %ymm14, %ymm11
	vpcmpgtd %ymm9, %ymm10, %ymm14
	vpcmpeqd %ymm9, %ymm10, %ymm15
	vpor	%ymm15, %ymm14, %ymm10
	vmovups	sPC7+__svml_satan2_data_internal(%rip), %ymm15
	vpcmpgtd %ymm9, %ymm8, %ymm14
	vpcmpeqd %ymm9, %ymm8, %ymm8
	vpor	%ymm8, %ymm14, %ymm9
	vmovups	sPC8+__svml_satan2_data_internal(%rip), %ymm14
	vpor	%ymm9, %ymm10, %ymm10

	/* Polynomial, evaluated as two interleaved Estrin/odd-even
	   chains in r^4 (ymm8 = r^4, ymm9 = r^2, r = a/b in ymm11),
	   recombined by the final fma.  */
	vmulps	%ymm11, %ymm11, %ymm9
	vmulps	%ymm9, %ymm9, %ymm8
	vfmadd213ps sPC6+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps sPC5+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
	vfmadd213ps sPC4+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps sPC3+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
	vfmadd213ps sPC2+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps sPC1+__svml_satan2_data_internal(%rip), %ymm8, %ymm15
	vfmadd213ps sPC0+__svml_satan2_data_internal(%rip), %ymm8, %ymm14
	vfmadd213ps %ymm14, %ymm9, %ymm15
	vandnps	sPIO2+__svml_satan2_data_internal(%rip), %ymm4, %ymm4	/* PIO2 where |y|>=|x| */

	/* Reconstruction: poly*r + PIO2, restore the sign of x into
	   the partial result, add Pi where x<0, then apply sign(y).  */
	vfmadd213ps %ymm4, %ymm11, %ymm15
	vxorps	%ymm13, %ymm7, %ymm1	/* ymm1 = sign bit of x */
	vandps	sPI+__svml_satan2_data_internal(%rip), %ymm6, %ymm6
	vorps	%ymm1, %ymm15, %ymm11
	vaddps	%ymm11, %ymm6, %ymm8
	vmovmskps %ymm10, %eax		/* %eax = out-of-range lane bits */
	vxorps	%ymm12, %ymm0, %ymm2	/* ymm2 = sign bit of y */
	vorps	%ymm2, %ymm8, %ymm9	/* ymm9 = full-vector result */

	/* Special branch for fast (vector) processing of zero arguments */
	testl	%eax, %eax

	/* Go to auxiliary branch */
	jne	L(AUX_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 ymm9 ymm10 ymm12 ymm13

	/* Return from auxiliary branch
	 * for out of main path inputs
	 */

L(AUX_BRANCH_RETURN):
	/*
	 * Special branch for fast (vector) processing of zero arguments
	 *  The end of implementation
	 * %edx was set by the auxiliary branch: non-zero iff some lane
	 * still needs the scalar fallback.
	 */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 ymm9

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps	%ymm9, %ymm0		/* return value */
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the frame CFI for the out-of-line code below.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs: spill y (ymm0), x (ymm7) and the vector
	 * result (ymm9) to the scratch area, then patch the flagged
	 * lanes one by one with scalar atan2f.
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%ymm0, 32(%rsp)		/* y lanes */
	vmovups	%ymm7, 64(%rsp)		/* x lanes */
	vmovups	%ymm9, 96(%rsp)		/* result lanes (patched in place) */
	# LOE rbx r12 r13 r14 r15 edx ymm9

	xorl	%eax, %eax		/* %eax = lane index 0 */
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper			/* avoid AVX->SSE penalty around the PLT call */
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d		/* %r12d = current lane index */
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d		/* %r13d = special-lane bit mask */
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check: is the current lane flagged?
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop over the 8 lanes
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes done: restore callee-saved registers and reload
	   the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	96(%rsp), %ymm9

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm9

	/* Scalar math function call
	 * to process special input: lane %r12d's y and x are reloaded
	 * from the spill area and the scalar result is stored back
	 * into the result slot for that lane.
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0	/* y[lane] */
	vmovss	64(%rsp, %r14, 4), %xmm1	/* x[lane] */
	call	atan2f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 96(%rsp, %r14, 4)	/* result[lane] */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	cfi_restore(12)
	cfi_restore(13)
	cfi_restore(14)
	# LOE rbx r15 r12d r13d

	/* Auxiliary branch
	 * for out of main path inputs: zero arguments are fixed up
	 * here with vector code; anything else remains flagged for
	 * the scalar fallback via %edx.
	 */

L(AUX_BRANCH):
	/* Check if at least one of X or Y is zero: iAXAYZERO */
	vpcmpeqd %ymm5, %ymm13, %ymm13	/* |x| == 0 */
	vpcmpeqd %ymm5, %ymm12, %ymm12	/* |y| == 0 */

	/* Check if both X & Y are not NaNs: iXYnotNAN */
	vcmpordps %ymm7, %ymm7, %ymm11
	vcmpordps %ymm0, %ymm0, %ymm14

	/*
	 * Path for zero arguments (at least one of both)
	 * Check if both args are zeros (den. is zero)
	 */
	vcmpeqps %ymm5, %ymm3, %ymm3
	vpor	%ymm12, %ymm13, %ymm15

	/* Set sPIO2 to zero if den. is zero */
	vblendvps %ymm3, %ymm5, %ymm4, %ymm4
	vandps	%ymm14, %ymm11, %ymm8

	/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
	vpand	%ymm8, %ymm15, %ymm8

	/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
	vpcmpgtd %ymm7, %ymm5, %ymm5	/* (0 > x) as signed ints, i.e. x < 0 */
	vorps	%ymm1, %ymm4, %ymm1
	vandps	%ymm6, %ymm5, %ymm6
	vaddps	%ymm6, %ymm1, %ymm1

	/* Exclude from previous callout mask zero (and not NaN) arguments */
	vpandn	%ymm10, %ymm8, %ymm10
	vorps	%ymm2, %ymm1, %ymm2

	/* Go to callout */
	vmovmskps %ymm10, %edx

	/* Merge results from main and spec path */
	vblendvps %ymm8, %ymm2, %ymm9, %ymm9

	/* Return to main vector processing path */
	jmp	L(AUX_BRANCH_RETURN)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm7 ymm9
END(_ZGVdN8vv_atan2f_avx2)
302 | |
	/* Constant table.  Row order and 32-byte stride must match the
	   #define offsets at the top of this file.  */
	.section .rodata, "a"
	.align	32

#ifdef __svml_satan2_data_internal_typedef
/* C-level description of the table layout (reference only; this
   branch is not assembled).  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 sZERO[8][1];
	__declspec(align(32)) VUINT32 sSIGN_MASK[8][1];
	__declspec(align(32)) VUINT32 sABS_MASK[8][1];
	__declspec(align(32)) VUINT32 sPIO2[8][1];
	__declspec(align(32)) VUINT32 sPI[8][1];
	__declspec(align(32)) VUINT32 sPC8[8][1];
	__declspec(align(32)) VUINT32 sPC7[8][1];
	__declspec(align(32)) VUINT32 sPC6[8][1];
	__declspec(align(32)) VUINT32 sPC5[8][1];
	__declspec(align(32)) VUINT32 sPC4[8][1];
	__declspec(align(32)) VUINT32 sPC3[8][1];
	__declspec(align(32)) VUINT32 sPC2[8][1];
	__declspec(align(32)) VUINT32 sPC1[8][1];
	__declspec(align(32)) VUINT32 sPC0[8][1];
	__declspec(align(32)) VUINT32 iCHK_WORK_SUB[8][1];
	__declspec(align(32)) VUINT32 iCHK_WORK_CMP[8][1];
} __svml_satan2_data_internal;
#endif
__svml_satan2_data_internal:
	.long	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO
	.align	32
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK (float sign bit)
	.align	32
	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK
	.align	32
	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2 = pi/2 as float
	.align	32
	.long	0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI = pi as float
	.align	32
	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sPC8 (atan poly coeff, a.k.a. sA08)
	.align	32
	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sPC7 (sA07)
	.align	32
	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sPC6 (sA06)
	.align	32
	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sPC5 (sA05)
	.align	32
	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sPC4 (sA04)
	.align	32
	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sPC3 (sA03)
	.align	32
	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sPC2 (sA02)
	.align	32
	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sPC1 (sA01, ~ -1/3)
	.align	32
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sPC0 (sA00) = 1.0f
	.align	32
	.long	0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB
	.align	32
	.long	0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP
	.align	32
	.type	__svml_satan2_data_internal, @object
	.size	__svml_satan2_data_internal, .-__svml_satan2_data_internal
362 | |