1 | /* Function atan2f vectorized with SSE4. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
22 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
23 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
24 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
25 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
26 | * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/0.16. |
27 | * |
28 | * |
29 | */ |
30 | |
/* Offsets into the data table __svml_satan2_data_internal.
   Each entry spans 16 bytes: one XMM vector of four identical
   32-bit lanes.  */
33 | #define sZERO 0 |
34 | #define sSIGN_MASK 16 |
35 | #define sABS_MASK 32 |
36 | #define sPIO2 48 |
37 | #define sPI 64 |
38 | #define sPC8 80 |
39 | #define sPC7 96 |
40 | #define sPC6 112 |
41 | #define sPC5 128 |
42 | #define sPC4 144 |
43 | #define sPC3 160 |
44 | #define sPC2 176 |
45 | #define sPC1 192 |
46 | #define sPC0 208 |
47 | #define iCHK_WORK_SUB 224 |
48 | #define iCHK_WORK_CMP 240 |
49 | |
50 | #include <sysdep.h> |
51 | |
	.section .text.sse4, "ax", @progbits
53 | ENTRY(_ZGVbN4vv_atan2f_sse4) |
54 | subq $88, %rsp |
55 | cfi_def_cfa_offset(96) |
56 | movaps %xmm0, %xmm12 |
57 | |
58 | /* |
59 | * #define NO_VECTOR_ZERO_ATAN2_ARGS |
60 | * Declarations |
61 | * Variables |
62 | * Constants |
63 | * The end of declarations |
64 | * Implementation |
65 | * Arguments signs |
66 | */ |
67 | movups sABS_MASK+__svml_satan2_data_internal(%rip), %xmm10 |
68 | movaps %xmm1, %xmm13 |
69 | movaps %xmm10, %xmm11 |
70 | andps %xmm12, %xmm10 |
71 | andps %xmm13, %xmm11 |
72 | movaps %xmm10, %xmm7 |
73 | cmpltps %xmm11, %xmm7 |
74 | |
75 | /* |
76 | * 1) If y<x then a= y, b=x, PIO2=0 |
77 | * 2) If y>x then a=-x, b=y, PIO2=Pi/2 |
78 | */ |
79 | movups sSIGN_MASK+__svml_satan2_data_internal(%rip), %xmm6 |
80 | movaps %xmm7, %xmm0 |
81 | orps %xmm11, %xmm6 |
82 | movaps %xmm10, %xmm4 |
83 | andnps %xmm6, %xmm0 |
84 | movaps %xmm7, %xmm6 |
85 | movaps %xmm11, %xmm5 |
86 | andps %xmm7, %xmm4 |
87 | andnps %xmm10, %xmm6 |
88 | andps %xmm7, %xmm5 |
89 | orps %xmm4, %xmm0 |
90 | orps %xmm5, %xmm6 |
91 | |
92 | /* Division a/b. */ |
93 | divps %xmm6, %xmm0 |
94 | |
	/* Test the arguments against the working interval.  */
96 | movdqu iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %xmm14 |
97 | movaps %xmm11, %xmm15 |
98 | movaps %xmm10, %xmm3 |
99 | psubd %xmm14, %xmm15 |
100 | psubd %xmm14, %xmm3 |
101 | movdqa %xmm15, %xmm1 |
102 | movdqu iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %xmm2 |
103 | movdqa %xmm3, %xmm14 |
104 | pcmpgtd %xmm2, %xmm1 |
105 | pcmpeqd %xmm2, %xmm15 |
106 | pcmpgtd %xmm2, %xmm14 |
107 | pcmpeqd %xmm2, %xmm3 |
108 | por %xmm15, %xmm1 |
109 | por %xmm3, %xmm14 |
110 | por %xmm14, %xmm1 |
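
	/* Per lane, the test above amounts to the following C, for
	   illustration only (the wrapping subtract lets one signed
	   compare flag both very small and very large magnitudes,
	   which are then sent to the scalar callout):

		#include <stdint.h>

		static int
		out_of_range (uint32_t abs_bits)  // bit pattern of |x| or |y|
		{
		  int32_t t = (int32_t) (abs_bits - 0x81000000u); // iCHK_WORK_SUB
		  return t >= (int32_t) 0xFC000000u;		  // iCHK_WORK_CMP
		}
	 */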
111 | |
112 | /* Polynomial. */ |
113 | movaps %xmm0, %xmm14 |
114 | mulps %xmm0, %xmm14 |
115 | movaps %xmm13, %xmm4 |
116 | movmskps %xmm1, %ecx |
117 | movaps %xmm14, %xmm15 |
118 | movaps %xmm11, %xmm9 |
119 | mulps %xmm14, %xmm15 |
120 | pxor %xmm13, %xmm9 |
121 | movups sPC8+__svml_satan2_data_internal(%rip), %xmm2 |
122 | movaps %xmm10, %xmm8 |
123 | mulps %xmm15, %xmm2 |
124 | pxor %xmm12, %xmm8 |
125 | movups sPC7+__svml_satan2_data_internal(%rip), %xmm3 |
126 | xorl %edx, %edx |
127 | mulps %xmm15, %xmm3 |
128 | addps sPC6+__svml_satan2_data_internal(%rip), %xmm2 |
129 | mulps %xmm15, %xmm2 |
130 | addps sPC5+__svml_satan2_data_internal(%rip), %xmm3 |
131 | mulps %xmm15, %xmm3 |
132 | addps sPC4+__svml_satan2_data_internal(%rip), %xmm2 |
133 | mulps %xmm15, %xmm2 |
134 | addps sPC3+__svml_satan2_data_internal(%rip), %xmm3 |
135 | mulps %xmm15, %xmm3 |
136 | addps sPC2+__svml_satan2_data_internal(%rip), %xmm2 |
137 | mulps %xmm2, %xmm15 |
138 | addps sPC1+__svml_satan2_data_internal(%rip), %xmm3 |
139 | mulps %xmm3, %xmm14 |
140 | addps sPC0+__svml_satan2_data_internal(%rip), %xmm15 |
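
	/* The two chains above (combined by the addps below) evaluate
	   P(t), t = s^2, as interleaved Horner streams in t2 = t*t to
	   shorten the dependency chain.  In C terms (illustrative;
	   sPCn denote the coefficient values from the table below):

		float t = s * s, t2 = t * t;
		float even = ((sPC8 * t2 + sPC6) * t2 + sPC4) * t2 + sPC2;
		float odd  = ((sPC7 * t2 + sPC5) * t2 + sPC3) * t2 + sPC1;
		float p = (even * t2 + sPC0) + odd * t;	// P(t) ~= atan(s)/s
	 */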
141 | |
	/* If x < 0: sPI = Pi, else sPI = 0 */
143 | movups __svml_satan2_data_internal(%rip), %xmm5 |
144 | xorl %eax, %eax |
145 | andnps sPIO2+__svml_satan2_data_internal(%rip), %xmm7 |
146 | addps %xmm14, %xmm15 |
147 | cmpleps %xmm5, %xmm4 |
148 | |
149 | /* Reconstruction. */ |
150 | mulps %xmm15, %xmm0 |
151 | andps sPI+__svml_satan2_data_internal(%rip), %xmm4 |
152 | addps %xmm7, %xmm0 |
153 | orps %xmm9, %xmm0 |
154 | addps %xmm4, %xmm0 |
155 | orps %xmm8, %xmm0 |
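
	/* One lane of the reconstruction above, in C terms (r0 =
	   PIO2 + q * P(q^2) is non-negative; zero denominators never
	   reach this point, they are patched on the auxiliary path):

		float r = (x < 0.0f) ? (float) M_PI - r0 : r0;
		return copysignf (r, y);
	 */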
156 | |
157 | /* Special branch for fast (vector) processing of zero arguments */ |
158 | testl %ecx, %ecx |
159 | |
160 | /* Go to auxiliary branch */ |
161 | jne L(AUX_BRANCH) |
162 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 |
163 | |
	/* Return from auxiliary branch
	 * for out-of-main-path inputs
	 */
167 | |
168 | L(AUX_BRANCH_RETURN): |
169 | /* |
170 | * Special branch for fast (vector) processing of zero arguments |
171 | * The end of implementation |
172 | */ |
173 | testl %edx, %edx |
174 | |
175 | /* Go to special inputs processing branch */ |
176 | jne L(SPECIAL_VALUES_BRANCH) |
177 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13 |
178 | |
179 | /* Restore registers |
180 | * and exit the function |
181 | */ |
182 | |
183 | L(EXIT): |
184 | addq $88, %rsp |
185 | cfi_def_cfa_offset(8) |
186 | ret |
187 | cfi_def_cfa_offset(96) |
188 | |
189 | /* Branch to process |
190 | * special inputs |
191 | */ |
192 | |
193 | L(SPECIAL_VALUES_BRANCH): |
194 | movups %xmm12, 32(%rsp) |
195 | movups %xmm13, 48(%rsp) |
196 | movups %xmm0, 64(%rsp) |
197 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 |
198 | |
199 | movq %r12, 16(%rsp) |
200 | cfi_offset(12, -80) |
201 | movl %eax, %r12d |
202 | movq %r13, 8(%rsp) |
203 | cfi_offset(13, -88) |
204 | movl %edx, %r13d |
205 | movq %r14, (%rsp) |
206 | cfi_offset(14, -96) |
207 | # LOE rbx rbp r15 r12d r13d |
208 | |
209 | /* Range mask |
210 | * bits check |
211 | */ |
212 | |
213 | L(RANGEMASK_CHECK): |
214 | btl %r12d, %r13d |
215 | |
216 | /* Call scalar math function */ |
217 | jc L(SCALAR_MATH_CALL) |
218 | # LOE rbx rbp r15 r12d r13d |
219 | |
220 | /* Special inputs |
221 | * processing loop |
222 | */ |
223 | |
224 | L(SPECIAL_VALUES_LOOP): |
225 | incl %r12d |
226 | cmpl $4, %r12d |
227 | |
228 | /* Check bits in range mask */ |
229 | jl L(RANGEMASK_CHECK) |
230 | # LOE rbx rbp r15 r12d r13d |
231 | |
232 | movq 16(%rsp), %r12 |
233 | cfi_restore(12) |
234 | movq 8(%rsp), %r13 |
235 | cfi_restore(13) |
236 | movq (%rsp), %r14 |
237 | cfi_restore(14) |
238 | movups 64(%rsp), %xmm0 |
239 | |
240 | /* Go to exit */ |
241 | jmp L(EXIT) |
242 | cfi_offset(12, -80) |
243 | cfi_offset(13, -88) |
244 | cfi_offset(14, -96) |
245 | # LOE rbx rbp r12 r13 r14 r15 xmm0 |
246 | |
247 | /* Scalar math function call |
248 | * to process special input |
249 | */ |
250 | |
251 | L(SCALAR_MATH_CALL): |
252 | movl %r12d, %r14d |
253 | movss 32(%rsp, %r14, 4), %xmm0 |
254 | movss 48(%rsp, %r14, 4), %xmm1 |
255 | call atan2f@PLT |
256 | # LOE rbx rbp r14 r15 r12d r13d xmm0 |
257 | |
258 | movss %xmm0, 64(%rsp, %r14, 4) |
259 | |
260 | /* Process special inputs in loop */ |
261 | jmp L(SPECIAL_VALUES_LOOP) |
262 | cfi_restore(12) |
263 | cfi_restore(13) |
264 | cfi_restore(14) |
265 | # LOE rbx rbp r15 r12d r13d |
266 | |
	/* Auxiliary branch
	 * for out-of-main-path inputs
	 */
270 | |
271 | L(AUX_BRANCH): |
272 | /* Check if both X & Y are not NaNs: iXYnotNAN */ |
273 | movaps %xmm13, %xmm3 |
274 | movaps %xmm12, %xmm2 |
275 | cmpordps %xmm13, %xmm3 |
276 | cmpordps %xmm12, %xmm2 |
277 | |
278 | /* |
279 | * Path for zero arguments (at least one of both) |
280 | * Check if both args are zeros (den. is zero) |
281 | */ |
282 | cmpeqps %xmm5, %xmm6 |
283 | |
	/* Check if at least one of X or Y is zero: iAXAYZERO */
285 | pcmpeqd %xmm5, %xmm11 |
286 | pcmpeqd %xmm5, %xmm10 |
287 | andps %xmm2, %xmm3 |
288 | por %xmm10, %xmm11 |
289 | |
	/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
291 | andps %xmm3, %xmm11 |
292 | |
	/* Exclude zero (and not NaN) arguments from the previous callout mask */
294 | movaps %xmm11, %xmm10 |
295 | pandn %xmm1, %xmm10 |
296 | |
	/* Set sPIO2 to zero if the denominator is zero */
298 | movaps %xmm6, %xmm1 |
299 | andnps %xmm7, %xmm1 |
300 | andps %xmm5, %xmm6 |
301 | orps %xmm6, %xmm1 |
302 | |
	/* Res = sign(Y) * ((X < 0) ? (PIO2 + PI) : PIO2) */
304 | pcmpgtd %xmm13, %xmm5 |
305 | orps %xmm9, %xmm1 |
306 | andps %xmm4, %xmm5 |
307 | |
308 | /* Merge results from main and spec path */ |
309 | movaps %xmm11, %xmm4 |
310 | addps %xmm5, %xmm1 |
311 | |
312 | /* Go to callout */ |
313 | movmskps %xmm10, %edx |
314 | orps %xmm8, %xmm1 |
315 | andnps %xmm0, %xmm4 |
316 | andps %xmm11, %xmm1 |
317 | movaps %xmm4, %xmm0 |
318 | orps %xmm1, %xmm0 |
319 | |
320 | /* Return to main vector processing path */ |
321 | jmp L(AUX_BRANCH_RETURN) |
322 | # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13 |
323 | END(_ZGVbN4vv_atan2f_sse4) |
324 | |
325 | .section .rodata, "a" |
326 | .align 16 |
327 | |
328 | #ifdef __svml_satan2_data_internal_typedef |
329 | typedef unsigned int VUINT32; |
330 | typedef struct { |
	__attribute__ ((aligned (16))) VUINT32 sZERO[4][1];
	__attribute__ ((aligned (16))) VUINT32 sSIGN_MASK[4][1];
	__attribute__ ((aligned (16))) VUINT32 sABS_MASK[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPIO2[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPI[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC8[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC7[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC6[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC5[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC4[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC3[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC2[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC1[4][1];
	__attribute__ ((aligned (16))) VUINT32 sPC0[4][1];
	__attribute__ ((aligned (16))) VUINT32 iCHK_WORK_SUB[4][1];
	__attribute__ ((aligned (16))) VUINT32 iCHK_WORK_CMP[4][1];
347 | } __svml_satan2_data_internal; |
348 | #endif |
349 | __svml_satan2_data_internal: |
350 | .long 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO |
351 | .align 16 |
352 | .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK |
353 | .align 16 |
354 | .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK |
355 | .align 16 |
356 | .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2 |
357 | .align 16 |
358 | .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI |
359 | .align 16 |
	.long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sPC8
	.align 16
	.long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sPC7
	.align 16
	.long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sPC6
	.align 16
	.long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sPC5
	.align 16
	.long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sPC4
	.align 16
	.long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sPC3
	.align 16
	.long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sPC2
	.align 16
	.long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sPC1
	.align 16
	.long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sPC0
377 | .align 16 |
378 | .long 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB |
379 | .align 16 |
380 | .long 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP |
381 | .align 16 |
382 | .type __svml_satan2_data_internal, @object |
383 | .size __svml_satan2_data_internal, .-__svml_satan2_data_internal |
384 | |