svml_d_atan22_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S]

1	/ Function atan2 vectorized with SSE4.*
2	Copyright (C) 2021-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	https://www.gnu.org/licenses/. /*
18
19	/*
20	* ALGORITHM DESCRIPTION:
21	* For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
22	* For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
23	* For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
24	* For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
25	* For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
26	* Where atan(s) ~= s+s^3*Poly11(s^2) on interval \|s\|<7.0/0.16.
27	*
28	*
29	*/
30
31	/ Offsets for data table __svml_datan2_data_internal*
32	*/
33	#define dPI 0
34	#define dPIO2 16
35	#define dA19 32
36	#define dA18 48
37	#define dA17 64
38	#define dA16 80
39	#define dA15 96
40	#define dA14 112
41	#define dA13 128
42	#define dA12 144
43	#define dA11 160
44	#define dA10 176
45	#define dA09 192
46	#define dA08 208
47	#define dA07 224
48	#define dA06 240
49	#define dA05 256
50	#define dA04 272
51	#define dA03 288
52	#define dA02 304
53	#define dA01 320
54	#define dA00 336
55	#define dSIGN_MASK 352
56	#define iCHK_WORK_SUB 368
57	#define iCHK_WORK_CMP 384
58	#define dABS_MASK 400
59	#define dZERO 416
60
61	#include <sysdep.h>
62
63	.section .text.sse4, "ax", @progbits
64	ENTRY(_ZGVbN2vv_atan2_sse4)
65	subq $`88`, %rsp
66	cfi_def_cfa_offset(`96`)
67	movaps %xmm1, %xmm11
68
69	/*
70	* #define NO_VECTOR_ZERO_ATAN2_ARGS
71	* Declarations
72	* Variables
73	* Constants
74	* The end of declarations
75	* Implementation
76	* Get r0~=1/B
77	* Cannot be replaced by VQRCP(D, dR0, dB);
78	* Argument Absolute values
79	*/
80	movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
81	movaps %xmm0, %xmm10
82	movaps %xmm1, %xmm9
83	andps %xmm10, %xmm1
84	andps %xmm11, %xmm9
85	movaps %xmm1, %xmm4
86	cmpnltpd %xmm9, %xmm4
87
88	/ Argument signs /
89	movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
90	movaps %xmm4, %xmm0
91	movaps %xmm5, %xmm8
92	movaps %xmm5, %xmm7
93
94	/*
95	* 1) If y<x then a= y, b=x, PIO2=0
96	* 2) If y>x then a=-x, b=y, PIO2=Pi/2
97	*/
98	orps %xmm9, %xmm5
99	andnps %xmm1, %xmm0
100	andps %xmm4, %xmm5
101	andps %xmm11, %xmm8
102	movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
103	orps %xmm5, %xmm0
104	movaps %xmm4, %xmm5
105	andps %xmm4, %xmm6
106	andnps %xmm9, %xmm5
107	andps %xmm1, %xmm4
108	orps %xmm4, %xmm5
109	andps %xmm10, %xmm7
110	divpd %xmm5, %xmm0
111	movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
112	xorl %edx, %edx
113
114	/ Check if y and x are on main path. /
115	pshufd $`221`, %xmm9, %xmm3
116	xorl %eax, %eax
117	pshufd $`221`, %xmm1, %xmm13
118	psubd %xmm2, %xmm3
119	psubd %xmm2, %xmm13
120	movdqa %xmm3, %xmm4
121	movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
122	movdqa %xmm13, %xmm14
123	pcmpgtd %xmm12, %xmm4
124	pcmpeqd %xmm12, %xmm3
125	pcmpgtd %xmm12, %xmm14
126	pcmpeqd %xmm12, %xmm13
127
128	/ Polynomial. /
129	movaps %xmm0, %xmm12
130	por %xmm3, %xmm4
131	mulpd %xmm0, %xmm12
132
133	/ P = A19R2 + A18 /*
134	movups dA19+__svml_datan2_data_internal(%rip), %xmm15
135	movaps %xmm11, %xmm2
136	mulpd %xmm12, %xmm15
137	addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
138
139	/ P = PR2 + A17 /*
140	mulpd %xmm12, %xmm15
141	addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
142
143	/ P = PR2 + A16 /*
144	mulpd %xmm12, %xmm15
145	addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
146
147	/ P = PR2 + A15 /*
148	mulpd %xmm12, %xmm15
149	addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
150
151	/ P = PR2 + A14 /*
152	mulpd %xmm12, %xmm15
153	addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
154
155	/ P = PR2 + A13 /*
156	mulpd %xmm12, %xmm15
157	addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
158
159	/ P = PR2 + A12 /*
160	mulpd %xmm12, %xmm15
161	addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
162
163	/ P = PR2 + A11 /*
164	mulpd %xmm12, %xmm15
165	addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
166
167	/ P = PR2 + A10 /*
168	mulpd %xmm12, %xmm15
169	addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
170
171	/ P = PR2 + A09 /*
172	mulpd %xmm12, %xmm15
173	addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
174
175	/ P = PR2 + A08 /*
176	mulpd %xmm12, %xmm15
177	addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
178
179	/ P = PR2 + A07 /*
180	mulpd %xmm12, %xmm15
181	addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
182
183	/ P = PR2 + A06 /*
184	mulpd %xmm12, %xmm15
185	addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
186
187	/ P = PR2 + A05 /*
188	mulpd %xmm12, %xmm15
189	addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
190
191	/ P = PR2 + A04 /*
192	mulpd %xmm12, %xmm15
193	addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
194
195	/ P = PR2 + A03 /*
196	mulpd %xmm12, %xmm15
197	addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
198
199	/ P = PR2 + A02 /*
200	mulpd %xmm12, %xmm15
201	addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
202
203	/ P = PR2 + A01 /*
204	mulpd %xmm12, %xmm15
205	addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
206
207	/ P = PR2 /*
208	mulpd %xmm15, %xmm12
209
210	/*
211	* Reconstruction.
212	* dP=(R+R*dP) + dPIO2
213	*/
214	mulpd %xmm0, %xmm12
215	addpd %xmm12, %xmm0
216
217	/ if x<0, dPI = Pi, else dPI =0 /
218	movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
219	por %xmm13, %xmm14
220	cmplepd %xmm3, %xmm2
221	addpd %xmm6, %xmm0
222	andps __svml_datan2_data_internal(%rip), %xmm2
223	orps %xmm8, %xmm0
224	addpd %xmm2, %xmm0
225	por %xmm14, %xmm4
226	orps %xmm7, %xmm0
227	movmskps %xmm4, %ecx
228
229	/ Special branch for fast (vector) processing of zero arguments /
230	testb $`3`, %cl
231
232	/ Go to auxiliary branch /
233	jne L(AUX_BRANCH)
234	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
235
236	/ Return from auxiliary branch*
237	* for out of main path inputs
238	*/
239
240	L(AUX_BRANCH_RETURN):
241	/*
242	* Special branch for fast (vector) processing of zero arguments
243	* The end of implementation
244	*/
245	testl %edx, %edx
246
247	/ Go to special inputs processing branch /
248	jne L(SPECIAL_VALUES_BRANCH)
249	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
250
251	/ Restore registers*
252	* and exit the function
253	*/
254
255	L(EXIT):
256	addq $`88`, %rsp
257	cfi_def_cfa_offset(`8`)
258	ret
259	cfi_def_cfa_offset(`96`)
260
261	/ Branch to process*
262	* special inputs
263	*/
264
265	L(SPECIAL_VALUES_BRANCH):
266	movups %xmm10, `32`(%rsp)
267	movups %xmm11, `48`(%rsp)
268	movups %xmm0, `64`(%rsp)
269	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
270
271	movq %r12, `16`(%rsp)
272	cfi_offset(`12`, -`80`)
273	movl %eax, %r12d
274	movq %r13, `8`(%rsp)
275	cfi_offset(`13`, -`88`)
276	movl %edx, %r13d
277	movq %r14, (%rsp)
278	cfi_offset(`14`, -`96`)
279	# LOE rbx rbp r15 r12d r13d
280
281	/ Range mask*
282	* bits check
283	*/
284
285	L(RANGEMASK_CHECK):
286	btl %r12d, %r13d
287
288	/ Call scalar math function /
289	jc L(SCALAR_MATH_CALL)
290	# LOE rbx rbp r15 r12d r13d
291
292	/ Special inputs*
293	* processing loop
294	*/
295
296	L(SPECIAL_VALUES_LOOP):
297	incl %r12d
298	cmpl $`2`, %r12d
299
300	/ Check bits in range mask /
301	jl L(RANGEMASK_CHECK)
302	# LOE rbx rbp r15 r12d r13d
303
304	movq `16`(%rsp), %r12
305	cfi_restore(`12`)
306	movq `8`(%rsp), %r13
307	cfi_restore(`13`)
308	movq (%rsp), %r14
309	cfi_restore(`14`)
310	movups `64`(%rsp), %xmm0
311
312	/ Go to exit /
313	jmp L(EXIT)
314	cfi_offset(`12`, -`80`)
315	cfi_offset(`13`, -`88`)
316	cfi_offset(`14`, -`96`)
317	# LOE rbx rbp r12 r13 r14 r15 xmm0
318
319	/ Scalar math function call*
320	* to process special input
321	*/
322
323	L(SCALAR_MATH_CALL):
324	movl %r12d, %r14d
325	movsd `32`(%rsp, %r14, `8`), %xmm0
326	movsd `48`(%rsp, %r14, `8`), %xmm1
327	call atan2@PLT
328	# LOE rbx rbp r14 r15 r12d r13d xmm0
329
330	movsd %xmm0, `64`(%rsp, %r14, `8`)
331
332	/ Process special inputs in loop /
333	jmp L(SPECIAL_VALUES_LOOP)
334	cfi_restore(`12`)
335	cfi_restore(`13`)
336	cfi_restore(`14`)
337	# LOE rbx rbp r15 r12d r13d
338
339	/ Auxiliary branch*
340	* for out of main path inputs
341	*/
342
343	L(AUX_BRANCH):
344	/ Check if both X & Y are not NaNs: iXYnotNAN /
345	movaps %xmm11, %xmm13
346	movaps %xmm10, %xmm12
347	cmpordpd %xmm11, %xmm13
348	cmpordpd %xmm10, %xmm12
349
350	/ Check if at least on of Y or Y is zero: iAXAYZERO /
351	cmpeqpd %xmm3, %xmm9
352	cmpeqpd %xmm3, %xmm1
353
354	/*
355	* Path for zero arguments (at least one of both)
356	* Check if both args are zeros (den. is zero)
357	*/
358	cmpeqpd %xmm3, %xmm5
359	andps %xmm12, %xmm13
360	orps %xmm1, %xmm9
361	pshufd $`221`, %xmm9, %xmm1
362	pshufd $`221`, %xmm13, %xmm9
363
364	/ Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN /
365	pand %xmm9, %xmm1
366
367	/ Exclude from previous callout mask zero (and not NaN) arguments /
368	movdqa %xmm1, %xmm14
369	pandn %xmm4, %xmm14
370
371	/ Set sPIO2 to zero if den. is zero /
372	movaps %xmm5, %xmm4
373	andnps %xmm6, %xmm4
374	andps %xmm3, %xmm5
375
376	/ Res = sign(Y)(X<0)?(PIO2+PI):PIO2 /*
377	pshufd $`221`, %xmm3, %xmm3
378	orps %xmm5, %xmm4
379	pshufd $`221`, %xmm11, %xmm5
380	orps %xmm8, %xmm4
381	pcmpgtd %xmm5, %xmm3
382	pshufd $`80`, %xmm3, %xmm6
383	andps %xmm2, %xmm6
384	addpd %xmm6, %xmm4
385
386	/ Go to callout /
387	movmskps %xmm14, %edx
388
389	/ Merge results from main and spec path /
390	pshufd $`80`, %xmm1, %xmm2
391	orps %xmm7, %xmm4
392	movdqa %xmm2, %xmm7
393	andps %xmm2, %xmm4
394	andnps %xmm0, %xmm7
395	andl $`3`, %edx
396	movaps %xmm7, %xmm0
397	orps %xmm4, %xmm0
398
399	/ Return to main vector processing path /
400	jmp L(AUX_BRANCH_RETURN)
401	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
402	END(_ZGVbN2vv_atan2_sse4)
403
404	.section .rodata, "a"
405	.align `16`
406
407	#ifdef __svml_datan2_data_internal_typedef
408	typedef unsigned int VUINT32;
409	typedef struct {
410	__declspec(align(`16`)) VUINT32 dPI[`2`][`2`];
411	__declspec(align(`16`)) VUINT32 dPIO2[`2`][`2`];
412	__declspec(align(`16`)) VUINT32 dA19[`2`][`2`];
413	__declspec(align(`16`)) VUINT32 dA18[`2`][`2`];
414	__declspec(align(`16`)) VUINT32 dA17[`2`][`2`];
415	__declspec(align(`16`)) VUINT32 dA16[`2`][`2`];
416	__declspec(align(`16`)) VUINT32 dA15[`2`][`2`];
417	__declspec(align(`16`)) VUINT32 dA14[`2`][`2`];
418	__declspec(align(`16`)) VUINT32 dA13[`2`][`2`];
419	__declspec(align(`16`)) VUINT32 dA12[`2`][`2`];
420	__declspec(align(`16`)) VUINT32 dA11[`2`][`2`];
421	__declspec(align(`16`)) VUINT32 dA10[`2`][`2`];
422	__declspec(align(`16`)) VUINT32 dA09[`2`][`2`];
423	__declspec(align(`16`)) VUINT32 dA08[`2`][`2`];
424	__declspec(align(`16`)) VUINT32 dA07[`2`][`2`];
425	__declspec(align(`16`)) VUINT32 dA06[`2`][`2`];
426	__declspec(align(`16`)) VUINT32 dA05[`2`][`2`];
427	__declspec(align(`16`)) VUINT32 dA04[`2`][`2`];
428	__declspec(align(`16`)) VUINT32 dA03[`2`][`2`];
429	__declspec(align(`16`)) VUINT32 dA02[`2`][`2`];
430	__declspec(align(`16`)) VUINT32 dA01[`2`][`2`];
431	__declspec(align(`16`)) VUINT32 dA00[`2`][`2`];
432	__declspec(align(`16`)) VUINT32 dSIGN_MASK[`2`][`2`];
433	__declspec(align(`16`)) VUINT32 iCHK_WORK_SUB[`4`][`1`];
434	__declspec(align(`16`)) VUINT32 iCHK_WORK_CMP[`4`][`1`];
435	__declspec(align(`16`)) VUINT32 dABS_MASK[`2`][`2`];
436	__declspec(align(`16`)) VUINT32 dZERO[`2`][`2`];
437	} __svml_datan2_data_internal;
438	#endif
439	__svml_datan2_data_internal:
440	.quad `0x400921FB54442D18`, `0x400921FB54442D18` // dPI
441	.align `16`
442	.quad `0x3FF921FB54442D18`, `0x3FF921FB54442D18` // dPIO2
443	.align `16`
444	.quad `0xBEF4FDB537ABC7A3`, `0xBEF4FDB537ABC7A3` // dA19
445	.align `16`
446	.quad `0x3F2CED0A36665209`, `0x3F2CED0A36665209` // dA18
447	.align `16`
448	.quad `0xBF52E67C93954C23`, `0xBF52E67C93954C23` // dA17
449	.align `16`
450	.quad `0x3F6F5A1DAE82AFB3`, `0x3F6F5A1DAE82AFB3` // dA16
451	.align `16`
452	.quad `0xBF82B2EC618E4BAD`, `0xBF82B2EC618E4BAD` // dA15
453	.align `16`
454	.quad `0x3F914F4C661116A5`, `0x3F914F4C661116A5` // dA14
455	.align `16`
456	.quad `0xBF9A5E83B081F69C`, `0xBF9A5E83B081F69C` // dA13
457	.align `16`
458	.quad `0x3FA169980CB6AD4F`, `0x3FA169980CB6AD4F` // dA12
459	.align `16`
460	.quad `0xBFA4EFA2E563C1BC`, `0xBFA4EFA2E563C1BC` // dA11
461	.align `16`
462	.quad `0x3FA7EC0FBC50683B`, `0x3FA7EC0FBC50683B` // dA10
463	.align `16`
464	.quad `0xBFAAD261EAA09954`, `0xBFAAD261EAA09954` // dA09
465	.align `16`
466	.quad `0x3FAE1749BD612DCF`, `0x3FAE1749BD612DCF` // dA08
467	.align `16`
468	.quad `0xBFB11084009435E0`, `0xBFB11084009435E0` // dA07
469	.align `16`
470	.quad `0x3FB3B12A49295651`, `0x3FB3B12A49295651` // dA06
471	.align `16`
472	.quad `0xBFB745D009BADA94`, `0xBFB745D009BADA94` // dA05
473	.align `16`
474	.quad `0x3FBC71C707F7D5B5`, `0x3FBC71C707F7D5B5` // dA04
475	.align `16`
476	.quad `0xBFC2492491EE55C7`, `0xBFC2492491EE55C7` // dA03
477	.align `16`
478	.quad `0x3FC999999997EE34`, `0x3FC999999997EE34` // dA02
479	.align `16`
480	.quad `0xBFD55555555553C5`, `0xBFD55555555553C5` // dA01
481	.align `16`
482	.quad `0x3FF0000000000000`, `0x3FF0000000000000` // dA00
483	.align `16`
484	.quad `0x8000000000000000`, `0x8000000000000000` // dSIGN_MASK
485	.align `16`
486	.long `0x80300000`, `0x80300000`, `0x80300000`, `0x80300000` // iCHK_WORK_SUB
487	.align `16`
488	.long `0xfdd00000`, `0xfdd00000`, `0xfdd00000`, `0xfdd00000` // iCHK_WORK_CMP
489	.align `16`
490	.quad `0x7fffffffffffffff`, `0x7fffffffffffffff` // dABS_MASK
491	.align `16`
492	.quad `0x0000000000000000`, `0x0000000000000000` // dZERO
493	.align `16`
494	.type __svml_datan2_data_internal, @object
495	.size __svml_datan2_data_internal, .-__svml_datan2_data_internal
496

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S