/* Function atan vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/. */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */
30
/* Byte offsets of the fields of the data table
   __svml_datan_data_internal_avx512, which is defined in the .rodata
   section later in this file.  Every field starts on a 16-byte
   boundary.  Vector constants are 16 bytes (one value replicated in
   both double lanes); Tbl_H and Tbl_L are 32 doubles (256 bytes) each;
   coeff is six 16-byte vectors (96 bytes).  */
#define AbsMask		0	/* sign-bit-clearing mask */
#define Shifter		16	/* rounding constant used to form the table index */
#define MaxThreshold	32	/* |x| at or above this takes the large-x path */
#define MOne		48	/* -1.0 */
#define One		64	/* +1.0 */
#define LargeX		80	/* upper clamp for |x| on the large-x path */
#define Zero		96	/* 0.0 */
#define Tbl_H		112	/* 32-entry table, high parts */
#define Tbl_L		368	/* 32-entry table, low parts */
#define dIndexMed	624	/* Shifter bit pattern with index 16 added */
#define Pi2		640	/* pi/2, high part */
#define Pi2_low		656	/* pi/2, low part */
#define coeff		672	/* six polynomial coefficients */

#include <sysdep.h>
48
/* Emit the kernel into the .text.sse4 section.  */
	.section .text.sse4, "ax", @progbits

/* _ZGVbN2v_atan_sse4: atan of two packed doubles.

   In:   %xmm0 = x (two double-precision lanes)
   Out:  %xmm0 = result; the sign bit of x is XORed back into each lane
   Uses: %rax, %rcx, %rdx, %xmm1-%xmm15.  No stack accesses, no calls.

   Per lane, with a = |x|:
     1. Shifter + a rounds a onto the table grid and leaves the table
	index in the low mantissa bits of the sum; subtracting Shifter
	again gives the grid point c.
     2. R = (a - c) / (1 + c*a), or R = -1 / min (a, LargeX) in lanes
	where MaxThreshold <= a.
     3. P = R + R^3 * Poly (R^2), with Poly built from the six vectors
	at `coeff' (called c0..c5 below, in table order).
     4. T = Tbl_H[index], or Pi2 in lanes where MaxThreshold <= a.
     5. Result = (T + P) XOR sign (x).

   ENTRY/END are not defined in this file; presumably they come from
   <sysdep.h>.  */
ENTRY(_ZGVbN2v_atan_sse4)
	/* %rcx points 128 bytes into Tbl_H, so -128(%rcx) addresses its
	   first 16 entries and 0(%rcx) its last 16.  */
	lea	Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx
	/* AbsMask lives at offset 0 of the table.  */
	movups	__svml_datan_data_internal_avx512(%rip), %xmm4
	movups	Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3
	andps	%xmm0, %xmm4		/* xmm4 = a = |x| */
	movaps	%xmm3, %xmm12
	movaps	%xmm4, %xmm5		/* xmm5 = a */
	addpd	%xmm4, %xmm12		/* xmm12 = Shifter + a */
	movaps	%xmm12, %xmm7

	/*
	 * table lookup sequence
	 * VPERMUTE not available
	 */
	movaps	%xmm12, %xmm10
	subpd	%xmm3, %xmm7		/* xmm7 = c = (Shifter + a) - Shifter */
	subpd	%xmm7, %xmm5		/* xmm5 = a - c */
	mulpd	%xmm4, %xmm7		/* xmm7 = c * a */
	movups	MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
	/* Scale the index bits by 8: byte offset into a table of doubles.  */
	psllq	$3, %xmm10

	/* saturate X range */
	movups	LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
	pxor	%xmm4, %xmm0		/* xmm0 = x ^ |x| = sign bit of x */
	cmplepd	%xmm4, %xmm2		/* xmm2 = mask (MaxThreshold <= a) */
	/* xmm7 = 1 + c*a */
	addpd	One+__svml_datan_data_internal_avx512(%rip), %xmm7
	minpd	%xmm4, %xmm8		/* xmm8 = min (LargeX, a) */
	movups	MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
	movaps	%xmm2, %xmm1
	movaps	%xmm2, %xmm9
	andnps	%xmm5, %xmm1		/* mask clear: numerator a - c */
	andps	%xmm2, %xmm6		/* mask set:   numerator -1.0 */
	andnps	%xmm7, %xmm9		/* mask clear: denominator 1 + c*a */
	andps	%xmm2, %xmm8		/* mask set:   denominator min (LargeX, a) */
	orps	%xmm6, %xmm1		/* xmm1 = blended numerator */
	orps	%xmm8, %xmm9		/* xmm9 = blended denominator */

	/* R+Rl = DiffX/Y */
	divpd	%xmm9, %xmm1		/* xmm1 = R */
	/* Keep offset bits 0x78 only: (index & 15) * 8 in each lane.  */
	pand	.FLT_11(%rip), %xmm10

	/* set table value to Pi/2 for large X */
	movups	Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
	movd	%xmm10, %eax		/* eax = lane 0 table offset */
	andps	%xmm2, %xmm4		/* xmm4 = Pi2 where mask set, else 0 */
	pshufd	$2, %xmm10, %xmm11	/* move lane 1 offset to dword 0 */
	movaps	%xmm2, %xmm10		/* keep the mask; xmm2 is reused below */

	/* polynomial evaluation */
	movaps	%xmm1, %xmm2
	mulpd	%xmm1, %xmm2		/* xmm2 = R2 = R*R */
	movd	%xmm11, %edx		/* edx = lane 1 table offset */
	/* xmm5 = c0 */
	movups	coeff+__svml_datan_data_internal_avx512(%rip), %xmm5
	movaps	%xmm2, %xmm7
	/* xmm6 = c2 */
	movups	coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6
	movaps	%xmm2, %xmm9
	mulpd	%xmm2, %xmm5		/* c0*R2 */
	mulpd	%xmm2, %xmm7		/* xmm7 = R4 = R2*R2 */
	/* xmm5 = c0*R2 + c1 */
	addpd	coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5
	mulpd	%xmm2, %xmm6		/* c2*R2 */
	mulpd	%xmm7, %xmm5		/* (c0*R2 + c1)*R4 */
	/* xmm6 = c2*R2 + c3 */
	addpd	coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6
	mulpd	%xmm1, %xmm9		/* xmm9 = R3 = R2*R */
	addpd	%xmm5, %xmm6		/* (c0*R2 + c1)*R4 + c2*R2 + c3 */
	/* xmm8 = c4 */
	movups	coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8
	mulpd	%xmm2, %xmm8		/* c4*R2 */
	mulpd	%xmm6, %xmm7		/* R4 * ((c0*R2 + c1)*R4 + c2*R2 + c3) */
	/* xmm8 = c4*R2 + c5 */
	addpd	coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8
	addpd	%xmm7, %xmm8		/* xmm8 = Poly (R2) */
	mulpd	%xmm8, %xmm9		/* xmm9 = R3 * Poly (R2) */
	movups	dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
	/* xmm14 = mask (dIndexMed <= Shifter + a): selects the half of Tbl_H.  */
	cmplepd	%xmm12, %xmm14
	addpd	%xmm9, %xmm1		/* xmm1 = P = R + R3 * Poly (R2) */
	movslq	%eax, %rax
	movaps	%xmm14, %xmm3
	movslq	%edx, %rdx
	/* Load both candidate entries for each lane: xmm13 from the first
	   16 entries of Tbl_H, xmm15 from the last 16.  */
	movsd	-128(%rax, %rcx), %xmm13
	movsd	(%rcx, %rax), %xmm15
	movhpd	-128(%rdx, %rcx), %xmm13
	movhpd	(%rcx, %rdx), %xmm15
	andnps	%xmm13, %xmm3		/* first-half entry where xmm14 clear */
	andps	%xmm14, %xmm15		/* second-half entry where xmm14 set */
	orps	%xmm15, %xmm3		/* xmm3 = selected Tbl_H entry */
	andnps	%xmm3, %xmm10		/* zero the entry in large-x lanes */
	orps	%xmm4, %xmm10		/* ... and substitute Pi2 there */
	addpd	%xmm1, %xmm10		/* table value + P */
	pxor	%xmm10, %xmm0		/* XOR the sign of x back in */
	ret

END(_ZGVbN2v_atan_sse4)
140
/* Read-only constants for _ZGVbN2v_atan_sse4.  The field order and sizes
   below must stay in step with the offset macros (AbsMask ... coeff)
   defined at the top of this file.  */
	.section .rodata, "a"
	.align	16

/* Layout of the table as a C declaration; only compiled when
   __svml_datan_data_internal_avx512_typedef is defined.  */
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 AbsMask[2][2];
	__declspec(align(16)) VUINT32 Shifter[2][2];
	__declspec(align(16)) VUINT32 MaxThreshold[2][2];
	__declspec(align(16)) VUINT32 MOne[2][2];
	__declspec(align(16)) VUINT32 One[2][2];
	__declspec(align(16)) VUINT32 LargeX[2][2];
	__declspec(align(16)) VUINT32 Zero[2][2];
	__declspec(align(16)) VUINT32 Tbl_H[32][2];
	__declspec(align(16)) VUINT32 Tbl_L[32][2];
	__declspec(align(16)) VUINT32 dIndexMed[2][2];
	__declspec(align(16)) VUINT32 Pi2[2][2];
	__declspec(align(16)) VUINT32 Pi2_low[2][2];
	__declspec(align(16)) VUINT32 coeff[6][2][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
	/* AbsMask: all bits except the sign bit.  */
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff
	/* Shifter: 1.5 * 2^50.  One ulp at this magnitude is 0.25, so
	   adding it to |x| rounds |x| to a multiple of 0.25.  */
	.align	16
	.quad	0x4318000000000000, 0x4318000000000000
	/* MaxThreshold: 7.875.  */
	.align	16
	.quad	0x401f800000000000, 0x401f800000000000
	/* MOne: -1.0.  */
	.align	16
	.quad	0xbff0000000000000, 0xbff0000000000000
	/* One: +1.0.  */
	.align	16
	.quad	0x3ff0000000000000, 0x3ff0000000000000
	/* LargeX: 2^128.  */
	.align	16
	.quad	0x47f0000000000000, 0x47f0000000000000
	/* Zero: 0.0.  Not referenced by the code in this file.  */
	.align	16
	.quad	0x0000000000000000, 0x0000000000000000
	/* Tbl_H: 32 doubles.  Entry 4 is pi/4 = atan (1.0) and entry 31 is
	   atan (7.75), i.e. entry i holds atan (i * 0.25).  */
	.align	16
	.quad	0x0000000000000000, 0x3fcf5b75f92c80dd
	.quad	0x3fddac670561bb4f, 0x3fe4978fa3269ee1
	.quad	0x3fe921fb54442d18, 0x3fecac7c57846f9e
	.quad	0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
	.quad	0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
	.quad	0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
	.quad	0x3ff3fc176b7a8560, 0x3ff45b54837351a0
	.quad	0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
	.quad	0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
	.quad	0x3ff5a25052114e60, 0x3ff5d013c41adabd
	.quad	0x3ff5f97315254857, 0x3ff61f06c6a92b89
	.quad	0x3ff6414d44094c7c, 0x3ff660b02c736a06
	.quad	0x3ff67d8863bc99bd, 0x3ff698213a9d5053
	.quad	0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
	.quad	0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
	.quad	0x3ff7030cf9403197, 0x3ff7145eac2088a4
	/* Tbl_L: 32 doubles, presumably the low-order parts matching Tbl_H.
	   Not referenced by the code in this file.  */
	.align	16
	.quad	0x0000000000000000, 0x3c68ab6e3cf7afbd
	.quad	0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
	.quad	0x3c81a62633145c07, 0x3c80dae13ad18a6b
	.quad	0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
	.quad	0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
	.quad	0x3c96254cb03bb199, 0xbc812c77e8a80f5c
	.quad	0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
	.quad	0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
	.quad	0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
	.quad	0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
	.quad	0xbc831151a43b51ca, 0xbc8487d50bceb1a5
	.quad	0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
	.quad	0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
	.quad	0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
	.quad	0xbc929c86447928e7, 0xbc8957a7170df016
	.quad	0xbc7cbe1896221608, 0xbc9fda5797b32a0b
	/* dIndexMed: the Shifter bit pattern plus 0x10, i.e. the value of
	   Shifter + |x| at table index 16.  */
	.align	16
	.quad	0x4318000000000010, 0x4318000000000010
	/* Pi2: pi/2.  */
	.align	16
	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18
	/* Pi2_low: low-order part of pi/2.  Not referenced by the code in
	   this file.  */
	.align	16
	.quad	0x3c91a62633145c07, 0x3c91a62633145c07
	/* coeff6: the six polynomial coefficients addressed as coeff+0 ...
	   coeff+80.  The last three are close to 1/5, -1/7 ... in reverse:
	   the final entry is ~ -1/3, the one before it ~ 1/5.  */
	.align	16
	.quad	0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
	.quad	0xbfb74257c46790cc, 0xbfb74257c46790cc
	.quad	0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
	.quad	0xbfc249248eef04da, 0xbfc249248eef04da
	.quad	0x3fc999999998741e, 0x3fc999999998741e
	.quad	0xbfd555555555554d, 0xbfd555555555554d
	.align	16
	.type	__svml_datan_data_internal_avx512, @object
	.size	__svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512
	.align	16

/* Mask applied to the shifted index in each 64-bit lane: keeps byte
   offset bits 0x78, i.e. (index & 15) * 8.  */
.FLT_11:
	.long	0x00000078, 0x00000000, 0x00000078, 0x00000000
	.type	.FLT_11, @object
	.size	.FLT_11, 16
245

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S */