1 | /* Function atan vectorized with SSE4. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
23 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
24 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
25 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
26 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
27 | * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0. |
28 | * |
29 | */ |
30 | |
31 | /* Offsets for data table __svml_datan_data_internal_avx512 |
32 | */ |
/* Byte offsets of the fields of __svml_datan_data_internal_avx512, measured
   from the table's label.  Every offset is a multiple of 16.  */
/* Seven consecutive 16-byte fields (two quadwords each).  */
33 | #define AbsMask 0 |
34 | #define Shifter 16 |
35 | #define MaxThreshold 32 |
36 | #define MOne 48 |
37 | #define One 64 |
38 | #define LargeX 80 |
39 | #define Zero 96 |
/* Tbl_H, Tbl_L and dIndexMed are spaced 256 bytes apart, i.e. each table
   has room for 32 quadwords.  */
40 | #define Tbl_H 112 |
41 | #define Tbl_L 368 |
42 | #define dIndexMed 624 |
43 | #define Pi2 640 |
44 | #define Pi2_low 656 |
/* Last field.  The code addresses it as coeff + 16*k for k = 0..5.  */
45 | #define coeff 672 |
46 | |
47 | #include <sysdep.h> |
48 | |
49 | .section .text.sse4, "ax" , @progbits |
/* _ZGVbN2v_atan_sse4: atan of two packed doubles.

   In:   %xmm0 = x (two doubles).
   Out:  %xmm0 = result.  The sign bits of x are split off at entry and XORed
         back onto the value computed from |x| just before ret.
   Uses: %xmm0-%xmm15, %rax, %rcx, %rdx.  The instructions between ENTRY and
         ret make no calls and do not touch the stack.

   Method as implemented by the instructions below:
     c = (Shifter + |x|) - Shifter.  Shifter is 1.5*2^50, where adjacent
         doubles are 0.25 apart, so c is |x| rounded to a multiple of 0.25
         (rounding direction follows MXCSR; presumably round-to-nearest).
     R = (|x| - c) / (1 + |x|*c)
     value = Tbl_H[index] + R + R^3 * P(R^2)
   where index is taken from the low mantissa bits of Shifter + |x| and P is
   a degree-5 polynomial in R^2 built from the six coeff entries.
   In lanes where MaxThreshold <= |x| the quotient is replaced by
   MOne / min (LargeX, |x|) and the table term by Pi2.
   The Tbl_L, Pi2_low and Zero fields are not referenced here.  */
50 | ENTRY(_ZGVbN2v_atan_sse4) |
/* %rcx = Tbl_H + 128 bytes (entry 16 of Tbl_H), so the lower and upper halves
   of the table are reachable as -128(%rcx,off) and (%rcx,off).  */
51 | lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx |
/* %xmm4 = |x|: the bare table symbol is the AbsMask field (offset 0).
   %xmm3 = Shifter.  */
52 | movups __svml_datan_data_internal_avx512(%rip), %xmm4 |
53 | movups Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3 |
54 | andps %xmm0, %xmm4 |
/* %xmm12 = Shifter + |x| (kept until the dIndexMed compare).
   %xmm5 = copy of |x|, %xmm7 = copy of Shifter + |x|.  */
55 | movaps %xmm3, %xmm12 |
56 | movaps %xmm4, %xmm5 |
57 | addpd %xmm4, %xmm12 |
58 | movaps %xmm12, %xmm7 |
59 |
60 | /* |
61 | * table lookup sequence |
62 | * VPERMUTE not available |
63 | */ |
/* %xmm10 = bits of (Shifter + |x|) shifted left by 3 in each 64-bit lane;
            it is masked into a byte offset further down.
   %xmm7  = c, then |x| * c.
   %xmm5  = |x| - c.
   %xmm2  = MaxThreshold.  */
64 | movaps %xmm12, %xmm10 |
65 | subpd %xmm3, %xmm7 |
66 | subpd %xmm7, %xmm5 |
67 | mulpd %xmm4, %xmm7 |
68 | movups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2 |
69 | psllq $3, %xmm10 |
70 |
71 | /* saturate X range */ |
/* %xmm0 = x XOR |x|: only the sign bits of x remain.
   %xmm2 = all-ones in lanes where MaxThreshold <= |x| (the "large" mask).
   %xmm7 = 1 + |x|*c.
   %xmm8 = min (LargeX, |x|).  */
72 | movups LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8 |
73 | pxor %xmm4, %xmm0 |
74 | cmplepd %xmm4, %xmm2 |
75 | addpd One+__svml_datan_data_internal_avx512(%rip), %xmm7 |
76 | minpd %xmm4, %xmm8 |
/* Bitwise per-lane select on the large mask:
     numerator   %xmm1 = large ? MOne                : |x| - c
     denominator %xmm9 = large ? min (LargeX, |x|)   : 1 + |x|*c  */
77 | movups MOne+__svml_datan_data_internal_avx512(%rip), %xmm6 |
78 | movaps %xmm2, %xmm1 |
79 | movaps %xmm2, %xmm9 |
80 | andnps %xmm5, %xmm1 |
81 | andps %xmm2, %xmm6 |
82 | andnps %xmm7, %xmm9 |
83 | andps %xmm2, %xmm8 |
84 | orps %xmm6, %xmm1 |
85 | orps %xmm8, %xmm9 |
86 |
87 | /* R+Rl = DiffX/Y */ |
/* %xmm1 = R = numerator / denominator.
   The AND with .FLT_11 (0x78 in each lane's low dword) keeps bits 3..6 of
   %xmm10, i.e. a byte offset 0, 8, ..., 120 into one 16-entry table half.  */
88 | divpd %xmm9, %xmm1 |
89 | pand .FLT_11(%rip), %xmm10 |
90 |
91 | /* set table value to Pi/2 for large X */ |
/* %xmm4  = Pi2 in large lanes, zero elsewhere.
   %eax   = lane-0 table offset.
   %xmm11 = %xmm10 shuffled so that dword 2 (low half of lane 1) sits in
            dword 0, ready for the movd to %edx below.
   %xmm10 is then reused as a copy of the large mask.  */
92 | movups Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4 |
93 | movd %xmm10, %eax |
94 | andps %xmm2, %xmm4 |
95 | pshufd $2, %xmm10, %xmm11 |
96 | movaps %xmm2, %xmm10 |
97 |
98 | /* polynomial evaluation */ |
/* With c0..c5 the coeff entries in table order:
     %xmm2 = R^2, %xmm7 = R^4, %xmm9 = R^3
     %xmm5 = (c0*R^2 + c1) * R^4
     %xmm6 = (c2*R^2 + c3) + %xmm5
     %xmm7 = R^4 * %xmm6
     %xmm8 = (c4*R^2 + c5) + %xmm7
     %xmm9 = R^3 * %xmm8
   The movd of the lane-1 table offset into %edx is interleaved here.  */
99 | movaps %xmm1, %xmm2 |
100 | mulpd %xmm1, %xmm2 |
101 | movd %xmm11, %edx |
102 | movups coeff+__svml_datan_data_internal_avx512(%rip), %xmm5 |
103 | movaps %xmm2, %xmm7 |
104 | movups coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6 |
105 | movaps %xmm2, %xmm9 |
106 | mulpd %xmm2, %xmm5 |
107 | mulpd %xmm2, %xmm7 |
108 | addpd coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5 |
109 | mulpd %xmm2, %xmm6 |
110 | mulpd %xmm7, %xmm5 |
111 | addpd coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6 |
112 | mulpd %xmm1, %xmm9 |
113 | addpd %xmm5, %xmm6 |
114 | movups coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8 |
115 | mulpd %xmm2, %xmm8 |
116 | mulpd %xmm6, %xmm7 |
117 | addpd coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8 |
118 | addpd %xmm7, %xmm8 |
119 | mulpd %xmm8, %xmm9 |
/* %xmm14 = all-ones in lanes where dIndexMed <= Shifter + |x|; those lanes
            take their entry from the upper half of Tbl_H.
   %xmm1  = R + R^3 * P(R^2).  */
120 | movups dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14 |
121 | cmplepd %xmm12, %xmm14 |
122 | addpd %xmm9, %xmm1 |
/* Sign-extend both offsets to 64 bits, then fetch per lane:
     %xmm13 = entry from Tbl_H + off        (lower half, entries 0..15)
     %xmm15 = entry from Tbl_H + 128 + off  (upper half, entries 16..31)
   movsd fills lane 0, movhpd fills lane 1.  %xmm3 = copy of the
   upper-half mask.  */
123 | movslq %eax, %rax |
124 | movaps %xmm14, %xmm3 |
125 | movslq %edx, %rdx |
126 | movsd -128(%rax, %rcx), %xmm13 |
127 | movsd (%rcx, %rax), %xmm15 |
128 | movhpd -128(%rdx, %rcx), %xmm13 |
129 | movhpd (%rcx, %rdx), %xmm15 |
/* %xmm3  = upper-half mask ? %xmm15 : %xmm13   (the selected Tbl_H entry)
   %xmm10 = large ? Pi2 : selected Tbl_H entry  */
130 | andnps %xmm13, %xmm3 |
131 | andps %xmm14, %xmm15 |
132 | orps %xmm15, %xmm3 |
133 | andnps %xmm3, %xmm10 |
134 | orps %xmm4, %xmm10 |
/* Add the polynomial term to the table term, then XOR the saved sign bits of
   x back in to form the return value in %xmm0.  */
135 | addpd %xmm1, %xmm10 |
136 | pxor %xmm10, %xmm0 |
137 | ret |
138 |
139 | END(_ZGVbN2v_atan_sse4) |
140 | |
/* Read-only constants for the double-precision atan kernel.  Each two-lane
   constant is stored as two identical quadwords, one per vector lane.  */
141 | .section .rodata, "a" |
142 | .align 16 |
143 |
/* C-style description of the table layout.  It is only visible when
   __svml_datan_data_internal_avx512_typedef is defined; presumably that is
   never the case in an assembler build and the block serves as layout
   documentation.  Element type is 32-bit, so [2][2] is 16 bytes and
   [32][2] is 256 bytes.  */
144 | #ifdef __svml_datan_data_internal_avx512_typedef |
145 | typedef unsigned int VUINT32; |
146 | typedef struct { |
147 | __declspec(align(16)) VUINT32 AbsMask[2][2]; |
148 | __declspec(align(16)) VUINT32 Shifter[2][2]; |
149 | __declspec(align(16)) VUINT32 MaxThreshold[2][2]; |
150 | __declspec(align(16)) VUINT32 MOne[2][2]; |
151 | __declspec(align(16)) VUINT32 One[2][2]; |
152 | __declspec(align(16)) VUINT32 LargeX[2][2]; |
153 | __declspec(align(16)) VUINT32 Zero[2][2]; |
154 | __declspec(align(16)) VUINT32 Tbl_H[32][2]; |
155 | __declspec(align(16)) VUINT32 Tbl_L[32][2]; |
156 | __declspec(align(16)) VUINT32 dIndexMed[2][2]; |
157 | __declspec(align(16)) VUINT32 Pi2[2][2]; |
158 | __declspec(align(16)) VUINT32 Pi2_low[2][2]; |
159 | __declspec(align(16)) VUINT32 coeff[6][2][2]; |
160 | } __svml_datan_data_internal_avx512; |
161 | #endif |
162 | __svml_datan_data_internal_avx512: |
/* Every bit set except the sign bit of each double.  */
163 | /* AbsMask */ |
164 | .quad 0x7fffffffffffffff, 0x7fffffffffffffff |
/* 1.5 * 2^50 as a double; at this magnitude adjacent doubles are 0.25
   apart.  */
165 | /* Shifter */ |
166 | .align 16 |
167 | .quad 0x4318000000000000, 0x4318000000000000 |
/* 7.875 as a double.  */
168 | /* MaxThreshold */ |
169 | .align 16 |
170 | .quad 0x401f800000000000, 0x401f800000000000 |
/* -1.0 */
171 | /* MOne */ |
172 | .align 16 |
173 | .quad 0xbff0000000000000, 0xbff0000000000000 |
/* +1.0 */
174 | /* One */ |
175 | .align 16 |
176 | .quad 0x3ff0000000000000, 0x3ff0000000000000 |
/* 2^128 as a double.  */
177 | /* LargeX */ |
178 | .align 16 |
179 | .quad 0x47f0000000000000, 0x47f0000000000000 |
/* +0.0; not referenced by _ZGVbN2v_atan_sse4 in this file.  */
180 | /* Zero */ |
181 | .align 16 |
182 | .quad 0x0000000000000000, 0x0000000000000000 |
/* 32 doubles, one per table index (not duplicated per lane).  Entry 0 is 0.0
   and entry 4 has the mantissa of Pi2 with the exponent one lower, which is
   consistent with entry k being atan (k/4) -- presumably the high part of
   that value.  */
183 | /* Tbl_H */ |
184 | .align 16 |
185 | .quad 0x0000000000000000, 0x3fcf5b75f92c80dd |
186 | .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1 |
187 | .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e |
188 | .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f |
189 | .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25 |
190 | .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353 |
191 | .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0 |
192 | .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617 |
193 | .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7 |
194 | .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd |
195 | .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89 |
196 | .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06 |
197 | .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053 |
198 | .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195 |
199 | .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec |
200 | .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4 |
/* 32 small-magnitude doubles, presumably the low-order correction for the
   matching Tbl_H entry.  Not referenced by _ZGVbN2v_atan_sse4 in this
   file.  */
201 | /* Tbl_L */ |
202 | .align 16 |
203 | .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd |
204 | .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458 |
205 | .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b |
206 | .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70 |
207 | .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb |
208 | .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c |
209 | .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4 |
210 | .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e |
211 | .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b |
212 | .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d |
213 | .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5 |
214 | .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f |
215 | .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3 |
216 | .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2 |
217 | .quad 0xbc929c86447928e7, 0xbc8957a7170df016 |
218 | .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b |
/* Bit pattern of Shifter plus 0x10 in the low mantissa bits, i.e. the value
   Shifter + |x| takes when the table index reaches 16.  */
219 | /* dIndexMed */ |
220 | .align 16 |
221 | .quad 0x4318000000000010, 0x4318000000000010 |
/* pi/2 rounded to double.  */
222 | /* Pi2 */ |
223 | .align 16 |
224 | .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18 |
/* Presumably the low-order part of pi/2 that does not fit in Pi2.  Not
   referenced by _ZGVbN2v_atan_sse4 in this file.  */
225 | /* Pi2_low */ |
226 | .align 16 |
227 | .quad 0x3c91a62633145c07, 0x3c91a62633145c07 |
/* Six polynomial coefficients for the "coeff" field, highest-order term
   first and each stored once per lane.  The last two are close to +1/5 and
   -1/3.  */
228 | /* coeff6 */ |
229 | .align 16 |
230 | .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97 |
231 | .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc |
232 | .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0 |
233 | .quad 0xbfc249248eef04da, 0xbfc249248eef04da |
234 | .quad 0x3fc999999998741e, 0x3fc999999998741e |
235 | .quad 0xbfd555555555554d, 0xbfd555555555554d |
236 | .align 16 |
237 | .type __svml_datan_data_internal_avx512, @object |
238 | .size __svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512 |
239 | .align 16 |
240 |
/* Separate 16-byte object placed after the table's .size.  It holds 0x78 in
   the low dword of each 64-bit lane and is used with pand to reduce the
   shifted index bits to a byte offset 0..120 in steps of 8.  */
241 | .FLT_11: |
242 | .long 0x00000078, 0x00000000, 0x00000078, 0x00000000 |
243 | .type .FLT_11, @object |
244 | .size .FLT_11, 16 |
245 | |