1 | /* Function atan vectorized with AVX2. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
23 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
24 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
25 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
26 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
27 | * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0. |
28 | * |
29 | */ |
30 | |
31 | /* Offsets for data table __svml_datan_data_internal_avx512 |
32 | */ |
/* Byte offsets of each field from the start of the data table.  Every
   broadcast constant occupies one 32-byte vector (4 x .quad); Tbl_H and
   Tbl_L hold 32 .quad entries each; coeff holds six 32-byte vectors.  */
#define AbsMask				0	/* bytes    0 ..   31 */
#define Shifter				32	/* bytes   32 ..   63 */
#define MaxThreshold			64	/* bytes   64 ..   95 */
#define MOne				96	/* bytes   96 ..  127 */
#define One				128	/* bytes  128 ..  159 */
#define LargeX				160	/* bytes  160 ..  191 */
#define Zero				192	/* bytes  192 ..  223 */
#define Tbl_H				224	/* bytes  224 ..  479 */
#define Tbl_L				480	/* bytes  480 ..  735 */
#define dIndexMed			736	/* bytes  736 ..  767 */
#define Pi2				768	/* bytes  768 ..  799 */
#define Pi2_low				800	/* bytes  800 ..  831 */
#define coeff				832	/* bytes  832 .. 1023 */
46 | |
47 | #include <sysdep.h> |
48 | |
49 | .section .text.avx2, "ax" , @progbits |
50 | ENTRY(_ZGVdN4v_atan_avx2) |
51 | lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi |
52 | vmovupd Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4 |
53 | vmovupd One+__svml_datan_data_internal_avx512(%rip), %ymm9 |
54 | |
55 | /* saturate X range */ |
56 | vmovupd LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6 |
57 | vandpd __svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7 |
58 | vaddpd %ymm4, %ymm7, %ymm2 |
59 | vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3 |
60 | vminpd %ymm7, %ymm6, %ymm10 |
61 | vsubpd %ymm4, %ymm2, %ymm5 |
62 | |
63 | /* |
64 | * table lookup sequence |
65 | * VPERMUTE not available |
66 | */ |
67 | vpsllq $3, %ymm2, %ymm13 |
68 | vsubpd %ymm5, %ymm7, %ymm8 |
69 | vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2 |
70 | vfmadd231pd %ymm7, %ymm5, %ymm9 |
71 | vpand .FLT_11(%rip), %ymm13, %ymm14 |
72 | vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11 |
73 | vblendvpd %ymm3, %ymm10, %ymm9, %ymm12 |
74 | vxorpd %ymm0, %ymm7, %ymm1 |
75 | |
76 | /* R+Rl = DiffX/Y */ |
77 | vdivpd %ymm12, %ymm11, %ymm0 |
78 | vextractf128 $1, %ymm14, %xmm4 |
79 | vmovd %xmm14, %eax |
80 | vmovd %xmm4, %ecx |
81 | movslq %eax, %rax |
82 | vpextrd $2, %xmm14, %edx |
83 | movslq %ecx, %rcx |
84 | vpextrd $2, %xmm4, %esi |
85 | movslq %edx, %rdx |
86 | movslq %esi, %rsi |
87 | vmovsd -128(%rax, %rdi), %xmm15 |
88 | vmovsd (%rdi, %rax), %xmm7 |
89 | vmovsd -128(%rcx, %rdi), %xmm5 |
90 | vmovsd (%rdi, %rcx), %xmm9 |
91 | vmovhpd -128(%rdx, %rdi), %xmm15, %xmm15 |
92 | vmovhpd (%rdi, %rdx), %xmm7, %xmm8 |
93 | vmovhpd -128(%rsi, %rdi), %xmm5, %xmm6 |
94 | vmovhpd (%rdi, %rsi), %xmm9, %xmm10 |
95 | |
96 | /* polynomial evaluation */ |
97 | vmulpd %ymm0, %ymm0, %ymm5 |
98 | vmulpd %ymm5, %ymm5, %ymm4 |
99 | vinsertf128 $1, %xmm6, %ymm15, %ymm11 |
100 | vinsertf128 $1, %xmm10, %ymm8, %ymm12 |
101 | vblendvpd %ymm2, %ymm12, %ymm11, %ymm13 |
102 | vmovupd coeff+__svml_datan_data_internal_avx512(%rip), %ymm8 |
103 | vmovupd coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2 |
104 | vmulpd %ymm5, %ymm0, %ymm6 |
105 | vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8 |
106 | vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2 |
107 | |
108 | /* set table value to Pi/2 for large X */ |
109 | vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7 |
110 | vmovupd coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3 |
111 | vfmadd213pd %ymm2, %ymm4, %ymm8 |
112 | vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5 |
113 | vfmadd213pd %ymm5, %ymm4, %ymm8 |
114 | vfmadd213pd %ymm0, %ymm6, %ymm8 |
115 | vaddpd %ymm8, %ymm7, %ymm0 |
116 | vxorpd %ymm1, %ymm0, %ymm0 |
117 | ret |
118 | |
119 | END(_ZGVdN4v_atan_avx2) |
120 | |
121 | .section .rodata, "a" |
122 | .align 32 |
123 | |
124 | .FLT_11: |
125 | .long 0x00000078, 0x00000000, 0x00000078, 0x00000000, 0x00000078, 0x00000000, 0x00000078, 0x00000000 |
126 | .type .FLT_11, @object |
127 | .size .FLT_11, 32 |
128 | .align 32 |
129 | |
#ifdef __svml_datan_data_internal_avx512_typedef
/* C-style sketch of the table layout.  It sits behind a macro that this
   file never defines, so it is skipped by the preprocessor and serves as
   layout documentation only.  Each [n][2] member is n 64-bit values
   written as pairs of VUINT32 words.  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 AbsMask[4][2];
	__declspec(align(32)) VUINT32 Shifter[4][2];
	__declspec(align(32)) VUINT32 MaxThreshold[4][2];
	__declspec(align(32)) VUINT32 MOne[4][2];
	__declspec(align(32)) VUINT32 One[4][2];
	__declspec(align(32)) VUINT32 LargeX[4][2];
	__declspec(align(32)) VUINT32 Zero[4][2];
	__declspec(align(32)) VUINT32 Tbl_H[32][2];
	__declspec(align(32)) VUINT32 Tbl_L[32][2];
	__declspec(align(32)) VUINT32 dIndexMed[4][2];
	__declspec(align(32)) VUINT32 Pi2[4][2];
	__declspec(align(32)) VUINT32 Pi2_low[4][2];
	__declspec(align(32)) VUINT32 coeff[6][4][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
	/* AbsMask: every bit except the sign bit.  */
	.rept	4
	.quad	0x7fffffffffffffff
	.endr

	/* Shifter: 1.5 * 2^50; one ulp at this magnitude is 0.25.  */
	.align	32
	.rept	4
	.quad	0x4318000000000000
	.endr

	/* MaxThreshold: 7.875; at or above it the kernel uses Pi2 and -1/x.  */
	.align	32
	.rept	4
	.quad	0x401f800000000000
	.endr

	/* MOne: -1.0  */
	.align	32
	.rept	4
	.quad	0xbff0000000000000
	.endr

	/* One: 1.0  */
	.align	32
	.rept	4
	.quad	0x3ff0000000000000
	.endr

	/* LargeX: 2^128, upper clamp applied to |x| before the division.  */
	.align	32
	.rept	4
	.quad	0x47f0000000000000
	.endr

	/* Zero: 0.0  */
	.align	32
	.rept	4
	.quad	0x0000000000000000
	.endr

	/* Tbl_H: 32 entries, high part of atan (k / 4) for k = 0..31
	   (entry 4 is pi/4).  Four entries per row.  */
	.align	32
	.quad	0x0000000000000000, 0x3fcf5b75f92c80dd, 0x3fddac670561bb4f, 0x3fe4978fa3269ee1	/*  0.. 3 */
	.quad	0x3fe921fb54442d18, 0x3fecac7c57846f9e, 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f	/*  4.. 7 */
	.quad	0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25, 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353	/*  8..11 */
	.quad	0x3ff3fc176b7a8560, 0x3ff45b54837351a0, 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617	/* 12..15 */
	.quad	0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7, 0x3ff5a25052114e60, 0x3ff5d013c41adabd	/* 16..19 */
	.quad	0x3ff5f97315254857, 0x3ff61f06c6a92b89, 0x3ff6414d44094c7c, 0x3ff660b02c736a06	/* 20..23 */
	.quad	0x3ff67d8863bc99bd, 0x3ff698213a9d5053, 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195	/* 24..27 */
	.quad	0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec, 0x3ff7030cf9403197, 0x3ff7145eac2088a4	/* 28..31 */

	/* Tbl_L: 32 entries, low-order companions of the Tbl_H entries.
	   Not referenced by the AVX2 kernel in this file.  */
	.align	32
	.quad	0x0000000000000000, 0x3c68ab6e3cf7afbd, 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458	/*  0.. 3 */
	.quad	0x3c81a62633145c07, 0x3c80dae13ad18a6b, 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70	/*  4.. 7 */
	.quad	0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb, 0x3c96254cb03bb199, 0xbc812c77e8a80f5c	/*  8..11 */
	.quad	0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4, 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e	/* 12..15 */
	.quad	0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b, 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d	/* 16..19 */
	.quad	0xbc831151a43b51ca, 0xbc8487d50bceb1a5, 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f	/* 20..23 */
	.quad	0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3, 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2	/* 24..27 */
	.quad	0xbc929c86447928e7, 0xbc8957a7170df016, 0xbc7cbe1896221608, 0xbc9fda5797b32a0b	/* 28..31 */

	/* dIndexMed: Shifter bit pattern + 16; (|x| + Shifter) >= this value
	   means the table index is >= 16.  */
	.align	32
	.rept	4
	.quad	0x4318000000000010
	.endr

	/* Pi2: pi/2, used as the table value for large |x|.  */
	.align	32
	.rept	4
	.quad	0x3ff921fb54442d18
	.endr

	/* Pi2_low: low-order companion of Pi2.  Not referenced by the AVX2
	   kernel in this file.  */
	.align	32
	.rept	4
	.quad	0x3c91a62633145c07
	.endr

	/* coeff: six polynomial coefficients, one 32-byte vector each,
	   stored highest order first (coeff+0 ... coeff+160).  */
	.align	32
	.rept	4
	.quad	0x3fb2e9b9f5c4fe97	/* coeff+0   */
	.endr
	.rept	4
	.quad	0xbfb74257c46790cc	/* coeff+32  */
	.endr
	.rept	4
	.quad	0x3fbc71bfeff916a0	/* coeff+64  */
	.endr
	.rept	4
	.quad	0xbfc249248eef04da	/* coeff+96  */
	.endr
	.rept	4
	.quad	0x3fc999999998741e	/* coeff+128 */
	.endr
	.rept	4
	.quad	0xbfd555555555554d	/* coeff+160, constant term, close to -1/3 */
	.endr
	.align	32
	.type	__svml_datan_data_internal_avx512, @object
	.size	__svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512
225 | |