1 | /* Function atan vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x) |
23 | * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x) |
24 | * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x) |
25 | * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x) |
26 | * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x |
27 | * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0. |
28 | * |
29 | */ |
30 | |
31 | /* Offsets for data table __svml_datan_data_internal_avx512
 *
 * Each value is a byte offset from the table label.  Every field starts on
 * a multiple of 64 bytes, i.e. one 512-bit row of eight 64-bit values.
 *   - Tbl_H spans 256 bytes (448..703) and is addressed in four 64-byte
 *     pieces as Tbl_H, Tbl_H+64, Tbl_H+128 and Tbl_H+192.
 *   - coeff_1 .. coeff_6 are six consecutive 64-byte rows.
 *   - AbsMask (offset 0) is not used by name in this file: the code loads
 *     that row through the bare table label.  Zero is defined but is not
 *     referenced by the function in this file.
32 | */ |
33 | #define AbsMask 0 |
34 | #define Shifter 64 |
35 | #define MaxThreshold 128 |
36 | #define MOne 192 |
37 | #define One 256 |
38 | #define LargeX 320 |
39 | #define Zero 384 |
40 | #define Tbl_H 448 |
41 | #define dIndexMed 704 |
42 | #define Pi2 768 |
43 | #define coeff_1 832 |
44 | #define coeff_2 896 |
45 | #define coeff_3 960 |
46 | #define coeff_4 1024 |
47 | #define coeff_5 1088 |
48 | #define coeff_6 1152 |
49 | |
50 | #include <sysdep.h> |
51 | |
52 | .section .text.evex512, "ax" , @progbits |
/*
 * _ZGVeN8v_atan_skx -- atan over a 512-bit vector of packed doubles.
 *
 * In:  %zmm0 = x
 * Out: %zmm0 = result
 * Writes %zmm0-%zmm15, %k1 and %k2.  The body contains no explicit stack
 * access and no call.
 *
 * Per lane, as implemented below:
 *   |x| <  MaxThreshold:  c = |x| rounded to a multiple of 1/4
 *                         R = (|x| - c) / (1 + c*|x|),  base = Tbl_H entry for c
 *   |x| >= MaxThreshold:  R = -1 / min(|x|, LargeX),    base = Pi2
 *   result = (base + R + R^3 * P(R^2)) with the sign bit of x XORed back in,
 *   P being the polynomial with coefficients coeff_1 .. coeff_6.
 *
 * NOTE(review): this is a 32-entry table scheme with breakpoints every 1/4;
 * the ALGORITHM DESCRIPTION comment at the top of the file lists five wider
 * intervals instead -- confirm which one is meant to be authoritative.
 */
53 | ENTRY(_ZGVeN8v_atan_skx) |
/* Constants: %zmm4 = Shifter, %zmm3 = MaxThreshold, %zmm9 = One. */
54 | vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4 |
55 | vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3 |
56 | vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9 |
57 | |
58 | /* saturate X range
 *   %zmm7 = LargeX, the upper clamp applied by the vminpd further down
 *   %zmm8 = |x|  (x ANDed with the first table row, AbsMask, at offset 0)
 */ |
59 | vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7 |
60 | vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8 |
61 | |
62 | /* R+Rl = DiffX/Y
 *   %zmm15 = 1.0 broadcast from .FLT_10; it is consumed later as the "1"
 *            in e = 1 - rcp*mant(Y) of the reciprocal refinement
 *   %zmm2  = |x| + Shifter: rounds |x| to a multiple of 1/4 and leaves the
 *            rounded value, in units of 1/4, in the low mantissa bits
 *            (used below as the vpermt2pd index)
 *   %zmm1  = x ^ |x| = sign bit of x, XORed back into the result at the end
 *   %k2    = lanes where |x| >= MaxThreshold (predicate 29 = GE_OQ, so the
 *            mask bit is clear for NaN lanes)
 */ |
63 | vbroadcastsd .FLT_10(%rip), %zmm15 |
64 | vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2 |
65 | vxorpd %zmm0, %zmm8, %zmm1 |
66 | vcmppd $29, {sae}, %zmm3, %zmm8, %k2 |
67 | |
68 | /* round to 2 bits after binary point
 *   %zmm6 = DiffX = |x| - round(4*|x|)/4.  vreducepd immediate 40 = 0x28:
 *           bits 7:4 = 2 fraction bits kept, bit 3 suppresses the precision
 *           exception, bits 1:0 = 0 select round-to-nearest-even.
 *   %zmm5 = (|x| + Shifter) - Shifter = c, |x| rounded to a multiple of 1/4
 */ |
69 | vreducepd $40, {sae}, %zmm8, %zmm6 |
70 | vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5 |
71 | |
72 | /* |
73 | * if|X|>=MaxThreshold, set DiffX=-1 |
74 | * VMSUB(D, DiffX, LargeMask, Zero, One); |
 *
 *   %zmm10 = MOne (-1.0) in %k2 lanes, DiffX (%zmm6) elsewhere
 *   %zmm9  = One + c*|x|, the denominator Y for the table path
 *   %zmm5  is then reloaded with dIndexMed for the index compare below
75 | */ |
76 | vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2} |
77 | vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9 |
78 | vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5 |
79 | |
80 | /* table lookup sequence
 *   %zmm6  = Tbl_H entries 0..7 (first source of the first vpermt2pd)
 *   %zmm14 = mantissa of the numerator, normalised to [1,2) with its sign
 *   %zmm11 = exponent of the numerator, as a double
 *   %zmm10 is dead after the two reads above and is reused for coeff_5
 */ |
81 | vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6 |
82 | vgetmantpd $0, {sae}, %zmm10, %zmm14 |
83 | vgetexppd {sae}, %zmm10, %zmm11 |
84 | vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10 |
85 | |
86 | /* |
87 | * if|X|>=MaxThreshold, set Y=X |
88 | * VMADD(D, Y, LargeMask, X, Zero); |
 *
 *   %zmm9{%k2} = min(LargeX, |x|): in large lanes the denominator is |x|
 *                clamped to LargeX; other lanes keep One + c*|x|
 *   %k1    = lanes where |x| + Shifter >= dIndexMed, i.e. c >= 4.0
 *            (table index 16 and above)
 *   %zmm7  = Tbl_H entries 16..23 (first source of the second vpermt2pd)
 *   %zmm3 / %zmm12 = mantissa / exponent of the denominator Y
 *   %zmm6  = lookup among Tbl_H entries 0..15, %zmm7 = lookup among entries
 *            16..31; both are indexed by the low bits of %zmm2
 *   %zmm4  = exp(numerator) - exp(Y), applied by vscalefpd after the divide
 *   %zmm13 = vrcp14pd estimate of 1/mant(Y)
 *   %zmm2  = table value: %zmm7 in %k1 lanes, %zmm6 elsewhere
 *   %zmm8, %zmm9, %zmm12, %zmm11 are reloaded with coeff_1, coeff_3, coeff_4
 *   and coeff_6 once their previous contents are no longer needed.
89 | */ |
90 | vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2} |
91 | vcmppd $29, {sae}, %zmm5, %zmm2, %k1 |
92 | vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7 |
93 | vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8 |
94 | vgetmantpd $0, {sae}, %zmm9, %zmm3 |
95 | vgetexppd {sae}, %zmm9, %zmm12 |
96 | vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9 |
97 | vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6 |
98 | vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4 |
99 | vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7 |
100 | vrcp14pd %zmm3, %zmm13 |
101 | vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12 |
102 | vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11 |
103 | vblendmpd %zmm7, %zmm6, %zmm2{%k1} |
/*
 * Mantissa quotient, refined from the reciprocal estimate r = %zmm13
 * (n = numerator mantissa %zmm14, d = denominator mantissa %zmm3):
 *   q0 = n * r                      -> %zmm0
 *   e  = 1 - r * d                  -> %zmm15
 *   t  = n - q0 * d                 -> %zmm3
 *   e  = e + e*e                    -> %zmm15
 *   r' = r + r * e                  -> %zmm15
 *   q  = q0 + r' * t                -> %zmm3
 *   R  = q * 2^(%zmm4)              -> %zmm0  (vscalefpd)
 */
104 | vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0 |
105 | vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15 |
106 | vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3 |
107 | vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15 |
108 | vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15 |
109 | vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3 |
110 | vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0 |
111 | |
112 | /* set table value to Pi/2 for large X
 *   %zmm3 = base: Pi2 in %k2 lanes, the looked-up table value (%zmm2)
 *           elsewhere; %zmm2 is then reused for coeff_2
 */ |
113 | vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2} |
114 | vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2 |
115 | |
116 | /* polynomial evaluation
 *   R2 = R*R (%zmm14), R4 = R2*R2 (%zmm13), R3 = R*R2 (%zmm15)
 *   P  = ((coeff_1*R2 + coeff_2)*R4 + (coeff_3*R2 + coeff_4))*R4
 *        + (coeff_5*R2 + coeff_6)
 *   %zmm2 = R + R3*P
 *   %zmm0 = (%zmm2 + base) ^ sign bit of x
 */ |
117 | vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14 |
118 | vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13 |
119 | vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15 |
120 | vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2 |
121 | vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12 |
122 | vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14 |
123 | vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2 |
124 | vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2 |
125 | vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2 |
126 | vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0 |
127 | vxorpd %zmm1, %zmm0, %zmm0 |
128 | ret |
129 | |
130 | END(_ZGVeN8v_atan_skx) |
131 | |
132 | .section .rodata, "a" |
133 | .align 64 |
134 | |
/*
 * Constant table for _ZGVeN8v_atan_skx.  Apart from Tbl_H, every field is
 * one 64-byte row holding the same 64-bit pattern eight times, so a row can
 * be loaded straight into a %zmm register.
 *
 * The typedef below is a C-style description of the layout.  It is only
 * present when __svml_datan_data_internal_avx512_typedef is defined; each
 * 64-bit value is described there as a pair of VUINT32.
 */
135 | #ifdef __svml_datan_data_internal_avx512_typedef |
136 | typedef unsigned int VUINT32; |
137 | typedef struct { |
138 | __declspec(align(64)) VUINT32 AbsMask[8][2]; |
139 | __declspec(align(64)) VUINT32 Shifter[8][2]; |
140 | __declspec(align(64)) VUINT32 MaxThreshold[8][2]; |
141 | __declspec(align(64)) VUINT32 MOne[8][2]; |
142 | __declspec(align(64)) VUINT32 One[8][2]; |
143 | __declspec(align(64)) VUINT32 LargeX[8][2]; |
144 | __declspec(align(64)) VUINT32 Zero[8][2]; |
145 | __declspec(align(64)) VUINT32 Tbl_H[32][2]; |
146 | __declspec(align(64)) VUINT32 dIndexMed[8][2]; |
147 | __declspec(align(64)) VUINT32 Pi2[8][2]; |
148 | __declspec(align(64)) VUINT32 coeff[6][8][2]; |
149 | } __svml_datan_data_internal_avx512; |
150 | #endif |
151 | __svml_datan_data_internal_avx512: |
152 | /* AbsMask: every bit except the sign bit; ANDed with x to form |x| */ |
153 | .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff |
154 | /* Shifter: 1.5 * 2^50.  At this magnitude adjacent doubles are 1/4 apart,
 * so |x| + Shifter rounds |x| to a multiple of 1/4. */ |
155 | .align 64 |
156 | .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000 |
157 | /* MaxThreshold: 7.875.  Lanes with |x| >= this value take the large-argument
 * path (numerator -1, base Pi2). */ |
158 | .align 64 |
159 | .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000 |
160 | /* MOne: -1.0, the numerator substituted in large-argument lanes */ |
161 | .align 64 |
162 | .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 |
163 | /* One: 1.0, the addend of the denominator 1 + c*|x| */ |
164 | .align 64 |
165 | .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 |
166 | /* LargeX: 2^128, the upper clamp on |x| when |x| itself is the denominator */ |
167 | .align 64 |
168 | .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000 |
169 | /* Zero: 0.0.  Not referenced by the function in this file; it still occupies
 * its 64-byte slot, so the offsets that follow depend on it. */ |
170 | .align 64 |
171 | .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 |
172 | /* Tbl_H: 32 doubles, two per line; entry i holds atan(i/4) for i = 0..31
 * (entry 0 is 0.0, entry 4 is pi/4).  Read in four 64-byte pieces by the
 * two vpermt2pd lookups. */ |
173 | .align 64 |
174 | .quad 0x0000000000000000, 0x3fcf5b75f92c80dd |
175 | .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1 |
176 | .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e |
177 | .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f |
178 | .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25 |
179 | .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353 |
180 | .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0 |
181 | .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617 |
182 | .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7 |
183 | .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd |
184 | .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89 |
185 | .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06 |
186 | .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053 |
187 | .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195 |
188 | .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec |
189 | .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4 |
190 | /* dIndexMed: the Shifter bit pattern plus 0x10, i.e. Shifter + 4.0.  Lanes
 * with |x| + Shifter >= this value use the upper 16 entries of Tbl_H. */ |
191 | .align 64 |
192 | .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010 |
193 | /* Pi2: pi/2, the base value used in large-argument lanes */ |
194 | .align 64 |
195 | .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18 |
196 | /* coeff: six polynomial coefficient rows, in offset order coeff_1 .. coeff_6.
 * coeff_1 multiplies the highest power in the evaluation; coeff_6 is the
 * constant term of P and is approximately -1/3.  The rows are contiguous:
 * only the first is preceded by .align, each being exactly 64 bytes. */ |
197 | .align 64 |
198 | .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97 |
199 | .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc |
200 | .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0 |
201 | .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da |
202 | .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e |
203 | .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d |
204 | .align 64 |
205 | .type __svml_datan_data_internal_avx512, @object |
206 | .size __svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512 |
207 | .align 8 |
208 | |
/*
 * .FLT_10: one double stored as two 32-bit halves, low word first
 * (0x3ff00000:00000000 = 1.0).  The function broadcasts it to every lane
 * of %zmm15 with vbroadcastsd.
 */
209 | .FLT_10: |
210 | .long 0x00000000, 0x3ff00000 |
211 | .type .FLT_10, @object |
212 | .size .FLT_10, 8 |
213 | |