1 | /* Function cbrt vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52 |
23 | * Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5], |
24 | * where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision |
25 | * cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5] |
26 | * (T stores the high 53 bits, D stores the low order bits) |
27 | * Result=2^k*T+(2^k*T*r)*P+2^k*D |
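28 | * where P=p1+p2*r+..+p10*r^9 (the code below evaluates poly_coeff1..poly_coeff10) |
29 | * Note: the code below indexes its table with the rounded reciprocal, not with b1..b5. |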
30 | */ |
31 | |
32 | /* Byte offsets of the fields of data table __svml_dcbrt_data_internal_avx512. |
     Every field starts on a 64-byte boundary and holds 8 quadwords, except
     cbrt_tbl_H, which holds 16 quadwords (128 bytes).  The macros are pasted
     textually into memory operands of the form OFFSET+symbol(%rip).
33 | */ |
34 | #define etbl_H 0 /* high parts of cbrt(2^j), selected by exponent%3 */ |
35 | #define etbl_L 64 /* low-order parts paired with etbl_H */ |
36 | #define cbrt_tbl_H 128 /* 16-slot table selected by the rounded reciprocal */ |
37 | #define BiasL 256 /* bias added to the exponent */ |
38 | #define SZero 320 /* sign-bit mask */ |
39 | #define OneThird 384 /* multiplier used for exponent/3 */ |
40 | #define Bias3 448 /* bias subtracted after the multiply by OneThird */ |
41 | #define Three 512 /* 3.0, used for exponent%3 */ |
42 | #define One 576 /* 1.0, used for the reduced argument */ |
43 | #define poly_coeff10 640 /* polynomial coefficients, highest order first */ |
44 | #define poly_coeff9 704 |
45 | #define poly_coeff8 768 |
46 | #define poly_coeff7 832 |
47 | #define poly_coeff6 896 |
48 | #define poly_coeff5 960 |
49 | #define poly_coeff4 1024 |
50 | #define poly_coeff3 1088 |
51 | #define poly_coeff2 1152 |
52 | #define poly_coeff1 1216 /* constant term of the polynomial */ |
53 | |
54 | #include <sysdep.h> |
55 | |
/*
 * _ZGVeN8v_cbrt_skx: cube root of the packed doubles in %zmm0; the result
 * is returned in %zmm0.
 *
 * Straight-line code: no branches, calls or stack accesses.  Besides %zmm0
 * it overwrites %zmm1-%zmm15 and reads its constants and tables from
 * __svml_dcbrt_data_internal_avx512.
 *
 * Outline (names as used in the comments below, cN = poly_coeffN):
 *   Mantissa, exponent = vgetmantpd / vgetexppd of x
 *   k = exponent/3 rounded by vrndscalepd, j = exponent - 3*k
 *   DblRcp = vrcp14pd (Mantissa), rounded by vrndscalepd
 *   R = DblRcp*Mantissa - 1
 *   Th = cbrt_tbl_H entry selected by DblRcp, Sh/Sl = etbl_H/etbl_L[j]
 *   Poly = c1 + c2*R + ... + c10*R^9
 *   result = (Th scaled by 2^k) * (Sh + Sl + Sh*R*Poly), OR-ed with the
 *   sign bit of x
 */
56 | .section .text.evex512, "ax" , @progbits |
57 | ENTRY(_ZGVeN8v_cbrt_skx) |
58 | vgetmantpd $0, {sae}, %zmm0, %zmm14 /* zmm14 = Mantissa of x */ |
59 | |
60 | /* GetExp(x): zmm7 = exponent of x */ |
61 | vgetexppd {sae}, %zmm0, %zmm7 |
62 | vmovups BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8 |
63 | |
64 | /* exponent/3: constants (zmm9 = OneThird, zmm10 = Bias3) */ |
65 | vmovups OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9 |
66 | vmovups Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10 |
67 | |
68 | /* Reduced argument: R = DblRcp*Mantissa - 1 (zmm2 = One here, becomes R below) */ |
69 | vmovups One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2 |
70 | |
71 | /* exponent%3 (to be used as index): zmm11 = Three */ |
72 | vmovups Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11 |
73 | |
74 | /* DblRcp ~ 1/Mantissa */ |
75 | vrcp14pd %zmm14, %zmm13 |
76 | vaddpd {rn-sae}, %zmm8, %zmm7, %zmm12 /* zmm12 = exponent + BiasL */ |
77 | vandpd SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6 /* zmm6 = sign bit of x */ |
78 | |
79 | /* round DblRcp to 4 fractional bits, i.e. a multiple of 2^-4 (imm8 = 72 = 0x48: RN mode, no Precision exception) */ |
80 | vrndscalepd $72, {sae}, %zmm13, %zmm15 |
81 | vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10 /* zmm10 = OneThird*(exponent + BiasL) - Bias3 */ |
82 | |
83 | /* polynomial: coefficient loads.  zmm0 (x) and zmm7 (exponent) are reused from here on; the sign of x is kept in zmm6 */ |
84 | vmovups poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0 |
85 | vmovups poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7 |
86 | vmovups poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9 |
87 | vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2 /* zmm2 = R = DblRcp*Mantissa - One */ |
88 | vrndscalepd $9, {sae}, %zmm10, %zmm5 /* zmm5 = k: imm8 = 9 rounds down to an integer, no Precision exception */ |
89 | |
90 | /* Table lookup: zmm10 = first 8 entries of cbrt_tbl_H */ |
91 | vmovups cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10 |
92 | vmovups poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8 |
93 | vmovups poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13 |
94 | vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9 /* zmm9 = c7 + c8*R */ |
95 | vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12 /* zmm12 = (exponent + BiasL) - Three*k = BiasL + j */ |
96 | vmovups poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11 |
97 | vmovups poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14 |
98 | |
99 | /* Prepare table index: zmm1 = bit pattern of the rounded DblRcp shifted right by 49 */ |
100 | vpsrlq $49, %zmm15, %zmm1 |
101 | |
102 | /* Table lookup: cbrt(2^(exponent%3)) as Sh (zmm4, from etbl_H at offset 0) and Sl (zmm3, from etbl_L); then Th (zmm10) from the two halves of cbrt_tbl_H */ |
103 | vpermpd __svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4 |
104 | vpermpd etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3 |
105 | vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10 |
106 | vmovups poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1 |
107 | vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11 /* zmm11 = c5 + c6*R */ |
108 | vmovups poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12 |
109 | vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15 /* zmm15 = scaled_Th = Th * 2^k */ |
110 | vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1 /* zmm1 = c9 + c10*R */ |
111 | vmovups poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5 |
112 | vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14 /* zmm14 = c1 + c2*R */ |
113 | vmulpd {rn-sae}, %zmm2, %zmm2, %zmm0 /* zmm0 = R^2 */ |
114 | vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13 /* zmm13 = c3 + c4*R */ |
115 | |
116 | /* Sh*R (zmm2); the four FMAs that follow combine the coefficient pairs by Horner steps in R^2, leaving Poly in zmm1 */ |
117 | vmulpd {rn-sae}, %zmm2, %zmm4, %zmm2 |
118 | vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1 /* zmm1 = zmm1*R^2 + (c7 + c8*R) */ |
119 | vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1 /* zmm1 = zmm1*R^2 + (c5 + c6*R) */ |
120 | vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1 /* zmm1 = zmm1*R^2 + (c3 + c4*R) */ |
121 | vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1 /* zmm1 = Poly = zmm1*R^2 + (c1 + c2*R) */ |
122 | |
123 | /* Sl + (Sh*R)*Poly */ |
124 | vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2 |
125 | |
126 | /* |
127 | * branch-free |
128 | * scaled_Th*(Sh+Sl+Sh*R*Poly) |
129 | */ |
130 | vaddpd {rn-sae}, %zmm4, %zmm2, %zmm3 /* zmm3 = Sh + (Sl + Sh*R*Poly) */ |
131 | vmulpd {rn-sae}, %zmm15, %zmm3, %zmm4 /* zmm4 = scaled_Th * zmm3 */ |
132 | vorpd %zmm6, %zmm4, %zmm0 /* result = zmm4 with the saved sign bit of x OR-ed in */ |
133 | ret |
134 | |
135 | END(_ZGVeN8v_cbrt_skx) |
136 | |
/*
 * Read-only constants and lookup tables for _ZGVeN8v_cbrt_skx.  The
 * position of every field must agree with the offset macros at the top of
 * the file (etbl_H ... poly_coeff1); each field is aligned to 64 bytes.
 */
137 | .section .rodata, "a" |
138 | .align 64 |
139 | |
140 | #ifdef __svml_dcbrt_data_internal_avx512_typedef /* C-style description of the layout; this macro is not defined anywhere in the visible source */ |
141 | typedef unsigned int VUINT32; |
142 | typedef struct { |
143 | __declspec(align(64)) VUINT32 etbl_H[8][2]; |
144 | __declspec(align(64)) VUINT32 etbl_L[8][2]; |
145 | __declspec(align(64)) VUINT32 cbrt_tbl_H[16][2]; |
146 | __declspec(align(64)) VUINT32 BiasL[8][2]; |
147 | __declspec(align(64)) VUINT32 SZero[8][2]; |
148 | __declspec(align(64)) VUINT32 OneThird[8][2]; |
149 | __declspec(align(64)) VUINT32 Bias3[8][2]; |
150 | __declspec(align(64)) VUINT32 Three[8][2]; |
151 | __declspec(align(64)) VUINT32 One[8][2]; |
152 | __declspec(align(64)) VUINT32 poly_coeff10[8][2]; |
153 | __declspec(align(64)) VUINT32 poly_coeff9[8][2]; |
154 | __declspec(align(64)) VUINT32 poly_coeff8[8][2]; |
155 | __declspec(align(64)) VUINT32 poly_coeff7[8][2]; |
156 | __declspec(align(64)) VUINT32 poly_coeff6[8][2]; |
157 | __declspec(align(64)) VUINT32 poly_coeff5[8][2]; |
158 | __declspec(align(64)) VUINT32 poly_coeff4[8][2]; |
159 | __declspec(align(64)) VUINT32 poly_coeff3[8][2]; |
160 | __declspec(align(64)) VUINT32 poly_coeff2[8][2]; |
161 | __declspec(align(64)) VUINT32 poly_coeff1[8][2]; |
162 | } __svml_dcbrt_data_internal_avx512; |
163 | #endif |
164 | __svml_dcbrt_data_internal_avx512: |
165 | /* etbl_H: high parts of cbrt(2^j); entries 0-2 are 1, 2^(1/3), 2^(2/3), entries 4-6 are the same values negated, entries 3 and 7 are zero */ |
166 | .quad 0x3ff0000000000000 |
167 | .quad 0x3ff428a2f98d728b |
168 | .quad 0x3ff965fea53d6e3d |
169 | .quad 0x0000000000000000 |
170 | .quad 0xbff0000000000000 |
171 | .quad 0xbff428a2f98d728b |
172 | .quad 0xbff965fea53d6e3d |
173 | .quad 0x0000000000000000 |
174 | /* etbl_L: low-order parts paired entry by entry with etbl_H (signs flipped in entries 5-6) */ |
175 | .align 64 |
176 | .quad 0x0000000000000000 |
177 | .quad 0xbc7ddc22548ea41e |
178 | .quad 0xbc9f53e999952f09 |
179 | .quad 0x0000000000000000 |
180 | .quad 0x0000000000000000 |
181 | .quad 0x3c7ddc22548ea41e |
182 | .quad 0x3c9f53e999952f09 |
183 | .quad 0x0000000000000000 |
184 | /* cbrt_tbl_H: 16 slots, 9 populated and 7 zero; entry 0 is 2^(1/3), entry 8 is 1.0 (the values look like cbrt(16/(8+i)) for entry i; confirm before relying on it) */ |
185 | .align 64 |
186 | .quad 0x3ff428a2f98d728b |
187 | .quad 0x3ff361f35ca116ff |
188 | .quad 0x3ff2b6b5edf6b54a |
189 | .quad 0x3ff220e6dd675180 |
190 | .quad 0x3ff19c3b38e975a8 |
191 | .quad 0x3ff12589c21fb842 |
192 | .quad 0x3ff0ba6ee5f9aad4 |
193 | .quad 0x3ff059123d3a9848 |
194 | .quad 0x3ff0000000000000 |
195 | .quad 0x0000000000000000 |
196 | .quad 0x0000000000000000 |
197 | .quad 0x0000000000000000 |
198 | .quad 0x0000000000000000 |
199 | .quad 0x0000000000000000 |
200 | .quad 0x0000000000000000 |
201 | .quad 0x0000000000000000 |
202 | /* BiasL: 1.5*2^52 in every lane */ |
203 | .align 64 |
204 | .quad 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000 |
205 | /* SZero: sign-bit mask in every lane */ |
206 | .align 64 |
207 | .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 |
208 | /* OneThird: approximately 1/3 */ |
209 | .align 64 |
210 | .quad 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556 |
211 | /* Bias3: 2^51, i.e. BiasL/3 */ |
212 | .align 64 |
213 | .quad 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000 |
214 | /* Three: 3.0 */ |
215 | .align 64 |
216 | .quad 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000 |
217 | /* One: 1.0 */ |
218 | .align 64 |
219 | .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 |
220 | /* poly_coeff10: coefficient of R^9 in Poly (the coefficients below alternate in sign) */ |
221 | .align 64 |
222 | .quad 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62 |
223 | /* poly_coeff9 */ |
224 | .align 64 |
225 | .quad 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875 |
226 | /* poly_coeff8 */ |
227 | .align 64 |
228 | .quad 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f |
229 | /* poly_coeff7 */ |
230 | .align 64 |
231 | .quad 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914 |
232 | /* poly_coeff6 */ |
233 | .align 64 |
234 | .quad 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e |
235 | /* poly_coeff5 */ |
236 | .align 64 |
237 | .quad 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569 |
238 | /* poly_coeff4 */ |
239 | .align 64 |
240 | .quad 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e |
241 | /* poly_coeff3 */ |
242 | .align 64 |
243 | .quad 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31 |
244 | /* poly_coeff2: approximately -1/9 */ |
245 | .align 64 |
246 | .quad 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741 |
247 | /* poly_coeff1: constant term of Poly, approximately 1/3 */ |
248 | .align 64 |
249 | .quad 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557 |
250 | .align 64 |
251 | .type __svml_dcbrt_data_internal_avx512, @object |
252 | .size __svml_dcbrt_data_internal_avx512, .-__svml_dcbrt_data_internal_avx512 |
253 | |