1/* Function cbrt vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52, with j in {0,1,2}
23 * Let r=(x*2^{-3k-j})*rcp - 1,
24 * where rcp is the vrcp14pd estimate of 1/(x*2^{-3k-j}) rounded to a multiple of 2^{-4}
25 * cbrt(1/rcp) is read from cbrt_tbl_H as Th; cbrt(2^j) is read from etbl_H/etbl_L as Sh+Sl
26 * (Sh stores the high 53 bits, Sl stores the low order bits)
27 * Result=2^k*Th*(Sh+Sl+(Sh*r)*P)
28 * where P=p1+p2*r+..+p10*r^9
29 *
30 */
31
32/* Offsets for data table __svml_dcbrt_data_internal_avx512.
33 Each value is a byte offset from the table label. Every field occupies 64 bytes (eight .quad entries) except cbrt_tbl_H, which occupies 128 bytes (sixteen .quad entries). */
34#define etbl_H 0 /* high parts of the exponent%3 table; the code reads it through the bare table label */
35#define etbl_L 64 /* low-order parts matching etbl_H */
36#define cbrt_tbl_H 128 /* 16-slot table indexed by the rounded reciprocal */
37#define BiasL 256 /* added to the exponent before the divide-by-3 step */
38#define SZero 320 /* sign-bit mask */
39#define OneThird 384
40#define Bias3 448 /* subtracted after multiplying the biased exponent by OneThird */
41#define Three 512
42#define One 576
43#define poly_coeff10 640 /* polynomial coefficients, highest order first */
44#define poly_coeff9 704
45#define poly_coeff8 768
46#define poly_coeff7 832
47#define poly_coeff6 896
48#define poly_coeff5 960
49#define poly_coeff4 1024
50#define poly_coeff3 1088
51#define poly_coeff2 1152
52#define poly_coeff1 1216
53
54#include <sysdep.h>
55
56 .section .text.evex512, "ax", @progbits
57ENTRY(_ZGVeN8v_cbrt_skx) /* Packed-double cbrt: reads x from zmm0 and returns the result in zmm0.
   Straight-line code: no branches, no calls, no stack accesses.
   Writes zmm0-zmm15; several registers are reused for unrelated values, so
   instruction order matters.  Constants come from
   __svml_dcbrt_data_internal_avx512 via RIP-relative loads.  */
58 vgetmantpd $0, {sae}, %zmm0, %zmm14 /* zmm14 = Mantissa of x; imm8 0 selects the [1,2) normalization interval */
59
60 /* GetExp(x): zmm7 = exponent of x, held as a double */
61 vgetexppd {sae}, %zmm0, %zmm7
62 vmovups BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8 /* zmm8 = BiasL */
63
64 /* exponent/3: constants for k = floor(exponent/3) */
65 vmovups OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9 /* zmm9 = OneThird */
66 vmovups Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10 /* zmm10 = Bias3 */
67
68 /* Reduced argument: R = DblRcp*Mantissa - 1 */
69 vmovups One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2 /* zmm2 = One, becomes R below */
70
71 /* exponent%3 (to be used as index) */
72 vmovups Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11 /* zmm11 = Three */
73
74 /* DblRcp ~ 1/Mantissa */
75 vrcp14pd %zmm14, %zmm13 /* zmm13 = DblRcp */
76 vaddpd {rn-sae}, %zmm8, %zmm7, %zmm12 /* zmm12 = exponent + BiasL */
77 vandpd SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6 /* zmm6 = sign bit of x, OR-ed into the result at the end */
78
79 /* round DblRcp to 3 fractional bits (RN mode, no Precision exception): imm8 72 = 0x48, i.e. imm8[7:4] = 4 gives 2^-4 granularity, which leaves 3 mantissa bits for values in [0.5,1); imm8[3] = 1 suppresses the precision exception; imm8[1:0] = 0 is round-to-nearest */
80 vrndscalepd $72, {sae}, %zmm13, %zmm15 /* zmm15 = rounded DblRcp */
81 vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10 /* zmm10 = OneThird*(exponent + BiasL) - Bias3 */
82
83 /* polynomial: coefficients are loaded into registers whose earlier contents are no longer needed */
84 vmovups poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0 /* zmm0 = c10; x itself is not read again */
85 vmovups poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7 /* zmm7 = c8 */
86 vmovups poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9 /* zmm9 = c7 */
87 vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2 /* zmm2 = R = Mantissa*DblRcp - One */
88 vrndscalepd $9, {sae}, %zmm10, %zmm5 /* zmm5 = k: zmm10 rounded to an integer toward -inf (imm8 9 = 0x09), precision exception suppressed */
89
90 /* Table lookup */
91 vmovups cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10 /* zmm10 = first 8 entries of cbrt_tbl_H */
92 vmovups poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8 /* zmm8 = c6 */
93 vmovups poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13 /* zmm13 = c3 */
94 vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9 /* zmm9 = c8*R + c7 */
95 vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12 /* zmm12 = (exponent + BiasL) - Three*k; used below as the permute index */
96 vmovups poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11 /* zmm11 = c5 */
97 vmovups poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14 /* zmm14 = c1 */
98
99 /* Prepare table index: shift the bit pattern of the rounded DblRcp right by 49 */
100 vpsrlq $49, %zmm15, %zmm1
101
102 /* Table lookup: cbrt(2^(exponent%3)) as a high part Sh (etbl_H) and a low part Sl (etbl_L) */
103 vpermpd __svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4 /* zmm4 = Sh */
104 vpermpd etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3 /* zmm3 = Sl */
105 vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10 /* zmm10 = Th, selected by zmm1 from the 16 cbrt_tbl_H slots (zmm10 holds the first 8, memory the last 8) */
106 vmovups poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1 /* zmm1 = c9; the index is no longer needed */
107 vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11 /* zmm11 = c6*R + c5 */
108 vmovups poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12 /* zmm12 = c2 */
109 vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15 /* zmm15 = scaled_Th = Th * 2^k */
110 vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1 /* zmm1 = c10*R + c9 */
111 vmovups poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5 /* zmm5 = c4 */
112 vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14 /* zmm14 = c2*R + c1 */
113 vmulpd {rn-sae}, %zmm2, %zmm2, %zmm0 /* zmm0 = R*R */
114 vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13 /* zmm13 = c4*R + c3 */
115
116 /* Sh*R */
117 vmulpd {rn-sae}, %zmm2, %zmm4, %zmm2 /* zmm2 = Sh*R; R itself is no longer needed */
118 vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1 /* zmm1 = zmm1*R^2 + (c8*R + c7) */
119 vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1 /* zmm1 = zmm1*R^2 + (c6*R + c5) */
120 vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1 /* zmm1 = zmm1*R^2 + (c4*R + c3) */
121 vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1 /* zmm1 = Poly = zmm1*R^2 + (c2*R + c1) */
122
123 /* Sl + (Sh*R)*Poly */
124 vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2
125
126 /*
127 * branch-free
128 * scaled_Th*(Sh+Sl+Sh*R*Poly)
129 */
130 vaddpd {rn-sae}, %zmm4, %zmm2, %zmm3 /* zmm3 = Sh + (Sl + Sh*R*Poly) */
131 vmulpd {rn-sae}, %zmm15, %zmm3, %zmm4 /* zmm4 = scaled_Th * zmm3 */
132 vorpd %zmm6, %zmm4, %zmm0 /* zmm0 = result with the sign bit of x OR-ed in */
133 ret
134
135END(_ZGVeN8v_cbrt_skx)
136
137 .section .rodata, "a"
138 .align 64
139 /* The typedef below only sketches the table layout; it is compiled out unless its guard macro is defined, and no definition is visible in this file. */
140#ifdef __svml_dcbrt_data_internal_avx512_typedef
141typedef unsigned int VUINT32;
142typedef struct {
143 __declspec(align(64)) VUINT32 etbl_H[8][2];
144 __declspec(align(64)) VUINT32 etbl_L[8][2];
145 __declspec(align(64)) VUINT32 cbrt_tbl_H[16][2];
146 __declspec(align(64)) VUINT32 BiasL[8][2];
147 __declspec(align(64)) VUINT32 SZero[8][2];
148 __declspec(align(64)) VUINT32 OneThird[8][2];
149 __declspec(align(64)) VUINT32 Bias3[8][2];
150 __declspec(align(64)) VUINT32 Three[8][2];
151 __declspec(align(64)) VUINT32 One[8][2];
152 __declspec(align(64)) VUINT32 poly_coeff10[8][2];
153 __declspec(align(64)) VUINT32 poly_coeff9[8][2];
154 __declspec(align(64)) VUINT32 poly_coeff8[8][2];
155 __declspec(align(64)) VUINT32 poly_coeff7[8][2];
156 __declspec(align(64)) VUINT32 poly_coeff6[8][2];
157 __declspec(align(64)) VUINT32 poly_coeff5[8][2];
158 __declspec(align(64)) VUINT32 poly_coeff4[8][2];
159 __declspec(align(64)) VUINT32 poly_coeff3[8][2];
160 __declspec(align(64)) VUINT32 poly_coeff2[8][2];
161 __declspec(align(64)) VUINT32 poly_coeff1[8][2];
162} __svml_dcbrt_data_internal_avx512;
163#endif
164__svml_dcbrt_data_internal_avx512:
165 /* etbl_H: high parts; entries 0..2 are 1.0, ~cbrt(2), ~cbrt(4); entries 4..6 are their negations; entries 3 and 7 are zero */
166 .quad 0x3ff0000000000000
167 .quad 0x3ff428a2f98d728b
168 .quad 0x3ff965fea53d6e3d
169 .quad 0x0000000000000000
170 .quad 0xbff0000000000000
171 .quad 0xbff428a2f98d728b
172 .quad 0xbff965fea53d6e3d
173 .quad 0x0000000000000000
174 /* etbl_L: low-order parts paired entry-by-entry with etbl_H */
175 .align 64
176 .quad 0x0000000000000000
177 .quad 0xbc7ddc22548ea41e
178 .quad 0xbc9f53e999952f09
179 .quad 0x0000000000000000
180 .quad 0x0000000000000000
181 .quad 0x3c7ddc22548ea41e
182 .quad 0x3c9f53e999952f09
183 .quad 0x0000000000000000
184 /* cbrt_tbl_H: entry i is ~cbrt(16/(8+i)) for i = 0..8 (entry 8 is exactly 1.0); entries 9..15 are zero */
185 .align 64
186 .quad 0x3ff428a2f98d728b
187 .quad 0x3ff361f35ca116ff
188 .quad 0x3ff2b6b5edf6b54a
189 .quad 0x3ff220e6dd675180
190 .quad 0x3ff19c3b38e975a8
191 .quad 0x3ff12589c21fb842
192 .quad 0x3ff0ba6ee5f9aad4
193 .quad 0x3ff059123d3a9848
194 .quad 0x3ff0000000000000
195 .quad 0x0000000000000000
196 .quad 0x0000000000000000
197 .quad 0x0000000000000000
198 .quad 0x0000000000000000
199 .quad 0x0000000000000000
200 .quad 0x0000000000000000
201 .quad 0x0000000000000000
202 /* BiasL: 1.5 * 2^52 in every lane */
203 .align 64
204 .quad 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000
205 /* SZero: sign-bit mask in every lane */
206 .align 64
207 .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
208 /* OneThird: ~1/3 in every lane */
209 .align 64
210 .quad 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556
211 /* Bias3: 2^51 in every lane, i.e. BiasL/3 */
212 .align 64
213 .quad 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000
214 /* Three: 3.0 in every lane */
215 .align 64
216 .quad 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000
217 /* One: 1.0 in every lane */
218 .align 64
219 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
220 /* poly_coeff10 */
221 .align 64
222 .quad 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62
223 /* poly_coeff9 */
224 .align 64
225 .quad 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875
226 /* poly_coeff8 */
227 .align 64
228 .quad 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f
229 /* poly_coeff7 */
230 .align 64
231 .quad 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914
232 /* poly_coeff6 */
233 .align 64
234 .quad 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e
235 /* poly_coeff5 */
236 .align 64
237 .quad 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569
238 /* poly_coeff4 */
239 .align 64
240 .quad 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e
241 /* poly_coeff3 */
242 .align 64
243 .quad 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31
244 /* poly_coeff2: ~ -1/9 */
245 .align 64
246 .quad 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741
247 /* poly_coeff1: ~ 1/3 */
248 .align 64
249 .quad 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557
250 .align 64
251 .type __svml_dcbrt_data_internal_avx512, @object
252 .size __svml_dcbrt_data_internal_avx512, .-__svml_dcbrt_data_internal_avx512
253

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt8_core_avx512.S