/* Function cbrtf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *     x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b23
 *     Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)*rcp[b1 b2 ..b5],
 *     where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in single precision
 *     cbrtf(2^j * 1.b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
 *     (T stores the high 24 bits, D stores the low order bits)
 *     Result=2^k*T+(2^k*T*r)*P+2^k*D
 *     where P=p1+p2*r+..
 *
 */
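
/*
 * Scalar C sketch of the same scheme (illustrative only, not used by
 * the build): the kernel below keeps T/D and 2^(j/3) in the
 * cbrt_tbl_H/etbl_H/etbl_L tables and evaluates P with the poly_coeff
 * constants; in this sketch cbrtf() of the table point stands in for
 * the tables, special cases (0, Inf, NaN, subnormals) are ignored, and
 * all names are hypothetical.
 *
 *   #include <math.h>
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static float
 *   cbrtf_model (float x)
 *   {
 *     uint32_t ix, sign;
 *     memcpy (&ix, &x, sizeof ix);
 *     sign = ix & 0x80000000u;         // kept aside, OR'ed back at the end
 *
 *     int e;                           // |x| = m * 2^e with m in [1, 2)
 *     float m = 2.0f * frexpf (fabsf (x), &e);
 *     e -= 1;
 *
 *     int k = e / 3, j = e - 3 * k;    // e = 3*k + j with j in {0, 1, 2}
 *     if (j < 0) { j += 3; k -= 1; }
 *
 *     // Table point c = 1.b1 b2 b3 b4 b5 1 near m and its cube root T.
 *     float c = 1.0f + truncf ((m - 1.0f) * 32.0f) / 32.0f + 1.0f / 64.0f;
 *     float T = cbrtf (c);
 *
 *     // Reduced argument r and a short polynomial for (1 + r)^(1/3) - 1.
 *     float r = m / c - 1.0f;
 *     float P = r * (1.0f / 3.0f + r * (-1.0f / 9.0f + r * (5.0f / 81.0f)));
 *
 *     // Result = 2^k * 2^(j/3) * (T + T*P), with the sign restored.
 *     float res = ldexpf (cbrtf ((float) (1 << j)) * (T + T * P), k);
 *     uint32_t ir;
 *     memcpy (&ir, &res, sizeof ir);
 *     ir |= sign;
 *     memcpy (&res, &ir, sizeof res);
 *     return res;
 *   }
 */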

/* Offsets for data table __svml_scbrt_data_internal_avx512
 */
#define etbl_H 0
#define etbl_L 64
#define cbrt_tbl_H 128
#define BiasL 256
#define SZero 320
#define OneThird 384
#define Bias3 448
#define Three 512
#define One 576
#define poly_coeff3 640
#define poly_coeff2 704
#define poly_coeff1 768

#include <sysdep.h>

        .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_cbrtf_skx)
        vgetmantps $0, {sae}, %zmm0, %zmm8

        /* GetExp(x) */
        vgetexpps {sae}, %zmm0, %zmm1
        vmovups BiasL+__svml_scbrt_data_internal_avx512(%rip), %zmm2

        /* exponent/3 */
        vmovups OneThird+__svml_scbrt_data_internal_avx512(%rip), %zmm3
        vmovups Bias3+__svml_scbrt_data_internal_avx512(%rip), %zmm4
        vmovups One+__svml_scbrt_data_internal_avx512(%rip), %zmm15

        /* exponent%3 (to be used as index) */
        vmovups Three+__svml_scbrt_data_internal_avx512(%rip), %zmm5

        /* polynomial */
        vmovups poly_coeff3+__svml_scbrt_data_internal_avx512(%rip), %zmm11
        vmovups poly_coeff1+__svml_scbrt_data_internal_avx512(%rip), %zmm14

        /* Table lookup */
        vmovups cbrt_tbl_H+__svml_scbrt_data_internal_avx512(%rip), %zmm12

        /* DblRcp ~ 1/Mantissa */
        vrcp14ps %zmm8, %zmm7
        vaddps {rn-sae}, %zmm2, %zmm1, %zmm6
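
        /* Sign bit of x, OR'ed back into the result after the last multiply */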
        vandps SZero+__svml_scbrt_data_internal_avx512(%rip), %zmm0, %zmm0

        /* round DblRcp to 5 fractional bits (RN mode, no Precision exception) */
        vrndscaleps $88, {sae}, %zmm7, %zmm9
        vfmsub231ps {rn-sae}, %zmm6, %zmm3, %zmm4
        vmovups poly_coeff2+__svml_scbrt_data_internal_avx512(%rip), %zmm7

        /* Reduced argument: R = DblRcp*Mantissa - 1 */
        vfmsub231ps {rn-sae}, %zmm9, %zmm8, %zmm15
        vrndscaleps $9, {sae}, %zmm4, %zmm13

        /* Prepare table index */
        vpsrld $19, %zmm9, %zmm10
        vfmadd231ps {rn-sae}, %zmm15, %zmm11, %zmm7
        vfnmadd231ps {rn-sae}, %zmm13, %zmm5, %zmm6
        vpermt2ps cbrt_tbl_H+64+__svml_scbrt_data_internal_avx512(%rip), %zmm10, %zmm12
        vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm7
        vscalefps {rn-sae}, %zmm13, %zmm12, %zmm2

        /* Table lookup: 2^(exponent%3) */
        vpermps __svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm1
        vpermps etbl_L+__svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm6

        /* Sh*R */
        vmulps {rn-sae}, %zmm15, %zmm1, %zmm14

        /* Sl + (Sh*R)*Poly */
        vfmadd213ps {rn-sae}, %zmm6, %zmm7, %zmm14

        /*
         * branch-free
         * scaled_Th*(Sh+Sl+Sh*R*Poly)
         */
        vaddps {rn-sae}, %zmm1, %zmm14, %zmm15
        vmulps {rn-sae}, %zmm2, %zmm15, %zmm3
        vorps %zmm0, %zmm3, %zmm0
        ret

END(_ZGVeN16v_cbrtf_skx)
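
/*
 * Note: this symbol is normally reached through the libmvec vector ABI
 * name _ZGVeN16v_cbrtf (resolved to the _skx variant at run time when
 * the CPU supports the required AVX-512 features), not called directly.
 * A minimal usage sketch, assuming a GCC/glibc combination that knows
 * the cbrtf vector variant (exact flags depend on the toolchain):
 *
 *   // cc -O3 -march=skylake-avx512 -ffast-math -fopenmp-simd vec.c -lmvec -lm
 *   #include <math.h>
 *
 *   void
 *   vec_cbrtf (float *out, const float *in, int n)
 *   {
 *   #pragma omp simd
 *     for (int i = 0; i < n; i++)
 *       out[i] = cbrtf (in[i]);
 *   }
 */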

        .section .rodata, "a"
        .align 64

#ifdef __svml_scbrt_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(64)) VUINT32 etbl_H[16][1];
        __declspec(align(64)) VUINT32 etbl_L[16][1];
        __declspec(align(64)) VUINT32 cbrt_tbl_H[32][1];
        __declspec(align(64)) VUINT32 BiasL[16][1];
        __declspec(align(64)) VUINT32 SZero[16][1];
        __declspec(align(64)) VUINT32 OneThird[16][1];
        __declspec(align(64)) VUINT32 Bias3[16][1];
        __declspec(align(64)) VUINT32 Three[16][1];
        __declspec(align(64)) VUINT32 One[16][1];
        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_scbrt_data_internal_avx512;
#endif
__svml_scbrt_data_internal_avx512:
        /* etbl_H */
        .long 0x3f800000
        .long 0x3fa14518
        .long 0x3fcb2ff5
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        /* etbl_L */
        .align 64
        .long 0x00000000
        .long 0xb2ce51af
        .long 0x32a7adc8
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        /* cbrt_tbl_H */
        .align 64
        .long 0x3fa14518
        .long 0x3f9e0b2b
        .long 0x3f9b0f9b
        .long 0x3f984a9a
        .long 0x3f95b5af
        .long 0x3f934b6c
        .long 0x3f910737
        .long 0x3f8ee526
        .long 0x3f8ce1da
        .long 0x3f8afa6a
        .long 0x3f892c4e
        .long 0x3f87754e
        .long 0x3f85d377
        .long 0x3f844510
        .long 0x3f82c892
        .long 0x3f815c9f
        .long 0x3f800000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        .long 0x00000000
        /* BiasL */
        .align 64
        .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000
        /* SZero */
        .align 64
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
        /* OneThird */
        .align 64
        .long 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab
        /* Bias3 */
        .align 64
        .long 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000
        /* Three */
        .align 64
        .long 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000
        /* One */
        .align 64
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /* poly_coeff3 */
        .align 64
        .long 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c
        /* poly_coeff2 */
        .align 64
        .long 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363
        /* poly_coeff1 */
        .align 64
        .long 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa
        .align 64
        .type __svml_scbrt_data_internal_avx512, @object
        .size __svml_scbrt_data_internal_avx512, .-__svml_scbrt_data_internal_avx512