1/* Function cbrtf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b23
23 * Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
24 * where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in single precision
25 * cbrtf(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
26 * (T stores the high 24 bits, D stores the low order bits)
27 * Result=2^k*T+(2^k*T*r)*P+2^k*D
28 * where P=p1+p2*r+..
29 *
30 */
31
32/* Byte offsets of the fields of data table __svml_scbrt_data_internal_avx512.
33 Every field starts on a 64-byte boundary; cbrt_tbl_H spans 128 bytes (32 entries), all others 64 bytes (16 entries). */
34#define etbl_H 0 /* high parts of cbrt(2^j), j = 0..2 */
35#define etbl_L 64 /* low-order parts matching etbl_H */
36#define cbrt_tbl_H 128 /* cube-root table indexed by the rounded reciprocal */
37#define BiasL 256 /* bias added to the exponent (equals 3 * Bias3) */
38#define SZero 320 /* sign-bit mask 0x80000000 */
39#define OneThird 384 /* ~1/3, multiplier for exponent/3 */
40#define Bias3 448 /* bias removed after the multiply by OneThird */
41#define Three 512 /* 3.0, used to form exponent%3 */
42#define One 576 /* 1.0, subtracted to form the reduced argument */
43#define poly_coeff3 640 /* polynomial coefficient of R^2 */
44#define poly_coeff2 704 /* polynomial coefficient of R */
45#define poly_coeff1 768 /* polynomial constant term (~1/3) */
46
47#include <sysdep.h>
48
49 .section .text.evex512, "ax", @progbits
/* _ZGVeN16v_cbrtf_skx: cube root of the 16 single-precision lanes of %zmm0.
   In:      %zmm0 = x (16 floats)
   Out:     %zmm0 = cbrtf(x) per lane
   Clobber: %zmm1-%zmm15; no stack use, no branches, no stores.

   With e = GetExp(x), m = GetMant(x), Rcp = reciprocal of m rounded to a
   table grid point and R = m*Rcp - 1, the code evaluates
     |result| = 2^floor(e/3) * cbrt_tbl_H[Rcp] * (Sh + Sl + Sh*R*Poly)
   where Sh + Sl comes from etbl_H/etbl_L indexed by e%3 and
   Poly = poly_coeff1 + poly_coeff2*R + poly_coeff3*R^2.
   The sign of x is masked out up front and OR-ed back into the result.
   NOTE(review): no dedicated special-case path (0, Inf, NaN) is visible
   here; presumably the instruction semantics cover them -- confirm.  */
50ENTRY(_ZGVeN16v_cbrtf_skx)
51 vgetmantps $0, {sae}, %zmm0, %zmm8 /* zmm8 = mantissa of x normalized to [1,2); imm8 = 0 keeps x's sign */
52
53 /* GetExp(x): zmm1 = floor(log2(|x|)) as a float */
54 vgetexpps {sae}, %zmm0, %zmm1
55 vmovups BiasL+__svml_scbrt_data_internal_avx512(%rip), %zmm2 /* zmm2 = BiasL */
56
57 /* exponent/3: constants for OneThird*(exponent + BiasL) - Bias3 */
58 vmovups OneThird+__svml_scbrt_data_internal_avx512(%rip), %zmm3 /* zmm3 = OneThird */
59 vmovups Bias3+__svml_scbrt_data_internal_avx512(%rip), %zmm4 /* zmm4 = Bias3 */
60 vmovups One+__svml_scbrt_data_internal_avx512(%rip), %zmm15 /* zmm15 = 1.0 (becomes R below) */
61
62 /* exponent%3 (to be used as index) */
63 vmovups Three+__svml_scbrt_data_internal_avx512(%rip), %zmm5 /* zmm5 = 3.0 */
64
65 /* polynomial */
66 vmovups poly_coeff3+__svml_scbrt_data_internal_avx512(%rip), %zmm11 /* zmm11 = poly_coeff3 */
67 vmovups poly_coeff1+__svml_scbrt_data_internal_avx512(%rip), %zmm14 /* zmm14 = poly_coeff1 */
68
69 /* Table lookup: preload the first 16 entries of cbrt_tbl_H */
70 vmovups cbrt_tbl_H+__svml_scbrt_data_internal_avx512(%rip), %zmm12
71
72 /* DblRcp ~ 1/Mantissa */
73 vrcp14ps %zmm8, %zmm7 /* zmm7 = approximate reciprocal of zmm8 */
74 vaddps {rn-sae}, %zmm2, %zmm1, %zmm6 /* zmm6 = exponent + BiasL */
75 vandps SZero+__svml_scbrt_data_internal_avx512(%rip), %zmm0, %zmm0 /* zmm0 = sign bit of x only */
76
77 /* round DblRcp to 5 fractional bits (imm8 = 88 = 0x58: M = 5, RN mode, no Precision exception) */
78 vrndscaleps $88, {sae}, %zmm7, %zmm9 /* zmm9 = rounded reciprocal */
79 vfmsub231ps {rn-sae}, %zmm6, %zmm3, %zmm4 /* zmm4 = OneThird*(exponent + BiasL) - Bias3 ~= exponent/3 */
80 vmovups poly_coeff2+__svml_scbrt_data_internal_avx512(%rip), %zmm7 /* zmm7 reused: poly_coeff2 */
81
82 /* Reduced argument: R = DblRcp*Mantissa - 1 */
83 vfmsub231ps {rn-sae}, %zmm9, %zmm8, %zmm15 /* zmm15 = zmm8*zmm9 - 1.0 = R */
84 vrndscaleps $9, {sae}, %zmm4, %zmm13 /* zmm13 = floor(exponent/3); imm8 = 9: round down, no Precision exception */
85
86 /* Prepare table index: drop the low 19 mantissa bits of the rounded reciprocal;
 the permute below consumes only the low 5 bits of each lane */
87 vpsrld $19, %zmm9, %zmm10
88 vfmadd231ps {rn-sae}, %zmm15, %zmm11, %zmm7 /* zmm7 = poly_coeff3*R + poly_coeff2 */
89 vfnmadd231ps {rn-sae}, %zmm13, %zmm5, %zmm6 /* zmm6 = (exponent + BiasL) - 3*floor(exponent/3) = BiasL + exponent%3 */
90 vpermt2ps cbrt_tbl_H+64+__svml_scbrt_data_internal_avx512(%rip), %zmm10, %zmm12 /* zmm12 = cbrt_tbl_H[index], selected across both 16-entry halves */
91 vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm7 /* zmm7 = Poly = (poly_coeff3*R + poly_coeff2)*R + poly_coeff1 */
92 vscalefps {rn-sae}, %zmm13, %zmm12, %zmm2 /* zmm2 = scaled_Th = zmm12 * 2^floor(exponent/3) */
93
94 /* Table lookup: Sh + Sl ~= 2^((exponent%3)/3), i.e. cbrt(2^(exponent%3));
 etbl_H sits at offset 0, so the first load uses the bare table symbol */
95 vpermps __svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm1 /* zmm1 = Sh = etbl_H[exponent%3] */
96 vpermps etbl_L+__svml_scbrt_data_internal_avx512(%rip), %zmm6, %zmm6 /* zmm6 = Sl = etbl_L[exponent%3] */
97
98 /* Sh*R */
99 vmulps {rn-sae}, %zmm15, %zmm1, %zmm14 /* zmm14 = Sh*R */
100
101 /* Sl + (Sh*R)*Poly */
102 vfmadd213ps {rn-sae}, %zmm6, %zmm7, %zmm14 /* zmm14 = Poly*(Sh*R) + Sl */
103
104 /*
105 * branch-free
106 * scaled_Th*(Sh+Sl+Sh*R*Poly)
107 */
108 vaddps {rn-sae}, %zmm1, %zmm14, %zmm15 /* zmm15 = Sh + (Sl + Sh*R*Poly) */
109 vmulps {rn-sae}, %zmm2, %zmm15, %zmm3 /* zmm3 = scaled_Th * zmm15 */
110 vorps %zmm0, %zmm3, %zmm0 /* OR the saved sign bit of x into the result */
111 ret
112
113END(_ZGVeN16v_cbrtf_skx)
114
115 .section .rodata, "a"
116 .align 64
117
/* Constant table for _ZGVeN16v_cbrtf_skx.  The field order and the 64-byte
   alignment of every field must stay in sync with the byte-offset #defines
   at the top of the file.  The typedef below is never compiled unless
   __svml_scbrt_data_internal_avx512_typedef is defined; it only documents
   the layout.  */
118#ifdef __svml_scbrt_data_internal_avx512_typedef
119typedef unsigned int VUINT32;
120typedef struct {
121 __declspec(align(64)) VUINT32 etbl_H[16][1];
122 __declspec(align(64)) VUINT32 etbl_L[16][1];
123 __declspec(align(64)) VUINT32 cbrt_tbl_H[32][1];
124 __declspec(align(64)) VUINT32 BiasL[16][1];
125 __declspec(align(64)) VUINT32 SZero[16][1];
126 __declspec(align(64)) VUINT32 OneThird[16][1];
127 __declspec(align(64)) VUINT32 Bias3[16][1];
128 __declspec(align(64)) VUINT32 Three[16][1];
129 __declspec(align(64)) VUINT32 One[16][1];
130 __declspec(align(64)) VUINT32 poly_coeff3[16][1];
131 __declspec(align(64)) VUINT32 poly_coeff2[16][1];
132 __declspec(align(64)) VUINT32 poly_coeff1[16][1];
133} __svml_scbrt_data_internal_avx512;
134#endif
135__svml_scbrt_data_internal_avx512:
136 /* etbl_H: high parts of 1.0, cbrt(2), cbrt(4); the other 13 entries are zero padding */
137 .long 0x3f800000
138 .long 0x3fa14518
139 .long 0x3fcb2ff5
140 .long 0x00000000
141 .long 0x00000000
142 .long 0x00000000
143 .long 0x00000000
144 .long 0x00000000
145 .long 0x00000000
146 .long 0x00000000
147 .long 0x00000000
148 .long 0x00000000
149 .long 0x00000000
150 .long 0x00000000
151 .long 0x00000000
152 .long 0x00000000
153 /* etbl_L: low-order parts paired entry-by-entry with etbl_H; remaining entries are zero padding */
154 .align 64
155 .long 0x00000000
156 .long 0xb2ce51af
157 .long 0x32a7adc8
158 .long 0x00000000
159 .long 0x00000000
160 .long 0x00000000
161 .long 0x00000000
162 .long 0x00000000
163 .long 0x00000000
164 .long 0x00000000
165 .long 0x00000000
166 .long 0x00000000
167 .long 0x00000000
168 .long 0x00000000
169 .long 0x00000000
170 .long 0x00000000
171 /* cbrt_tbl_H: entry i ~= cbrt(32/(16+i)) for i = 0..16 (cbrt(2) down to 1.0); entries 17..31 are zero padding */
172 .align 64
173 .long 0x3fa14518
174 .long 0x3f9e0b2b
175 .long 0x3f9b0f9b
176 .long 0x3f984a9a
177 .long 0x3f95b5af
178 .long 0x3f934b6c
179 .long 0x3f910737
180 .long 0x3f8ee526
181 .long 0x3f8ce1da
182 .long 0x3f8afa6a
183 .long 0x3f892c4e
184 .long 0x3f87754e
185 .long 0x3f85d377
186 .long 0x3f844510
187 .long 0x3f82c892
188 .long 0x3f815c9f
189 .long 0x3f800000
190 .long 0x00000000
191 .long 0x00000000
192 .long 0x00000000
193 .long 0x00000000
194 .long 0x00000000
195 .long 0x00000000
196 .long 0x00000000
197 .long 0x00000000
198 .long 0x00000000
199 .long 0x00000000
200 .long 0x00000000
201 .long 0x00000000
202 .long 0x00000000
203 .long 0x00000000
204 .long 0x00000000
205 /* BiasL: 0x4b400000 = 1.5 * 2^23 = 12582912.0, broadcast to 16 lanes (equals 3 * Bias3) */
206 .align 64
207 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000
208 /* SZero: sign-bit mask 0x80000000, broadcast to 16 lanes */
209 .align 64
210 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
211 /* OneThird: 0x3eaaaaab ~= 1/3, broadcast to 16 lanes */
212 .align 64
213 .long 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab, 0x3eaaaaab
214 /* Bias3: 0x4a800000 = 2^22 = 4194304.0, broadcast to 16 lanes */
215 .align 64
216 .long 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000, 0x4a800000
217 /* Three: 0x40400000 = 3.0, broadcast to 16 lanes */
218 .align 64
219 .long 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000, 0x40400000
220 /* One: 0x3f800000 = 1.0, broadcast to 16 lanes */
221 .align 64
222 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
223 /* poly_coeff3: 0x3d7d057c ~= 0.0618, coefficient of R^2 in Poly */
224 .align 64
225 .long 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c, 0x3d7d057c
226 /* poly_coeff2: 0xbde3a363 ~= -0.1112, coefficient of R in Poly */
227 .align 64
228 .long 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363, 0xbde3a363
229 /* poly_coeff1: 0x3eaaaaaa ~= 1/3, constant term of Poly */
230 .align 64
231 .long 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa, 0x3eaaaaaa
232 .align 64
233 .type __svml_scbrt_data_internal_avx512, @object
234 .size __svml_scbrt_data_internal_avx512, .-__svml_scbrt_data_internal_avx512
235

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf16_core_avx512.S