/* Function log10 vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *     Get short reciprocal approximation Rcp ~ 1/mantissa(x)
 *     R = Rcp*mantissa(x) - 1.0
 *     log10(x) = k*log10(2.0) - log10(Rcp) + poly_approximation(R)
 *        where k is the exponent of x, incremented by one when Rcp < 0.75
 *        log10(Rcp) is tabulated
 *
 */
29
/* Byte offsets of the fields of the constant table
   __svml_dlog10_data_internal_avx512.  Log_tbl spans 16 doubles
   (128 bytes); every following field is one 64-byte vector of eight
   identical doubles, so the offsets advance in steps of 64.  */
#define Log_tbl				0
#define One				128
#define C075				192
#define poly_coeff9			256
#define poly_coeff8			320
#define poly_coeff7			384
#define poly_coeff6			448
#define poly_coeff5			512
#define poly_coeff4			576
#define poly_coeff3			640
#define poly_coeff2			704
#define poly_coeff1			768
#define L2				832

#include <sysdep.h>
47
/* _ZGVeN8v_log10_skx -- log10 for eight packed doubles (AVX-512).
 *
 * In:    zmm0 = input vector x
 * Out:   zmm0 = result vector
 * Frame: 192 bytes addressed from a 64-byte aligned %rsp:
 *	    0(%rsp)  spill slot for r14   (special-value path only)
 *	    8(%rsp)  spill slot for r13   (special-value path only)
 *	   16(%rsp)  spill slot for r12   (special-value path only)
 *	   64(%rsp)  copy of the input vector
 *	  128(%rsp)  copy of the result vector
 *
 * Main path, per lane:
 *	R      = DblRcp * Mantissa - 1
 *	P      = poly_coeff1 + poly_coeff2*R + ... + poly_coeff9*R^8
 *	result = Expon * L2 + (Log_tbl[index] + R * P)
 * Lanes whose class matches the vfpclasspd mask are afterwards
 * recomputed one at a time by calling the scalar log10.
 *
 * The "# LOE" lines are register annotations kept from the original
 * source; they have no effect on the generated code.
 */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_log10_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Carve out the 64-byte aligned, 192-byte scratch frame.  */
	andq	$-64, %rsp
	subq	$192, %rsp

	/* Keep the original argument in zmm7; zmm6 = Mantissa(x).  */
	vmovaps	%zmm0, %zmm7
	vgetmantpd $8, {sae}, %zmm7, %zmm6
	vmovups	One+__svml_dlog10_data_internal_avx512(%rip), %zmm3
	vmovups	poly_coeff5+__svml_dlog10_data_internal_avx512(%rip), %zmm12
	vmovups	poly_coeff3+__svml_dlog10_data_internal_avx512(%rip), %zmm13

	/* Start polynomial evaluation */
	vmovups	poly_coeff9+__svml_dlog10_data_internal_avx512(%rip), %zmm10
	vmovups	poly_coeff8+__svml_dlog10_data_internal_avx512(%rip), %zmm1
	vmovups	poly_coeff7+__svml_dlog10_data_internal_avx512(%rip), %zmm11
	vmovups	poly_coeff6+__svml_dlog10_data_internal_avx512(%rip), %zmm14

	/* Prepare exponent correction: DblRcp<0.75? */
	vmovups	C075+__svml_dlog10_data_internal_avx512(%rip), %zmm2

	/* Table lookup: first eight Log_tbl entries; the second eight are
	   supplied as the memory operand of vpermt2pd below.  */
	vmovups	__svml_dlog10_data_internal_avx512(%rip), %zmm5

	/* GetExp(x) */
	vgetexppd {sae}, %zmm7, %zmm0

	/* DblRcp ~ 1/Mantissa */
	vrcp14pd %zmm6, %zmm8

	/* x<=0?  imm8 94 (0x5e) selects the classes +0, -0, +Inf, -Inf and
	   negative finite; k0 gets one bit per matching lane.  */
	vfpclasspd $94, %zmm7, %k0

	/* Round DblRcp to a short fixed-point value (RN mode, no Precision
	   exception).  NOTE(review): imm8 88 (0x58) carries M = 5 in bits
	   7:4; for DblRcp in [0.5, 1) that keeps four explicit mantissa
	   bits, which are the bits used as table index below -- confirm
	   against the instruction reference.  */
	vrndscalepd $88, {sae}, %zmm8, %zmm4
	vmovups	poly_coeff4+__svml_dlog10_data_internal_avx512(%rip), %zmm8

	/* edx = bit mask of the lanes that need the scalar fallback.  */
	kmovw	%k0, %edx

	/* Reduced argument: R = DblRcp*Mantissa - 1 */
	vfmsub213pd {rn-sae}, %zmm3, %zmm4, %zmm6

	/* k1 = (DblRcp < 0.75) */
	vcmppd	$17, {sae}, %zmm2, %zmm4, %k1

	/* zmm8 = poly_coeff4 + poly_coeff5*R */
	vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm8
	vmovups	poly_coeff2+__svml_dlog10_data_internal_avx512(%rip), %zmm12

	/* zmm1 = poly_coeff8 + poly_coeff9*R */
	vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1

	/* zmm14 = poly_coeff6 + poly_coeff7*R */
	vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
	vmovups	poly_coeff1+__svml_dlog10_data_internal_avx512(%rip), %zmm2

	/* R^2 */
	vmulpd	{rn-sae}, %zmm6, %zmm6, %zmm15

	/* zmm12 = poly_coeff2 + poly_coeff3*R */
	vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12

	/* Prepare table index: move the top 16 bits of each rounded
	   reciprocal to the bottom of its lane.  */
	vpsrlq	$48, %zmm4, %zmm9

	/* add 1 to Expon if DblRcp<0.75 */
	vaddpd	{rn-sae}, %zmm3, %zmm0, %zmm0{%k1}

	/* R^4 */
	vmulpd	{rn-sae}, %zmm15, %zmm15, %zmm13

	/* zmm1 = (c6 + c7*R) + R^2*(c8 + c9*R) */
	vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm1

	/* zmm8 = (c2 + c3*R) + R^2*(c4 + c5*R) */
	vfmadd213pd {rn-sae}, %zmm12, %zmm15, %zmm8

	/* zmm5 = Log_tbl[index], selecting from zmm5 (entries 0-7) and
	   the memory operand (entries 8-15).  */
	vpermt2pd Log_tbl+64+__svml_dlog10_data_internal_avx512(%rip), %zmm9, %zmm5

	/* polynomial */
	/* zmm1 = c2 + c3*R + ... + c9*R^7 */
	vfmadd213pd {rn-sae}, %zmm8, %zmm13, %zmm1
	/* zmm1 = P = c1 + R*zmm1 */
	vfmadd213pd {rn-sae}, %zmm2, %zmm6, %zmm1
	/* zmm6 = Log_tbl[index] + R*P */
	vfmadd213pd {rn-sae}, %zmm5, %zmm1, %zmm6
	vmovups	L2+__svml_dlog10_data_internal_avx512(%rip), %zmm1
	/* zmm0 = Expon*L2 + zmm6 */
	vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm7

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the in-frame CFI state for the code that follows
	   the return instruction.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill input and fast-path result so individual lanes can be read
	   and patched through memory.  */
	vmovups	%zmm7, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* r12d = lane counter, starting at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* r13d = mask of lanes to recompute.  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the lane mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All eight lanes visited: restore the spilled registers and
	   reload the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* Unwind rules for the code below, which is reached from
	   L(RANGEMASK_CHECK) while r12-r14 are still held in the frame.  */
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* r14 = lane index (zero-extended); load that lane of the saved
	   input, call the scalar routine, and overwrite the same lane of
	   the saved result.  */
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	log10@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_log10_skx)
220
/* Read-only constant table for _ZGVeN8v_log10_skx.  Fields are
   addressed as <offset macro> + __svml_dlog10_data_internal_avx512;
   each field starts on a 64-byte boundary.  The typedef below is only
   seen when __svml_dlog10_data_internal_avx512_typedef is defined and
   mirrors the layout as a C structure.  */
	.section .rodata, "a"
	.align	64

#ifdef __svml_dlog10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl[16][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 C075[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog10_data_internal_avx512;
#endif
__svml_dlog10_data_internal_avx512:
	/* Log_tbl: 16 doubles selected per lane by the table index; the
	   chosen entry is added to R*P.  Entries 0-7 are loaded into a
	   register, entries 8-15 are read as a memory operand.  */
	.quad	0x0000000000000000
	.quad	0xbf9af5f92b00e610
	.quad	0xbfaa30a9d609efea
	.quad	0xbfb31b3055c47118
	.quad	0xbfb8cf183886480d
	.quad	0xbfbe3bc1ab0e19fe
	.quad	0xbfc1b3e71ec94f7b
	.quad	0xbfc42c7e7fe3fc02
	.quad	0x3fbffbfc2bbc7803
	.quad	0x3fbb721cd17157e3
	.quad	0x3fb715d0ce367afc
	.quad	0x3fb2e3a740b7800f
	.quad	0x3fadb11ed766abf4
	.quad	0x3fa5e3966b7e9295
	.quad	0x3f9cb38fccd8bfdb
	.quad	0x3f8c3d0837784c41
	/* One: 1.0 in every lane */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* C075: 0.75 in every lane, threshold for the exponent correction */
	.align	64
	.quad	0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
	/* poly_coeff9 */
	.align	64
	.quad	0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370
	/* poly_coeff8 */
	.align	64
	.quad	0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814
	/* poly_coeff7 */
	.align	64
	.quad	0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2
	/* poly_coeff6 */
	.align	64
	.quad	0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80
	/* poly_coeff5 */
	.align	64
	.quad	0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9
	/* poly_coeff4 */
	.align	64
	.quad	0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3
	/* poly_coeff3 */
	.align	64
	.quad	0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c
	/* poly_coeff2 */
	.align	64
	.quad	0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db
	/* poly_coeff1 */
	.align	64
	.quad	0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e
	/* L2: multiplier applied to the (adjusted) exponent */
	.align	64
	.quad	0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
	.align	64
	.type	__svml_dlog10_data_internal_avx512, @object
	.size	__svml_dlog10_data_internal_avx512, .-__svml_dlog10_data_internal_avx512
299

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S */