/* Function log10f vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      Get short reciprocal approximation Rcp ~ 1/mantissa(x)
 *      R = Rcp*x - 1.0
 *      log10(x) = k*log10(2.0) - log10(Rcp) + poly_approximation(R)
 *         log10(Rcp) is tabulated
 *
 *
 */
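/* A rough scalar C sketch of the fast path below (illustrative only;
   the function name, argument layout and bit manipulation are choices
   of this sketch, not of the kernel).  The vector code splits x into a
   mantissa m normalized to [0.75, 1.5) and an exponent k
   (vgetmantps/vgetexpps), selects per-interval coefficients from the
   tables at the end of this file via the top four mantissa bits
   (vpsrld/vpermps), and evaluates a degree-4 polynomial in R = m - 1
   with fused multiply-adds:

	#include <string.h>
	#include <stdint.h>

	static float
	log10f_sketch (float m, float k, const float c1[16],
		       const float c2[16], const float c3[16],
		       const float c4[16])
	{
	  // m is mantissa(x) in [0.75, 1.5), k the matching exponent,
	  // so that x == m * 2^k.
	  uint32_t mbits;
	  memcpy (&mbits, &m, sizeof mbits);
	  unsigned int i = (mbits >> 19) & 0xf;	// 4-bit table index
	  float r = m - 1.0f;			// reduced argument R
	  float log10_2 = 0x1.344136p-2f;	// 0x3e9a209b
	  float p = c4[i];
	  p = p * r + c3[i];
	  p = p * r + c2[i];
	  p = p * r + c1[i];
	  return p * r + k * log10_2;
	}
 */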

/* Offsets for data table __svml_slog10_data_internal_avx512
 */
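/* Each offset selects one 64-byte (16 x float) row of that table; the
   table itself is defined at the end of this file.  */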
#define One	0
#define coeff4	64
#define coeff3	128
#define coeff2	192
#define coeff1	256
#define L2	320

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_log10f_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
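	/* Align the stack to 64 bytes and reserve 192 bytes: spill space
	   for the input and result vectors and for three callee-saved
	   GPRs used by the special-value path below.  */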
	andq	$-64, %rsp
	subq	$192, %rsp
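	/* m = mantissa(x) normalized to [0.75, 1.5) (vgetmantps imm 11);
	   exponents of x and of m are taken with vgetexpps.  */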
	vgetmantps $11, {sae}, %zmm0, %zmm3
	vmovups	__svml_slog10_data_internal_avx512(%rip), %zmm1
	vgetexpps {sae}, %zmm0, %zmm5
	vmovups	L2+__svml_slog10_data_internal_avx512(%rip), %zmm10
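	/* The top four mantissa bits of m (bits 22:19) form the index
	   used by the vpermps table lookups below.  */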
	vpsrld	$19, %zmm3, %zmm7
	vgetexpps {sae}, %zmm3, %zmm6
	vsubps	{rn-sae}, %zmm1, %zmm3, %zmm11
	vpermps	coeff4+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm1
	vpermps	coeff3+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm2
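	/* k = getexp(x) - getexp(m), so that x = m * 2^k.  */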
	vsubps	{rn-sae}, %zmm6, %zmm5, %zmm9
	vpermps	coeff2+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm4
	vpermps	coeff1+__svml_slog10_data_internal_avx512(%rip), %zmm7, %zmm8

	/* Special arguments: x <= 0 or x = +/-Inf?
	   (vfpclassps imm 0x5e flags +/-0, +/-Inf and negative lanes.)  */
	vfpclassps $94, %zmm0, %k0
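	/* Evaluate (((c4*R + c3)*R + c2)*R + c1)*R + k*log10(2) with
	   fused multiply-adds; R is in %zmm11, k*log10(2) lands in
	   %zmm12.  */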
	vfmadd213ps {rn-sae}, %zmm2, %zmm11, %zmm1
	vmulps	{rn-sae}, %zmm10, %zmm9, %zmm12
	vfmadd213ps {rn-sae}, %zmm4, %zmm11, %zmm1
	kmovw	%k0, %edx
	vfmadd213ps {rn-sae}, %zmm8, %zmm11, %zmm1
	vfmadd213ps {rn-sae}, %zmm12, %zmm11, %zmm1
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps	%zmm1, %zmm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

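	/* What this slow path does, as a scalar C sketch (illustrative
	   only; the helper name is invented here, and the real code keeps
	   the inputs and results in the stack spill slots rather than in
	   caller-provided arrays):

		#include <math.h>

		static void
		fixup_special_lanes (const float x[16], float y[16],
				     unsigned int mask)
		{
		  // One mask bit per lane flagged by vfpclassps above.
		  for (unsigned int i = 0; i < 16; i++)
		    if (mask & (1u << i))
		      y[i] = log10f (x[i]);	// scalar call handles errno
		}
	 */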
L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm0, 64(%rsp)
	vmovups	%zmm1, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm1

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm1

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm1

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	64(%rsp, %r14, 4), %xmm0
	call	log10f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_log10f_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_slog10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 coeff4[16][1];
	__declspec(align(64)) VUINT32 coeff3[16][1];
	__declspec(align(64)) VUINT32 coeff2[16][1];
	__declspec(align(64)) VUINT32 coeff1[16][1];
	__declspec(align(64)) VUINT32 L2[16][1];
} __svml_slog10_data_internal_avx512;
#endif
__svml_slog10_data_internal_avx512:
	/* One */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
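	/* coeff4..coeff1: per-interval polynomial coefficients; each
	   lookup picks one of 16 entries using mantissa bits 22:19 (the
	   vpsrld $19 / vpermps pair above).  The two groups of eight
	   presumably correspond to mantissas in [1, 1.5) and in
	   [0.75, 1).  */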
	// c4
	.align	64
	.long	0xbdc9ae9b, 0xbda6fcf4
	.long	0xbd8bac76, 0xbd6bca30
	.long	0xbd48a99b, 0xbd2c0a9f
	.long	0xbd1480db, 0xbd00faf2
	.long	0xbe823aa9, 0xbe656348
	.long	0xbe4afbb9, 0xbe346895
	.long	0xbe20ffff, 0xbe103a0b
	.long	0xbe01a91c, 0xbde9e84e
	// c3
	.align	64
	.long	0x3e13d888, 0x3e10a87c
	.long	0x3e0b95c3, 0x3e057f0b
	.long	0x3dfde038, 0x3df080d9
	.long	0x3de34c1e, 0x3dd68333
	.long	0x3dac6e8e, 0x3dd54a51
	.long	0x3df30f40, 0x3e04235d
	.long	0x3e0b7033, 0x3e102c90
	.long	0x3e12ebad, 0x3e141ff8
	// c2
	.align	64
	.long	0xbe5e5a9b, 0xbe5e2677
	.long	0xbe5d83f5, 0xbe5c6016
	.long	0xbe5abd0b, 0xbe58a6fd
	.long	0xbe562e02, 0xbe5362f8
	.long	0xbe68e27c, 0xbe646747
	.long	0xbe619a73, 0xbe5ff05a
	.long	0xbe5f0570, 0xbe5e92d0
	.long	0xbe5e662b, 0xbe5e5c08
	// c1
	.align	64
	.long	0x3ede5bd8, 0x3ede5b45
	.long	0x3ede57d8, 0x3ede4eb1
	.long	0x3ede3d37, 0x3ede2166
	.long	0x3eddf9d9, 0x3eddc5bb
	.long	0x3ede08ed, 0x3ede32e7
	.long	0x3ede4967, 0x3ede5490
	.long	0x3ede597f, 0x3ede5b50
	.long	0x3ede5bca, 0x3ede5bd9
	/* L2 */
	.align	64
	.long	0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
	.align	64
	.type	__svml_slog10_data_internal_avx512, @object
	.size	__svml_slog10_data_internal_avx512, .-__svml_slog10_data_internal_avx512