/* Function exp10 vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *  Typical exp10() implementation, except that:
 *   - tables are small (16 elements), allowing for fast gathers
 *   - all arguments processed in the main path
 *   - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
 *   - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
 *   - RZ mode used to avoid overflow to +/-Inf for x*log2(10); helps with special case handling
 *   - SAE used to avoid spurious flag settings
 *
 */
30
/* Offsets (in bytes) for data table __svml_dexp10_data_internal_avx512.
   Each field is one 64-byte (zmm-sized) broadcast of a constant, except
   Exp_tbl_H which is 128 bytes (a 16-entry table of doubles).
   NOTE(review): the names L2E/L2H/L2L are inherited from the exp kernel;
   for exp10 they actually hold log2(10) and the hi/lo split of log10(2)
   — see the table comments at the end of the file.  */
#define Exp_tbl_H 0
#define L2E 128
#define Shifter 192
#define L2H 256
#define L2L 320
#define EMask 384
#define poly_coeff6 448
#define poly_coeff5 512
#define poly_coeff4 576
#define poly_coeff3 640
#define poly_coeff2 704
#define poly_coeff1 768
#define AbsMask 832
#define Threshold 896

#include <sysdep.h>
49
	.section .text.evex512, "ax", @progbits
/*---------------------------------------------------------------------
 * __m512d _ZGVeN8v_exp10_skx (__m512d x)
 * ABI:  x86-64 SysV vector ABI (libmvec).
 * In:   zmm0 = 8 double-precision arguments.
 * Out:  zmm0 = 10^x for each lane.
 * Main path is branch-free; lanes with |x| above Threshold are
 * re-evaluated one at a time via the scalar exp10 (special branch).
 * Register roles on the fast path:
 *   zmm1  = x (saved copy of the input)
 *   zmm4  = Shifter + x*log2(10), RZ  (low bits = table index)
 *   zmm0  = Z0 ~ x*log2(10) rounded to 4 fractional bits
 *   zmm2  = Th = 2^(frac bits of Z0) from the 16-entry table
 *   zmm12 = reduced argument R, clamped so |R| < 2
 *   zmm3  = polynomial approximating 10^R / Th scale
 */
ENTRY(_ZGVeN8v_exp10_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* Align the stack to 64 and reserve spill space for the
	   special-value branch (two zmm saves + three GPR saves).  */
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	L2E+__svml_dexp10_data_internal_avx512(%rip), %zmm4
	vmovups	Shifter+__svml_dexp10_data_internal_avx512(%rip), %zmm2
	vmovups	L2H+__svml_dexp10_data_internal_avx512(%rip), %zmm5
	vmovups	L2L+__svml_dexp10_data_internal_avx512(%rip), %zmm3

	/* polynomial */
	vmovups	poly_coeff6+__svml_dexp10_data_internal_avx512(%rip), %zmm6
	vmovups	poly_coeff4+__svml_dexp10_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff3+__svml_dexp10_data_internal_avx512(%rip), %zmm9
	vmovups	poly_coeff2+__svml_dexp10_data_internal_avx512(%rip), %zmm8
	vmovups	poly_coeff1+__svml_dexp10_data_internal_avx512(%rip), %zmm11
	vmovups	Threshold+__svml_dexp10_data_internal_avx512(%rip), %zmm14
	vmovaps	%zmm0, %zmm1

	/* zmm4 = 2^(52-4)*1.5 + x * log2(10); RZ rounding keeps the
	   shifter trick exact and avoids overflow to +/-Inf.
	   (The original comment said log2(e); the constant loaded from
	   L2E is log2(10) for exp10.)  */
	vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm4
	vandpd	AbsMask+__svml_dexp10_data_internal_avx512(%rip), %zmm1, %zmm13

	/* Z0 ~ x*log2(10), rounded down to 4 fractional bits */
	vsubpd	{rn-sae}, %zmm2, %zmm4, %zmm0

	/* Table lookup: Th = 2^(fractional part of Z0); vpermt2pd below
	   indexes the 16-entry table with the low bits of zmm4.  */
	vmovups	__svml_dexp10_data_internal_avx512(%rip), %zmm2
	/* Predicate 29 = _CMP_GE_OQ with SAE: lanes with
	   |x| >= Threshold need the special-value path.  */
	vcmppd	$29, {sae}, %zmm14, %zmm13, %k0

	/* R = x - Z0*log10(2), computed in two pieces (L2H + L2L) for
	   extra precision.  (Original comment said log(2); the constants
	   are the hi/lo split of log10(2).)  */
	vfnmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm5
	vpermt2pd Exp_tbl_H+64+__svml_dexp10_data_internal_avx512(%rip), %zmm4, %zmm2
	kmovw	%k0, %edx
	vfnmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm5
	vmovups	poly_coeff5+__svml_dexp10_data_internal_avx512(%rip), %zmm3

	/* ensure |R|<2 even for special cases (EMask clears the high
	   exponent bits, preserving the sign)  */
	vandpd	EMask+__svml_dexp10_data_internal_avx512(%rip), %zmm5, %zmm12
	/* Evaluate 10^R ~ 1 + R*(c1 + R*c2 + ...) with pairwise FMAs:
	   zmm10 = R^2, zmm15 = Th*R.  */
	vmulpd	{rn-sae}, %zmm12, %zmm12, %zmm10
	vmulpd	{rn-sae}, %zmm12, %zmm2, %zmm15
	vfmadd231pd {rn-sae}, %zmm12, %zmm6, %zmm3	/* c5 + R*c6 */
	vfmadd231pd {rn-sae}, %zmm12, %zmm7, %zmm9	/* c3 + R*c4 */
	vfmadd231pd {rn-sae}, %zmm12, %zmm8, %zmm11	/* c1 + R*c2 */
	vfmadd213pd {rn-sae}, %zmm9, %zmm10, %zmm3
	vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm3
	/* result mantissa = Th + Th*R*poly */
	vfmadd213pd {rn-sae}, %zmm2, %zmm15, %zmm3
	/* scale by 2^floor(Z0); vscalef gives correct overflow/underflow
	   and special-case responses without branches  */
	vscalefpd {rn-sae}, %zmm0, %zmm3, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the original arguments (zmm1) and the fast-path results
	   (zmm0); out-of-range lanes are patched in place below.  */
	vmovups	%zmm1, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d		/* r12d = current lane index */
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d		/* r13d = special-lane bitmask */
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d		/* 8 double lanes per zmm */

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	/* Reload the (now fully patched) result vector.  */
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	/* Load the lane's argument, call scalar exp10, and store the
	   result over the corresponding slot of the saved output.  */
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	exp10@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_exp10_skx)
204
	.section .rodata, "a"
	.align	64

#ifdef __svml_dexp10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
	__declspec(align(64)) VUINT32 L2E[8][2];
	__declspec(align(64)) VUINT32 Shifter[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
	__declspec(align(64)) VUINT32 EMask[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 AbsMask[8][2];
	__declspec(align(64)) VUINT32 Threshold[8][2];
} __svml_dexp10_data_internal_avx512;
#endif
__svml_dexp10_data_internal_avx512:
	/* Exp_tbl_H: 2^(j/16) for j = 0..15 */
	.quad	0x3ff0000000000000
	.quad	0x3ff0b5586cf9890f
	.quad	0x3ff172b83c7d517b
	.quad	0x3ff2387a6e756238
	.quad	0x3ff306fe0a31b715
	.quad	0x3ff3dea64c123422
	.quad	0x3ff4bfdad5362a27
	.quad	0x3ff5ab07dd485429
	.quad	0x3ff6a09e667f3bcd
	.quad	0x3ff7a11473eb0187
	.quad	0x3ff8ace5422aa0db
	.quad	0x3ff9c49182a3f090
	.quad	0x3ffae89f995ad3ad
	.quad	0x3ffc199bdd85529c
	.quad	0x3ffd5818dcfba487
	.quad	0x3ffea4afa2a490da
	/* L2E = log2(10) ~ 3.321928 (field name inherited from exp) */
	.align	64
	.quad	0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371, 0x400A934F0979A371
	/* Shifter=2^(52-4)*1.5 (low bits also seed the vpermt2pd table index) */
	.align	64
	.quad	0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
	/* L2H = log10(2) high part ~ 0.3010300 */
	.align	64
	.quad	0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
	/* L2L = log10(2) low part (negative correction term) */
	.align	64
	.quad	0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21, 0xbc49dc1da994fd21
	/* EMask: keeps sign, clamps magnitude so |R| < 2 */
	.align	64
	.quad	0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
	/* poly_coeff6 ~ ln(10)^6/6! (minimax-adjusted) */
	.align	64
	.quad	0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020, 0x3fcb137ed8ac2020
	/* poly_coeff5 ~ ln(10)^5/5! */
	.align	64
	.quad	0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424, 0x3fe141a8e24f9424
	/* poly_coeff4 ~ ln(10)^4/4! */
	.align	64
	.quad	0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d, 0x3ff2bd77a0926c9d
	/* poly_coeff3 ~ ln(10)^3/3! */
	.align	64
	.quad	0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8, 0x40004705908704c8
	/* poly_coeff2 ~ ln(10)^2/2! */
	.align	64
	.quad	0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25, 0x40053524c73dfe25
	/* poly_coeff1 ~ ln(10) ~ 2.302585 */
	.align	64
	.quad	0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2, 0x40026bb1bbb554c2
	/* AbsMask: clears the sign bit */
	.align	64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* Threshold ~ 307.65 ~ 1022*log10(2): |x| beyond this takes the
	   scalar special-value path */
	.align	64
	.quad	0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41, 0x40733A7146F72A41
	.align	64
	.type	__svml_dexp10_data_internal_avx512, @object
	.size	__svml_dexp10_data_internal_avx512, .-__svml_dexp10_data_internal_avx512
287

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S