1/* Function exp10f vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 * Typical exp10() implementation, except that:
22 * - tables are small (32 elements each), allowing for fast in-register lookups (VPERMT2PS)
23 * - all arguments processed in the main path; lanes with |x| >= Threshold are then recomputed by scalar exp10f
24 * - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
25 * - a VRANGE against EMask (0.5) is used to bound the reduced argument R, even for large inputs
26 * - RZ mode used to avoid overflow to +/-Inf for x*log2(10); helps with special case handling
27 * - SAE used to avoid spurious flag settings
28 *
29 */
30
31/* Byte offsets of each field inside the data table
 * __svml_sexp10_data_internal_avx512.
 * The two lookup tables (Exp_tbl_L, Exp_tbl_H) hold 32 floats = 128 bytes
 * each; every other field is one 16-float vector = 64 bytes.
32 */
33#define Exp_tbl_L 0
34#define Exp_tbl_H 128
35#define L2E 256
36#define Shifter 320
37#define L2H 384
38#define L2L 448
39#define EMask 512
40#define AbsMask 576
41#define Threshold 640
42#define poly_coeff2 704
43#define poly_coeff1 768
44
45#include <sysdep.h>
46
47 .section .text.evex512, "ax", @progbits
/* _ZGVeN16v_exp10f_skx: exp10f evaluated on the 16 single-precision
 * lanes of one ZMM register.
 *   In:   zmm0 = x (16 floats)
 *   Out:  zmm0 = result for each lane
 * Stack frame: rbp-based; rsp is aligned down to 64 bytes and 192 bytes
 * are reserved.  Only the special-values path touches them:
 *   0, 8, 16(%rsp)   save slots for r14, r13, r12
 *   64..127(%rsp)    copy of the input vector
 *   128..191(%rsp)   copy of the result vector
 * Lanes with |x| >= Threshold are recomputed one at a time by calling
 * the scalar exp10f.
 */
48ENTRY(_ZGVeN16v_exp10f_skx)
49 pushq %rbp
50 cfi_def_cfa_offset(16)
51 movq %rsp, %rbp
52 cfi_def_cfa(6, 16)
53 cfi_offset(6, -16)
54 andq $-64, %rsp
55 subq $192, %rsp
 /* zmm2 = L2E, zmm1 = Shifter, zmm5 = L2H, zmm4 = L2L */
56 vmovups L2E+__svml_sexp10_data_internal_avx512(%rip), %zmm2
57 vmovups Shifter+__svml_sexp10_data_internal_avx512(%rip), %zmm1
58 vmovups L2H+__svml_sexp10_data_internal_avx512(%rip), %zmm5
59 vmovups L2L+__svml_sexp10_data_internal_avx512(%rip), %zmm4
60
61 /* zmm6 = EMask (0.5 per lane): bound applied to |R| by VRANGEPS below, even for special cases */
62 vmovups EMask+__svml_sexp10_data_internal_avx512(%rip), %zmm6
63 vmovups poly_coeff2+__svml_sexp10_data_internal_avx512(%rip), %zmm9
64
65 /* zmm2 = 2^(23-10)*1.5 + x * log2(10), rounded toward zero */
66 vfmadd213ps {rz-sae}, %zmm1, %zmm0, %zmm2
67 vmovups poly_coeff1+__svml_sexp10_data_internal_avx512(%rip), %zmm10
 /* zmm8 / zmm15 = first 16 entries of Exp_tbl_L / Exp_tbl_H */
68 vmovups __svml_sexp10_data_internal_avx512(%rip), %zmm8
69 vmovups Exp_tbl_H+__svml_sexp10_data_internal_avx512(%rip), %zmm15
70 vmovups Threshold+__svml_sexp10_data_internal_avx512(%rip), %zmm13
 /* zmm3 = shifted value >> 5: permute index for Exp_tbl_H
  * (zmm2 itself is the permute index for Exp_tbl_L) */
71 vpsrld $5, %zmm2, %zmm3
72
73 /* zmm1 = Z0 ~ x*log2(10), rounded down to 10 fractional bits */
74 vsubps {rn-sae}, %zmm1, %zmm2, %zmm1
 /* 32-entry lookups: register holds entries 0..15, memory operand
  * (table + 64 bytes) holds entries 16..31.  zmm8 = Tl, zmm15 = Th */
75 vpermt2ps Exp_tbl_L+64+__svml_sexp10_data_internal_avx512(%rip), %zmm2, %zmm8
76 vpermt2ps Exp_tbl_H+64+__svml_sexp10_data_internal_avx512(%rip), %zmm3, %zmm15
 /* zmm12 = |x| */
77 vandps AbsMask+__svml_sexp10_data_internal_avx512(%rip), %zmm0, %zmm12
78
79 /* R = x - Z0*log10(2), using the L2H/L2L split: zmm5 = x - Z0*L2H */
80 vfnmadd213ps {rn-sae}, %zmm0, %zmm1, %zmm5
 /* k0 = lanes with |x| >= Threshold (predicate 29: GE, ordered, quiet) */
81 vcmpps $29, {sae}, %zmm13, %zmm12, %k0
 /* zmm5 = R = (x - Z0*L2H) - Z0*L2L */
82 vfnmadd231ps {rn-sae}, %zmm1, %zmm4, %zmm5
 /* edx = special-lane bit mask, tested before the branch below */
83 kmovw %k0, %edx
 /* zmm11 = R with magnitude limited to EMask (imm 2: min of absolute
  * values, sign taken from R) */
84 vrangeps $2, {sae}, %zmm6, %zmm5, %zmm11
 /* zmm10 = poly_coeff1 + poly_coeff2*R; zmm14 = R*(poly_coeff1 + poly_coeff2*R) */
85 vfmadd231ps {rn-sae}, %zmm11, %zmm9, %zmm10
86 vmulps {rn-sae}, %zmm11, %zmm10, %zmm14
87
88 /* k1 = lanes where x != 0 (predicate 4: not-equal, unordered) */
89 vpxord %zmm7, %zmm7, %zmm7
90 vcmpps $4, {sae}, %zmm7, %zmm0, %k1
91
92 /* zmm15 = Th*Tl for lanes in k1; other lanes keep Th (merge masking).
  * NOTE(review): presumably keeps exp10f(0) exactly 1.0, since
  * Exp_tbl_L[0] is 0x3f800001 rather than 1.0 -- confirm. */
93 vmulps {rn-sae}, %zmm8, %zmm15, %zmm15{%k1}
 /* zmm15 = T + T*poly, where T is the table product above */
94 vfmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm15
 /* zmm1 = zmm15 * 2^floor(Z0) */
95 vscalefps {rn-sae}, %zmm1, %zmm15, %zmm1
96 testl %edx, %edx
97
98 /* Go to special inputs processing branch if any lane is flagged in edx */
99 jne L(SPECIAL_VALUES_BRANCH)
100 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
101
102 /* Restore registers
103 * and exit the function
104 * (result is moved from zmm1 to zmm0, frame is torn down via rbp)
 */
105
106L(EXIT):
107 vmovaps %zmm1, %zmm0
108 movq %rbp, %rsp
109 popq %rbp
110 cfi_def_cfa(7, 8)
111 cfi_restore(6)
112 ret
113 cfi_def_cfa(6, 16)
114 cfi_offset(6, -16)
115
116 /* Branch to process
117 * special inputs:
 * spill input (64(%rsp)) and main-path result (128(%rsp)), then
 * walk the 16 mask bits and patch flagged lanes with scalar results
118 */
119
120L(SPECIAL_VALUES_BRANCH):
121 vmovups %zmm0, 64(%rsp)
122 vmovups %zmm1, 128(%rsp)
123 # LOE rbx r12 r13 r14 r15 edx zmm1
124
125 xorl %eax, %eax
126 # LOE rbx r12 r13 r14 r15 eax edx
127
 /* clear upper vector state before running scalar code */
128 vzeroupper
 /* r12-r14 are saved in the frame, then reused:
  * r12d = lane index (starts at 0), r13d = special-lane mask */
129 movq %r12, 16(%rsp)
130 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
131 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
132 movl %eax, %r12d
133 movq %r13, 8(%rsp)
134 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
135 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
136 movl %edx, %r13d
137 movq %r14, (%rsp)
138 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
139 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
140 # LOE rbx r15 r12d r13d
141
142 /* Range mask
143 * bits check: CF = bit number r12d of the mask in r13d
144 */
145
146L(RANGEMASK_CHECK):
147 btl %r12d, %r13d
148
149 /* Call scalar math function if this lane is flagged */
150 jc L(SCALAR_MATH_CALL)
151 # LOE rbx r15 r12d r13d
152
153 /* Special inputs
154 * processing loop: advance to the next lane, stop after lane 15
155 */
156
157L(SPECIAL_VALUES_LOOP):
158 incl %r12d
159 cmpl $16, %r12d
160
161 /* Check bits in range mask */
162 jl L(RANGEMASK_CHECK)
163 # LOE rbx r15 r12d r13d
164
 /* all lanes done: restore r12-r14 and reload the patched result vector */
165 movq 16(%rsp), %r12
166 cfi_restore(12)
167 movq 8(%rsp), %r13
168 cfi_restore(13)
169 movq (%rsp), %r14
170 cfi_restore(14)
171 vmovups 128(%rsp), %zmm1
172
173 /* Go to exit */
174 jmp L(EXIT)
175 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
176 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
177 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
178 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
179 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
180 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
181 # LOE rbx r12 r13 r14 r15 zmm1
182
183 /* Scalar math function call
184 * to process special input:
 * load lane r12d of the saved input, call exp10f, and store the
 * scalar result over the same lane of the saved result vector
185 */
186
187L(SCALAR_MATH_CALL):
 /* r14 = zero-extended lane index, used for addressing on both sides of the call */
188 movl %r12d, %r14d
189 vmovss 64(%rsp, %r14, 4), %xmm0
190 call exp10f@PLT
191 # LOE rbx r14 r15 r12d r13d xmm0
192
193 vmovss %xmm0, 128(%rsp, %r14, 4)
194
195 /* Process special inputs in loop */
196 jmp L(SPECIAL_VALUES_LOOP)
197 # LOE rbx r15 r12d r13d
198END(_ZGVeN16v_exp10f_skx)
199
200 .section .rodata, "a"
201 .align 64
202
/* Constant table read by _ZGVeN16v_exp10f_skx.  Every field starts on a
 * 64-byte boundary; scalar constants are replicated across all 16 lanes.
 * The typedef below describes the same layout in C form and is only
 * seen when its guard macro is defined.
 */
203#ifdef __svml_sexp10_data_internal_avx512_typedef
204typedef unsigned int VUINT32;
205typedef struct {
206 __declspec(align(64)) VUINT32 Exp_tbl_L[32][1];
207 __declspec(align(64)) VUINT32 Exp_tbl_H[32][1];
208 __declspec(align(64)) VUINT32 L2E[16][1];
209 __declspec(align(64)) VUINT32 Shifter[16][1];
210 __declspec(align(64)) VUINT32 L2H[16][1];
211 __declspec(align(64)) VUINT32 L2L[16][1];
212 __declspec(align(64)) VUINT32 EMask[16][1];
213 __declspec(align(64)) VUINT32 AbsMask[16][1];
214 __declspec(align(64)) VUINT32 Threshold[16][1];
215 __declspec(align(64)) VUINT32 poly_coeff2[16][1];
216 __declspec(align(64)) VUINT32 poly_coeff1[16][1];
217} __svml_sexp10_data_internal_avx512;
218#endif
219__svml_sexp10_data_internal_avx512:
220 /* Exp_tbl_L: 32 floats, approximately 2^(j/1024) for j = 0..31 (entry 0 is 0x3f800001, one ulp above 1.0) */
221 .long 0x3f800001, 0x3f801631, 0x3f802c65, 0x3f80429d
222 .long 0x3f8058d9, 0x3f806f18, 0x3f80855c, 0x3f809ba3
223 .long 0x3f80b1ee, 0x3f80c83d, 0x3f80de90, 0x3f80f4e7
224 .long 0x3f810b42, 0x3f8121a0, 0x3f813803, 0x3f814e69
225 .long 0x3f8164d3, 0x3f817b41, 0x3f8191b3, 0x3f81a829
226 .long 0x3f81bea2, 0x3f81d520, 0x3f81eba2, 0x3f820227
227 .long 0x3f8218b0, 0x3f822f3d, 0x3f8245cf, 0x3f825c64
228 .long 0x3f8272fd, 0x3f828999, 0x3f82a03a, 0x3f82b6df
229 /* Exp_tbl_H: 32 floats, approximately 2^(j/32) for j = 0..31 */
230 .align 64
231 .long 0x3f800000, 0x3f82cd87, 0x3f85aac3, 0x3f88980f
232 .long 0x3f8b95c2, 0x3f8ea43a, 0x3f91c3d3, 0x3f94f4f0
233 .long 0x3f9837f0, 0x3f9b8d3a, 0x3f9ef532, 0x3fa27043
234 .long 0x3fa5fed7, 0x3fa9a15b, 0x3fad583f, 0x3fb123f6
235 .long 0x3fb504f3, 0x3fb8fbaf, 0x3fbd08a4, 0x3fc12c4d
236 .long 0x3fc5672a, 0x3fc9b9be, 0x3fce248c, 0x3fd2a81e
237 .long 0x3fd744fd, 0x3fdbfbb8, 0x3fe0ccdf, 0x3fe5b907
238 .long 0x3feac0c7, 0x3fefe4ba, 0x3ff5257d, 0x3ffa83b3
239 /* L2E = log2(10) */
240 .align 64
241 .long 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78
242 /* Shifter=2^(23-10)*1.5 */
243 .align 64
244 .long 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000
245 /* L2H = log10(2)_high (~0.30103) */
246 .align 64
247 .long 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
248 /* L2L = log10(2)_low (small negative correction to L2H) */
249 .align 64
250 .long 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860
251 /* EMask = 0.5f: magnitude bound applied to the reduced argument by VRANGEPS */
252 .align 64
253 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
254 /* AbsMask: all bits except the sign bit */
255 .align 64
256 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
257 /* Threshold (~37.93): lanes with |x| >= this value take the scalar fallback */
258 .align 64
259 .long 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818
260 /* poly_coeff2 (~2.65): coefficient of R^2 in the polynomial */
261 .align 64
262 .long 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA
263 /* poly_coeff1 (~2.3026, close to ln(10)): coefficient of R in the polynomial */
264 .align 64
265 .long 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D
266 .align 64
267 .type __svml_sexp10_data_internal_avx512, @object
268 .size __svml_sexp10_data_internal_avx512, .-__svml_sexp10_data_internal_avx512
269

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S