1/* Function expm1f vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 * After computing exp(x) in high-low parts, an accurate computation is performed to obtain exp(x)-1
22 * Typical exp() implementation, except that:
23 * - tables are small (32 elements), allowing for fast gathers
24 * - all arguments processed in the main path
25 * - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
26 * - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
27 * - RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
28 *
29 *
30 */
31
32/* Offsets for data table __svml_sexpm1_data_internal_avx512
33 */
34#define Exp_tbl_H 0
35#define Exp_tbl_L 128
36#define L2E 256
37#define Shifter 320
38#define Threshold 384
39#define SgnMask 448
40#define L2H 512
41#define L2L 576
42#define EMask 640
43#define poly_coeff3 704
44#define poly_coeff2 768
45#define One 832
46
47#include <sysdep.h>
48
49 .section .text.evex512, "ax", @progbits
50ENTRY(_ZGVeN16v_expm1f_skx)
51 pushq %rbp
52 cfi_def_cfa_offset(16)
53 movq %rsp, %rbp
54 cfi_def_cfa(6, 16)
55 cfi_offset(6, -16)
56 andq $-64, %rsp
57 subq $192, %rsp
58 vmovups L2E+__svml_sexpm1_data_internal_avx512(%rip), %zmm5
59 vmovups Shifter+__svml_sexpm1_data_internal_avx512(%rip), %zmm3
60 vmovups L2H+__svml_sexpm1_data_internal_avx512(%rip), %zmm8
61 vmovups L2L+__svml_sexpm1_data_internal_avx512(%rip), %zmm4
62 vmovups __svml_sexpm1_data_internal_avx512(%rip), %zmm6
63
64 /* polynomial */
65 vmovups poly_coeff3+__svml_sexpm1_data_internal_avx512(%rip), %zmm9
66 vmovups poly_coeff2+__svml_sexpm1_data_internal_avx512(%rip), %zmm12
67 vmovups Exp_tbl_L+__svml_sexpm1_data_internal_avx512(%rip), %zmm11
68 vmovups Threshold+__svml_sexpm1_data_internal_avx512(%rip), %zmm2
69
70 /* Th - 1 */
71 vmovups One+__svml_sexpm1_data_internal_avx512(%rip), %zmm14
72 vmovaps %zmm0, %zmm1
73
74 /* 2^(52-5)*1.5 + x * log2(e) */
75 vfmadd213ps {rn-sae}, %zmm3, %zmm1, %zmm5
76 vcmpps $29, {sae}, %zmm2, %zmm1, %k0
77
78 /* Z0 ~ x*log2(e), rounded to 5 fractional bits */
79 vsubps {rn-sae}, %zmm3, %zmm5, %zmm7
80 vpermt2ps Exp_tbl_H+64+__svml_sexpm1_data_internal_avx512(%rip), %zmm5, %zmm6
81 vpermt2ps Exp_tbl_L+64+__svml_sexpm1_data_internal_avx512(%rip), %zmm5, %zmm11
82 vandps SgnMask+__svml_sexpm1_data_internal_avx512(%rip), %zmm1, %zmm0
83
84 /* R = x - Z0*log(2) */
85 vfnmadd213ps {rn-sae}, %zmm1, %zmm7, %zmm8
86
87 /* scale Th */
88 vscalefps {rn-sae}, %zmm7, %zmm6, %zmm2
89 vfnmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
90 kmovw %k0, %edx
91
92 /* ensure |R|<2 even for special cases */
93 vandps EMask+__svml_sexpm1_data_internal_avx512(%rip), %zmm8, %zmm13
94 vsubps {rn-sae}, %zmm14, %zmm2, %zmm8
95 vmulps {rn-sae}, %zmm13, %zmm13, %zmm10
96 vfmadd231ps {rn-sae}, %zmm13, %zmm9, %zmm12
97
98 /* Tlr + R+ R2*Poly */
99 vfmadd213ps {rn-sae}, %zmm11, %zmm10, %zmm12
100 vaddps {rn-sae}, %zmm13, %zmm12, %zmm15
101
102 /* (Th-1)+Th*(Tlr + R+ R*Poly) */
103 vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
104 vorps %zmm0, %zmm2, %zmm0
105 testl %edx, %edx
106
107 /* Go to special inputs processing branch */
108 jne L(SPECIAL_VALUES_BRANCH)
109 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
110
111 /* Restore registers
112 * and exit the function
113 */
114
115L(EXIT):
116 movq %rbp, %rsp
117 popq %rbp
118 cfi_def_cfa(7, 8)
119 cfi_restore(6)
120 ret
121 cfi_def_cfa(6, 16)
122 cfi_offset(6, -16)
123
124 /* Branch to process
125 * special inputs
126 */
127
128L(SPECIAL_VALUES_BRANCH):
129 vmovups %zmm1, 64(%rsp)
130 vmovups %zmm0, 128(%rsp)
131 # LOE rbx r12 r13 r14 r15 edx zmm0
132
133 xorl %eax, %eax
134 # LOE rbx r12 r13 r14 r15 eax edx
135
136 vzeroupper
137 movq %r12, 16(%rsp)
138 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
139 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
140 movl %eax, %r12d
141 movq %r13, 8(%rsp)
142 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
143 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
144 movl %edx, %r13d
145 movq %r14, (%rsp)
146 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
147 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
148 # LOE rbx r15 r12d r13d
149
150 /* Range mask
151 * bits check
152 */
153
154L(RANGEMASK_CHECK):
155 btl %r12d, %r13d
156
157 /* Call scalar math function */
158 jc L(SCALAR_MATH_CALL)
159 # LOE rbx r15 r12d r13d
160
161 /* Special inputs
162 * processing loop
163 */
164
165L(SPECIAL_VALUES_LOOP):
166 incl %r12d
167 cmpl $16, %r12d
168
169 /* Check bits in range mask */
170 jl L(RANGEMASK_CHECK)
171 # LOE rbx r15 r12d r13d
172
173 movq 16(%rsp), %r12
174 cfi_restore(12)
175 movq 8(%rsp), %r13
176 cfi_restore(13)
177 movq (%rsp), %r14
178 cfi_restore(14)
179 vmovups 128(%rsp), %zmm0
180
181 /* Go to exit */
182 jmp L(EXIT)
183 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
184 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
185 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
186 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
187 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
188 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
189 # LOE rbx r12 r13 r14 r15 zmm0
190
191 /* Scalar math function call
192 * to process special input
193 */
194
195L(SCALAR_MATH_CALL):
196 movl %r12d, %r14d
197 vmovss 64(%rsp, %r14, 4), %xmm0
198 call expm1f@PLT
199 # LOE rbx r14 r15 r12d r13d xmm0
200
201 vmovss %xmm0, 128(%rsp, %r14, 4)
202
203 /* Process special inputs in loop */
204 jmp L(SPECIAL_VALUES_LOOP)
205 # LOE rbx r15 r12d r13d
206END(_ZGVeN16v_expm1f_skx)
207
208 .section .rodata, "a"
209 .align 64
210
211#ifdef __svml_sexpm1_data_internal_avx512_typedef
212typedef unsigned int VUINT32;
213typedef struct {
214 __declspec(align(64)) VUINT32 Exp_tbl_H[32][1];
215 __declspec(align(64)) VUINT32 Exp_tbl_L[32][1];
216 __declspec(align(64)) VUINT32 L2E[16][1];
217 __declspec(align(64)) VUINT32 Shifter[16][1];
218 __declspec(align(64)) VUINT32 Threshold[16][1];
219 __declspec(align(64)) VUINT32 SgnMask[16][1];
220 __declspec(align(64)) VUINT32 L2H[16][1];
221 __declspec(align(64)) VUINT32 L2L[16][1];
222 __declspec(align(64)) VUINT32 EMask[16][1];
223 __declspec(align(64)) VUINT32 poly_coeff3[16][1];
224 __declspec(align(64)) VUINT32 poly_coeff2[16][1];
225 __declspec(align(64)) VUINT32 One[16][1];
226} __svml_sexpm1_data_internal_avx512;
227#endif
228__svml_sexpm1_data_internal_avx512:
229 /* Exp_tbl_H */
230 .long 0x3f800000, 0x3f82cd87, 0x3f85aac3, 0x3f88980f
231 .long 0x3f8b95c2, 0x3f8ea43a, 0x3f91c3d3, 0x3f94f4f0
232 .long 0x3f9837f0, 0x3f9b8d3a, 0x3f9ef532, 0x3fa27043
233 .long 0x3fa5fed7, 0x3fa9a15b, 0x3fad583f, 0x3fb123f6
234 .long 0x3fb504f3, 0x3fb8fbaf, 0x3fbd08a4, 0x3fc12c4d
235 .long 0x3fc5672a, 0x3fc9b9be, 0x3fce248c, 0x3fd2a81e
236 .long 0x3fd744fd, 0x3fdbfbb8, 0x3fe0ccdf, 0x3fe5b907
237 .long 0x3feac0c7, 0x3fefe4ba, 0x3ff5257d, 0x3ffa83b3
238 /* Exp_tbl_L */
239 .align 64
240 .long 0x00000000, 0xb34a3a0a, 0x3346cb6a, 0xb36ed17e
241 .long 0xb24e0611, 0xb3517dd9, 0x334b2482, 0xb31586de
242 .long 0x33092801, 0xb2e6f467, 0x331b85f2, 0x3099b6f1
243 .long 0xb3051aa8, 0xb2e2a0da, 0xb2006c56, 0xb3365942
244 .long 0x329302ae, 0x32c595dc, 0xb302e5a2, 0xb28e10a1
245 .long 0x31b3d0e5, 0xb31a472b, 0x31d1daf2, 0xb305bf64
246 .long 0xb27ce182, 0xb2f26443, 0xb1b4b0da, 0xb1da8a8f
247 .long 0xb1d290be, 0xb2d5b899, 0x31b0a147, 0xb2156afc
248 /* log2(e) */
249 .align 64
250 .long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B
251 /* Shifter=2^(23-5)*1.5 */
252 .align 64
253 .long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000
254 /* Threshold */
255 .align 64
256 .long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B
257 /* Sgn */
258 .align 64
259 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
260 /* L2H = log(2)_high */
261 .align 64
262 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
263 /* L2L = log(2)_low */
264 .align 64
265 .long 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308, 0xb102e308
266 /* EMask */
267 .align 64
268 .long 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff, 0xbfffffff
269 /* poly_coeff3 */
270 .align 64
271 .long 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3, 0x3e2AABF3
272 /* poly_coeff2 */
273 .align 64
274 .long 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6, 0x3f0000F6
275 /* One */
276 .align 64
277 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
278 .align 64
279 .type __svml_sexpm1_data_internal_avx512, @object
280 .size __svml_sexpm1_data_internal_avx512, .-__svml_sexpm1_data_internal_avx512
281

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S