/* Function expm1 vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *   After computing exp(x) in high-low parts, an accurate computation is performed to obtain exp(x)-1
 *   Typical exp() implementation, except that:
 *    - tables are small (16 elements), allowing for fast gathers
 *    - all arguments processed in the main path
 *    - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
 *    - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
 *    - RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
 *
 */
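
/* As a rough, illustrative C-level sketch, the main path below computes
   (constant names refer to the table fields defined at the end of this
   file, c2..c7 abbreviate poly_coeff2..poly_coeff7, and "scalef" stands
   for the VSCALEF operation; none of these are real helpers in this file):

	k   = x * log2(e) + Shifter;		// Shifter = 1.5*2^(52-4) (+ bias bits)
	Z0  = k - Shifter;			// x*log2(e) rounded to 4 fractional bits
	j   = (bit pattern of k) & 0xf;		// selects the 16-entry table slot
	Th  = Exp_tbl_H[j];  Tl = Exp_tbl_L[j];
	R   = (x - Z0*L2H) - Z0*L2L;		// reduced argument, |R| kept < 2 via EMask
	p   = R*R*(c2 + R*(c3 + R*(c4 + R*(c5 + R*(c6 + R*c7)))));
	Ths = scalef (Th, max (Z0, ZThres));	// Th * 2^floor(Z0)
	res = (Ths - 1.0) + Ths*(Tl + R + p);	// sign of x is OR-ed back in at the end

   Lanes whose input is not below Threshold (including NaN) are redone by
   the scalar fallback path at the bottom of the function.  */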

/* Offsets for data table __svml_dexpm1_data_internal_avx512
 */
#define Exp_tbl_H 0
#define Exp_tbl_L 128
#define L2E 256
#define Shifter 320
#define Threshold 384
#define SgnMask 448
#define L2H 512
#define L2L 576
#define ZThres 640
#define EMask 704
#define poly_coeff7 768
#define poly_coeff6 832
#define poly_coeff5 896
#define poly_coeff4 960
#define poly_coeff3 1024
#define poly_coeff2 1088
#define One 1152
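
/* Each value above is the byte offset of the corresponding 64-byte-aligned
   field of __svml_dexpm1_data_internal_avx512 defined at the end of this
   file: the two 16-entry tables take 128 bytes each, and every later field
   is one 64-byte block of 8 identical doubles (e.g. One = 256 + 14*64 =
   1152).  */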

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_expm1_skx)
	pushq %rbp
	cfi_def_cfa_offset(16)
	movq %rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq $-64, %rsp
	subq $192, %rsp
	vmovups L2E+__svml_dexpm1_data_internal_avx512(%rip), %zmm6
	vmovups Shifter+__svml_dexpm1_data_internal_avx512(%rip), %zmm4
	vmovups L2H+__svml_dexpm1_data_internal_avx512(%rip), %zmm11
	vmovups L2L+__svml_dexpm1_data_internal_avx512(%rip), %zmm5
	vmovups Threshold+__svml_dexpm1_data_internal_avx512(%rip), %zmm3
	vmovups poly_coeff5+__svml_dexpm1_data_internal_avx512(%rip), %zmm13
	vmovups poly_coeff4+__svml_dexpm1_data_internal_avx512(%rip), %zmm15

	/* polynomial */
	vmovups poly_coeff7+__svml_dexpm1_data_internal_avx512(%rip), %zmm12

	/* set Z0=max(Z0, -128.0) */
	vmovups ZThres+__svml_dexpm1_data_internal_avx512(%rip), %zmm8
	vmovups poly_coeff3+__svml_dexpm1_data_internal_avx512(%rip), %zmm14
	vmovups __svml_dexpm1_data_internal_avx512(%rip), %zmm9
	vmovaps %zmm0, %zmm2

	/* 2^(52-4)*1.5 + x * log2(e) */
	vfmadd213pd {rn-sae}, %zmm4, %zmm2, %zmm6
	vmovups Exp_tbl_L+__svml_dexpm1_data_internal_avx512(%rip), %zmm0
	vcmppd $21, {sae}, %zmm3, %zmm2, %k0

	/* Z0 ~ x*log2(e), rounded to 4 fractional bits */
	vsubpd {rn-sae}, %zmm4, %zmm6, %zmm7
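
	/* Adding the large Shifter constant forces the sum to round at the
	   2^-4 bit, so subtracting it back yields x*log2(e) quantized to
	   multiples of 1/16; e.g. (roughly) x = 1.0 gives x*log2(e) ~ 1.4427,
	   hence Z0 = 23/16 = 1.4375 and table index 23 % 16 = 7.  */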
	vpermt2pd Exp_tbl_H+64+__svml_dexpm1_data_internal_avx512(%rip), %zmm6, %zmm9
	vpermt2pd Exp_tbl_L+64+__svml_dexpm1_data_internal_avx512(%rip), %zmm6, %zmm0
	vandpd SgnMask+__svml_dexpm1_data_internal_avx512(%rip), %zmm2, %zmm1

	/* R = x - Z0*log(2) */
	vfnmadd213pd {rn-sae}, %zmm2, %zmm7, %zmm11
	vmaxpd {sae}, %zmm8, %zmm7, %zmm10
	vfnmadd231pd {rn-sae}, %zmm7, %zmm5, %zmm11
	kmovw %k0, %edx

	/* ensure |R|<2 even for special cases */
	vandpd EMask+__svml_dexpm1_data_internal_avx512(%rip), %zmm11, %zmm3
	vmovups poly_coeff6+__svml_dexpm1_data_internal_avx512(%rip), %zmm11

	/* scale Th */
	vscalefpd {rn-sae}, %zmm10, %zmm9, %zmm4
	vfmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
	vfmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm11
	vmovups poly_coeff2+__svml_dexpm1_data_internal_avx512(%rip), %zmm12
	vmulpd {rn-sae}, %zmm3, %zmm3, %zmm13
	vfmadd231pd {rn-sae}, %zmm3, %zmm14, %zmm12
	vfmadd213pd {rn-sae}, %zmm15, %zmm13, %zmm11
	vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm11

	/* Tlr + R + R*Poly */
	vfmadd213pd {rn-sae}, %zmm0, %zmm13, %zmm11

	/* Th - 1 */
	vmovups One+__svml_dexpm1_data_internal_avx512(%rip), %zmm0
	vaddpd {rn-sae}, %zmm3, %zmm11, %zmm14
	vsubpd {rn-sae}, %zmm0, %zmm4, %zmm15

	/* (Th-1) + Th*(Tlr + R + R*Poly) */
	vfmadd213pd {rn-sae}, %zmm15, %zmm14, %zmm4
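
	/* Merge the sign of x back into the result, presumably so that
	   expm1 (-0.0) returns -0.0.  */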
	vorpd %zmm1, %zmm4, %zmm0
	testl %edx, %edx

	/* Go to special inputs processing branch */
	jne L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm2

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq %rbp, %rsp
	popq %rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */
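
	/* In rough C terms, this path amounts to (illustrative only):

	     for (i = 0; i < 8; i++)
	       if ((mask >> i) & 1)		// lane flagged by the vcmppd above
	         result[i] = expm1 (input[i]);	// scalar call through the PLT
	 */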

L(SPECIAL_VALUES_BRANCH):
	vmovups %zmm2, 64(%rsp)
	vmovups %zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl %eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq %r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl %eax, %r12d
	movq %r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl %edx, %r13d
	movq %r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl %r12d, %r13d

	/* Call scalar math function */
	jc L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl %r12d
	cmpl $8, %r12d

	/* Check bits in range mask */
	jl L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq 16(%rsp), %r12
	cfi_restore(12)
	movq 8(%rsp), %r13
	cfi_restore(13)
	movq (%rsp), %r14
	cfi_restore(14)
	vmovups 128(%rsp), %zmm0

	/* Go to exit */
	jmp L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl %r12d, %r14d
	vmovsd 64(%rsp, %r14, 8), %xmm0
	call expm1@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd %xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_expm1_skx)

	.section .rodata, "a"
	.align 64

#ifdef __svml_dexpm1_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
	__declspec(align(64)) VUINT32 Exp_tbl_L[16][2];
	__declspec(align(64)) VUINT32 L2E[8][2];
	__declspec(align(64)) VUINT32 Shifter[8][2];
	__declspec(align(64)) VUINT32 Threshold[8][2];
	__declspec(align(64)) VUINT32 SgnMask[8][2];
	__declspec(align(64)) VUINT32 L2H[8][2];
	__declspec(align(64)) VUINT32 L2L[8][2];
	__declspec(align(64)) VUINT32 ZThres[8][2];
	__declspec(align(64)) VUINT32 EMask[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 One[8][2];
} __svml_dexpm1_data_internal_avx512;
#endif
__svml_dexpm1_data_internal_avx512:
	/* Exp_tbl_H */
	.quad 0x3ff0000000000000
	.quad 0x3ff0b5586cf9890f
	.quad 0x3ff172b83c7d517b
	.quad 0x3ff2387a6e756238
	.quad 0x3ff306fe0a31b715
	.quad 0x3ff3dea64c123422
	.quad 0x3ff4bfdad5362a27
	.quad 0x3ff5ab07dd485429
	.quad 0x3ff6a09e667f3bcd
	.quad 0x3ff7a11473eb0187
	.quad 0x3ff8ace5422aa0db
	.quad 0x3ff9c49182a3f090
	.quad 0x3ffae89f995ad3ad
	.quad 0x3ffc199bdd85529c
	.quad 0x3ffd5818dcfba487
	.quad 0x3ffea4afa2a490da
	/* Exp_tbl_L */
	.align 64
	.quad 0x0000000000000000
	.quad 0x3c979aa65d837b6d
	.quad 0xbc801b15eaa59348
	.quad 0x3c968efde3a8a894
	.quad 0x3c834d754db0abb6
	.quad 0x3c859f48a72a4c6d
	.quad 0x3c7690cebb7aafb0
	.quad 0x3c9063e1e21c5409
	.quad 0xbc93b3efbf5e2228
	.quad 0xbc7b32dcb94da51d
	.quad 0x3c8db72fc1f0eab4
	.quad 0x3c71affc2b91ce27
	.quad 0x3c8c1a7792cb3387
	.quad 0x3c736eae30af0cb3
	.quad 0x3c74a385a63d07a7
	.quad 0xbc8ff7128fd391f0
	/* log2(e) */
	.align 64
	.quad 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE
	/* Shifter=2^(52-4)*1.5 */
	.align 64
	.quad 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
	/* Threshold */
	.align 64
	.quad 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44
	/* Sgn */
	.align 64
	.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
	/* L2H = log(2)_high */
	.align 64
	.quad 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef
	/* L2L = log(2)_low */
	.align 64
	.quad 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f
	/* ZThres */
	.align 64
	.quad 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000
	/* EMask */
	.align 64
	.quad 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
	/* poly_coeff7 */
	.align 64
	.quad 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a
	/* poly_coeff6 */
	.align 64
	.quad 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f
	/* poly_coeff5 */
	.align 64
	.quad 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214
	/* poly_coeff4 */
	.align 64
	.quad 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06
	/* poly_coeff3 */
	.align 64
	.quad 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656
	/* poly_coeff2 */
	.align 64
	.quad 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2
	/* One */
	.align 64
	.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	.align 64
	.type __svml_dexpm1_data_internal_avx512, @object
	.size __svml_dexpm1_data_internal_avx512, .-__svml_dexpm1_data_internal_avx512
