/* Function expm1 vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *   N = (int)(x*2^k/log(2.0)),  R = x - N*log(2)/2^k
 *   exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
 *   expm1(x) = exp(x)-1 is then obtained via multi-precision computation
 *
 *
 */
28
/* Offsets for data table __svml_dexpm1_data_internal.

   All values are byte offsets from the start of the table.  The lookup
   table comes first and takes 2048 bytes (128 rows of two .quad values
   each), the four polynomial coefficient vectors take the next 64
   bytes, and every remaining field is a single 16-byte vector.  The
   numbers must be kept in step with the layout of the table defined in
   the .rodata section of this file.  */
#define Expm1_HA_table			0
#define poly_coeff			2048
#define Log2e				2112
#define L2H				2128
#define L2L				2144
#define ExpAddConst			2160
#define IndexMask			2176
#define ExpMask				2192
#define MOne				2208
#define AbsMask				2224
#define Threshold			2240
#define L2				2256

#include <sysdep.h>
45
/* _ZGVbN2v_expm1_sse4: expm1 on two packed doubles.

   In:	 %xmm0 = x (two double-precision lanes).
   Out:	 %xmm0 = expm1 (x) for each lane.
   Clobbers: %rax, %rcx, %rdx, %rsi, %xmm1-%xmm15 and the flags.  The
	 slow path additionally calls the scalar expm1 through the PLT,
	 so everything that call may clobber is clobbered as well.

   Stack frame (after %rsp has been aligned down to 32 bytes and 64
   bytes have been reserved); it is only written on the slow path:
	 0(%rsp)  saved %r14
	 8(%rsp)  saved %r13
	16(%rsp)  saved %r12
	32(%rsp)  copy of the input vector x
	48(%rsp)  result vector, patched lane by lane

   Fast path: every lane has |x| <= Threshold.  Any lane failing that
   test (the compare is "not less-or-equal", so NaN lanes fail it too)
   is recomputed with the scalar expm1.  */

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN2v_expm1_sse4)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$64, %rsp

	/* xmm2 = x (becomes R below); xmm7 = x * Log2e.  Adding and then
	   subtracting .FLT_10 (0x4338000000000000 = 1.5*2^52) leaves the
	   product rounded to an integral value: xmm7 = N as a double.
	   %rsi = base address of the data table for the indexed loads.  */
	movaps	%xmm0, %xmm2
	movups	Log2e+__svml_dexpm1_data_internal(%rip), %xmm7
	lea	__svml_dexpm1_data_internal(%rip), %rsi
	mulpd	%xmm0, %xmm7
	movups	.FLT_10(%rip), %xmm3
	addpd	%xmm3, %xmm7
	subpd	%xmm3, %xmm7

	/* argument reduction: R = x - N*L2H - N*L2L (xmm2) */
	movups	L2H+__svml_dexpm1_data_internal(%rip), %xmm4
	mulpd	%xmm7, %xmm4
	movups	L2L+__svml_dexpm1_data_internal(%rip), %xmm5
	mulpd	%xmm7, %xmm5
	subpd	%xmm4, %xmm2
	subpd	%xmm5, %xmm2

	/* polynomial
	   xmm14 = R^2
	   xmm12 = (coeff5*R + coeff4) * R^2
	   xmm13 = coeff3*R            (coeff2 and xmm12 are added later)
	   xmm15 = ExpAddConst + N; the low bits of this sum carry the
		   table index and the exponent, extracted below.  */
	movups	poly_coeff+__svml_dexpm1_data_internal(%rip), %xmm12
	movaps	%xmm2, %xmm14
	mulpd	%xmm2, %xmm12
	mulpd	%xmm2, %xmm14
	addpd	poly_coeff+16+__svml_dexpm1_data_internal(%rip), %xmm12
	movups	ExpAddConst+__svml_dexpm1_data_internal(%rip), %xmm15
	addpd	%xmm7, %xmm15
	mulpd	%xmm14, %xmm12
	movups	poly_coeff+32+__svml_dexpm1_data_internal(%rip), %xmm13
	mulpd	%xmm2, %xmm13

	/* table lookup
	   eax/ecx = (xmm15 & IndexMask) for lane 0 / lane 1.  IndexMask
		     is 0x7f0, so each value is a multiple of 16 and is
		     used directly as a byte offset into the table.
	   xmm6    = |x|, then the mask of lanes with !(|x| <= Threshold);
		     its sign bits are collected into edx.
	   xmm1    = x & ~AbsMask, i.e. the sign bit of x.
	   xmm15   = (xmm15 & ExpMask) << 41: bits 11..21 are moved to
		     bits 52..62, the exponent field of a double.  */
	movdqu	IndexMask+__svml_dexpm1_data_internal(%rip), %xmm8
	pand	%xmm15, %xmm8
	movups	AbsMask+__svml_dexpm1_data_internal(%rip), %xmm1
	pshufd	$2, %xmm8, %xmm9
	movaps	%xmm1, %xmm6
	movd	%xmm8, %eax
	andps	%xmm0, %xmm6
	movd	%xmm9, %ecx
	andnps	%xmm0, %xmm1
	movdqu	ExpMask+__svml_dexpm1_data_internal(%rip), %xmm11
	pand	%xmm11, %xmm15
	cmpnlepd Threshold+__svml_dexpm1_data_internal(%rip), %xmm6
	addpd	poly_coeff+48+__svml_dexpm1_data_internal(%rip), %xmm13
	movmskpd %xmm6, %edx
	psllq	$41, %xmm15

	/* T-1
	   xmm4  = MOne (-1.0).
	   xmm13 = coeff2 + coeff3*R + coeff4*R^2 + coeff5*R^3.
	   xmm3 / xmm10 = 16-byte table rows for lane 0 / lane 1; the
	   unpack instructions transpose them so that xmm3 holds the
	   second quad of both rows (Tl) and xmm6 the first quad (Th).  */
	movups	MOne+__svml_dexpm1_data_internal(%rip), %xmm4
	movslq	%eax, %rax
	movslq	%ecx, %rcx
	addpd	%xmm12, %xmm13
	movups	(%rsi, %rax), %xmm3
	movups	(%rsi, %rcx), %xmm10
	movaps	%xmm3, %xmm6
	unpckhpd %xmm10, %xmm3

	/* Th1 = (Th-1) + Tl
	   xmm3 = Tl scaled by the exponent value in xmm15.
	   xmm6 = Th bits OR-ed with the exponent bits, then minus 1,
		  then plus the scaled Tl.
	   xmm2 = R + R^2 * xmm13 (the polynomial part).  */
	mulpd	%xmm15, %xmm3
	mulpd	%xmm13, %xmm14
	unpcklpd %xmm10, %xmm6
	orps	%xmm15, %xmm6
	addpd	%xmm4, %xmm6
	addpd	%xmm14, %xmm2
	addpd	%xmm3, %xmm6

	/* T = Th+Tl
	   xmm5 = Th1 - (-1.0) = T; result = Th1 + T * poly (xmm6).
	   The final orps ORs the sign bit of x into the result.  */
	movaps	%xmm6, %xmm5
	subpd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm2
	addpd	%xmm2, %xmm6
	orps	%xmm1, %xmm6
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx xmm0 xmm6

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movaps	%xmm6, %xmm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the input and the fast-path result so that single lanes
	   can be read and overwritten through memory.  */
	movups	%xmm0, 32(%rsp)
	movups	%xmm6, 48(%rsp)
	# LOE rbx r12 r13 r14 r15 edx

	/* Save the callee-saved registers used by the loop:
	   r12d = lane index (starts at 0), r13d = lane mask from edx,
	   r14  = lane index kept across the scalar call.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the lane mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	/* Advance to the next lane; there are two lanes in total.  */
	incl	%r12d
	cmpl	$2, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes handled: restore the saved registers and reload the
	   patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm6

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 xmm6

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* Load lane r12d of the saved input, run the scalar expm1 on it
	   and store the value over the same lane of the saved result.
	   The 32-bit move zero-extends the index into r14.  */
	movl	%r12d, %r14d
	movsd	32(%rsp, %r14, 8), %xmm0
	call	expm1@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	movsd	%xmm0, 48(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVbN2v_expm1_sse4)
225
/* Read-only constants for _ZGVbN2v_expm1_sse4.

   Field order and sizes must match the byte offsets #defined at the top
   of this file.  Every field starts on a 16-byte boundary; the kernel
   relies on that wherever it uses a field directly as a memory operand
   of a packed SSE instruction (addpd, cmpnlepd).  */

	.section .rodata, "a"
	.align	16

/* C view of the layout below.  It sits behind a macro that nothing in
   this file defines, so it is documentation only here.  */
#ifdef __svml_dexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 Expm1_HA_table[(1<<8)][2];
	__declspec(align(16)) VUINT32 poly_coeff[4][2][2];
	__declspec(align(16)) VUINT32 Log2e[2][2];
	__declspec(align(16)) VUINT32 L2H[2][2];
	__declspec(align(16)) VUINT32 L2L[2][2];
	__declspec(align(16)) VUINT32 ExpAddConst[2][2];
	__declspec(align(16)) VUINT32 IndexMask[2][2];
	__declspec(align(16)) VUINT32 ExpMask[2][2];
	__declspec(align(16)) VUINT32 MOne[2][2];
	__declspec(align(16)) VUINT32 AbsMask[2][2];
	__declspec(align(16)) VUINT32 Threshold[2][2];
	__declspec(align(16)) VUINT32 L2[2][2];
} __svml_dexpm1_data_internal;
#endif
__svml_dexpm1_data_internal:
	/* Expm1_HA_table: 128 rows of 16 bytes.  The kernel loads one row
	   per lane; the first quad is OR-ed with the exponent bits to form
	   Th and the second quad is used as the low part Tl.  */
	.quad	0x0000000000000000, 0x0000000000000000
	.quad	0x0000163da8000000, 0x3e3fb33356d84a67
	.quad	0x00002c9a40000000, 0xbe3887f9f1190835
	.quad	0x00004315e8000000, 0x3e1b9fe12f5ce3e7
	.quad	0x000059b0d0000000, 0x3e48ac2ba1d73e2a
	.quad	0x0000706b28000000, 0x3e3ddf6ddc6dc404
	.quad	0x0000874518000000, 0x3e1d66f20230d7c9
	.quad	0x00009e3ec8000000, 0x3e46379c1a290f03
	.quad	0x0000b55870000000, 0xbe4833b784eb3a37
	.quad	0x0000cc9228000000, 0x3e4b923fba03db83
	.quad	0x0000e3ec30000000, 0x3e469e8d10103a17
	.quad	0x0000fb66b0000000, 0xbdb2ce50dcdf6e22
	.quad	0x00011301d0000000, 0x3df25b50a4ebbf1b
	.quad	0x00012abdc0000000, 0x3e1b0c72fee4aeb5
	.quad	0x0001429ab0000000, 0xbe356d2204cbefe7
	.quad	0x00015a98c8000000, 0x3e24b1ca24901aae
	.quad	0x000172b840000000, 0xbe4c15742919041c
	.quad	0x00018af938000000, 0x3e2191bd3777ee17
	.quad	0x0001a35be8000000, 0x3e4b7e5ba9e5b4c8
	.quad	0x0001bbe088000000, 0xbe4fdd19632a70c7
	.quad	0x0001d48730000000, 0x3e368b9aa7805b80
	.quad	0x0001ed5020000000, 0x3e47e6c8e5c40d00
	.quad	0x0002063b88000000, 0x3e18a3358ee3bac1
	.quad	0x00021f4990000000, 0x3e37ddc962552fd3
	.quad	0x0002387a70000000, 0xbe38a9dc7993e052
	.quad	0x000251ce50000000, 0xbe135670329f5521
	.quad	0x00026b4568000000, 0xbe40ec1916d42cc6
	.quad	0x000284dfe0000000, 0x3e3f5638096cf15d
	.quad	0x00029e9df8000000, 0xbe470108f69ed175
	.quad	0x0002b87fd0000000, 0x3e2b5b31ffbbd48d
	.quad	0x0002d285a8000000, 0xbe31bfcf4bff6e2b
	.quad	0x0002ecafa8000000, 0x3e33e2f5611ca0f4
	.quad	0x000306fe08000000, 0x3e418db8a96f46ad
	.quad	0x0003217100000000, 0xbe4d993e76563187
	.quad	0x00033c08b0000000, 0x3e4320b7fa64e431
	.quad	0x000356c560000000, 0xbe1b5803cdae772e
	.quad	0x000371a738000000, 0xbe28aac6ab1d7560
	.quad	0x00038cae70000000, 0xbe47d13cd3d2b1a8
	.quad	0x0003a7db38000000, 0xbe48d30048af21b7
	.quad	0x0003c32dc0000000, 0x3e489d47242000f9
	.quad	0x0003dea650000000, 0xbe4f6e5eee525f6f
	.quad	0x0003fa4508000000, 0xbe4a9bff22fa047f
	.quad	0x0004160a20000000, 0x3e3f72e29f84325c
	.quad	0x000431f5d8000000, 0x3e350a896dc70444
	.quad	0x00044e0860000000, 0x3e18624b40c4dbd0
	.quad	0x00046a41f0000000, 0xbe4717fd446d7686
	.quad	0x000486a2b8000000, 0xbe41f6197f61f2e2
	.quad	0x0004a32af0000000, 0x3e2afa7bcce5b17a
	.quad	0x0004bfdad8000000, 0xbe464eaec715e343
	.quad	0x0004dcb298000000, 0x3e3fddd0d63b36ef
	.quad	0x0004f9b278000000, 0xbe362d35952cc275
	.quad	0x000516daa0000000, 0x3e467b320e0897a9
	.quad	0x0005342b58000000, 0xbe362b07e20f57c4
	.quad	0x000551a4c8000000, 0x3e42ec9076297631
	.quad	0x00056f4738000000, 0xbe34ad8259913500
	.quad	0x00058d12d8000000, 0xbe4b41c016d6a1ea
	.quad	0x0005ab07e0000000, 0xbe45bd5eb539b67f
	.quad	0x0005c92688000000, 0x3e42ca35b80e258e
	.quad	0x0005e76f18000000, 0xbe4296f5bc8b20da
	.quad	0x000605e1b8000000, 0x3e376dc08b076f59
	.quad	0x0006247eb0000000, 0x3e0d2ac258f87d03
	.quad	0x0006434638000000, 0xbe4999e701c483c7
	.quad	0x0006623880000000, 0x3e42a91124893ecf
	.quad	0x00068155d8000000, 0xbe4d9ab467bf1d47
	.quad	0x0006a09e68000000, 0xbe380c4336f74d05
	.quad	0x0006c01278000000, 0xbe47a12a08944ab3
	.quad	0x0006dfb240000000, 0xbe4cd72e886ef8ea
	.quad	0x0006ff7df8000000, 0x3e3519483cf87e1b
	.quad	0x00071f75e8000000, 0x3e2d8bee7ba46e1e
	.quad	0x00073f9a48000000, 0x3e24b02e77ab934a
	.quad	0x00075feb58000000, 0xbe3bd98374091656
	.quad	0x0007806950000000, 0xbe00d1604f328fec
	.quad	0x0007a11470000000, 0x3e4f580c36bea881
	.quad	0x0007c1ed00000000, 0x3e330c1327c49334
	.quad	0x0007e2f338000000, 0xbe330b19defa2fd4
	.quad	0x0008042758000000, 0xbe4e0f2f724f90cc
	.quad	0x0008258998000000, 0x3e34cce128acf88b
	.quad	0x0008471a48000000, 0xbe3dc385331ad094
	.quad	0x000868d998000000, 0x3e4a2497640720ed
	.quad	0x00088ac7d8000000, 0x3e38a669966530bd
	.quad	0x0008ace540000000, 0x3e415506dadd3e2b
	.quad	0x0008cf3218000000, 0xbe34abb7410d55e3
	.quad	0x0008f1ae98000000, 0x3e31577362b98274
	.quad	0x0009145b08000000, 0x3e4c8ffe2c4530da
	.quad	0x00093737b0000000, 0x3e29b8bc9e8a0388
	.quad	0x00095a44c8000000, 0x3e4e4290774da41b
	.quad	0x00097d82a0000000, 0xbe00d8d83a30b6f8
	.quad	0x0009a0f170000000, 0x3e2940f737462137
	.quad	0x0009c49180000000, 0x3e451f8480e3e236
	.quad	0x0009e86318000000, 0x3e3e323231824ca8
	.quad	0x000a0c6678000000, 0x3e4aef2b2594d6d4
	.quad	0x000a309bf0000000, 0xbe4dae966539f470
	.quad	0x000a5503b0000000, 0x3e41f12ae45a1225
	.quad	0x000a799e10000000, 0x3e49859ac3796fd9
	.quad	0x000a9e6b58000000, 0xbe44301205e0a6de
	.quad	0x000ac36bc0000000, 0xbe0606431f9234cb
	.quad	0x000ae89f98000000, 0x3e35ad3ad5e8734d
	.quad	0x000b0e0728000000, 0x3e38db66590842ad
	.quad	0x000b33a2b8000000, 0x3e13c57ebdaff43a
	.quad	0x000b597290000000, 0xbe40d536338e3bf7
	.quad	0x000b7f76f0000000, 0x3e47daf237553d84
	.quad	0x000ba5b030000000, 0x3e2420c930819679
	.quad	0x000bcc1e90000000, 0x3e12f074891ee83d
	.quad	0x000bf2c258000000, 0x3e4eb8f0442046b8
	.quad	0x000c199be0000000, 0xbe43d56b1eeef9a7
	.quad	0x000c40ab60000000, 0xbd87c2c975903ef8
	.quad	0x000c67f130000000, 0xbe3a82eb4b5dec80
	.quad	0x000c8f6d98000000, 0xbe4fc8c257729a1e
	.quad	0x000cb720e0000000, 0xbe48837cb757e1a1
	.quad	0x000cdf0b58000000, 0xbe4511e031dd83b5
	.quad	0x000d072d48000000, 0x3e403c4bdc687918
	.quad	0x000d2f8708000000, 0x3deb13e315bc2473
	.quad	0x000d5818e0000000, 0xbe4822dbc6d12fd3
	.quad	0x000d80e318000000, 0xbe3367c68447b063
	.quad	0x000da9e600000000, 0x3e4ed9942b84600d
	.quad	0x000dd321f0000000, 0x3e480da3025b4aef
	.quad	0x000dfc9730000000, 0x3e4bdcdaf5cb4656
	.quad	0x000e264618000000, 0xbe4852f6baf6c4f0
	.quad	0x000e502ee8000000, 0xbe1d30027630bb40
	.quad	0x000e7a51f8000000, 0x3e4e3a641a5aa459
	.quad	0x000ea4afa0000000, 0x3e452486cc2c7b9d
	.quad	0x000ecf4830000000, 0xbe438cc07b927e77
	.quad	0x000efa1bf0000000, 0xbe39ea5d888e02de
	.quad	0x000f252b38000000, 0xbe2288ad162f2d20
	.quad	0x000f507658000000, 0x3e4b722a033a7c26
	.quad	0x000f7bfdb0000000, 0xbe431a0f63b7625a
	.quad	0x000fa7c180000000, 0x3e39e90d82e90a7e
	.quad	0x000fd3c228000000, 0x3e4c7b8f884badd2
	/* poly_coeff[4]: one 16-byte vector per coefficient, the same
	   value in both lanes.  */
	.align	16
	.quad	0x3f81111168877F38, 0x3f81111168877F38 /* coeff5 */
	.quad	0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3 /* coeff4 */
	.quad	0x3fc555555555541D, 0x3fc555555555541D /* coeff3 */
	.quad	0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C /* coeff2 */
	/* Log2e: multiplier applied to x to obtain N (2^7 * log2(e)).  */
	.align	16
	.quad	0x40671547652B82FE, 0x40671547652B82FE
	/* L2H: high part of log(2)/2^7, multiplied by N in the reduction.  */
	.align	16
	.quad	0x3f762e42fef80000, 0x3f762e42fef80000
	/* L2L: low part of log(2)/2^7, multiplied by N in the reduction.  */
	.align	16
	.quad	0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4
	/* ExpAddConst: added to N so that the table index and the
	   exponent can be read from the low bits of the sum.  */
	.align	16
	.quad	0x42f80000001ff800, 0x42f80000001ff800
	/* IndexMask: selects the table byte offset (bits 4..10).  */
	.align	16
	.quad	0x00000000000007f0, 0x00000000000007f0
	/* ExpMask: selects the exponent bits (bits 11..21), which the
	   kernel then shifts left by 41 into the exponent field.  */
	.align	16
	.quad	0x00000000003ff800, 0x00000000003ff800
	/* MOne: -1.0.  */
	.align	16
	.quad	0xbff0000000000000, 0xbff0000000000000
	/* AbsMask: every bit except the sign bit.  */
	.align	16
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff
	/* Threshold: lanes with !(|x| <= Threshold) go to the scalar
	   fallback (the value is roughly 707.7).  */
	.align	16
	.quad	0x40861DA04CBAFE43, 0x40861DA04CBAFE43
	/* L2: log(2)/2^7 in full precision.  The kernel in this file does
	   not reference this field.  */
	.align	16
	.quad	0x3f762e42fefa39ef, 0x3f762e42fefa39ef
	.align	16
	.type	__svml_dexpm1_data_internal, @object
	.size	__svml_dexpm1_data_internal, .-__svml_dexpm1_data_internal
	.align	16

/* 0x4338000000000000 (1.5*2^52) in both lanes: the kernel adds and then
   subtracts it to round x*Log2e to an integral value.  */
.FLT_10:
	.long	0x00000000, 0x43380000, 0x00000000, 0x43380000
	.type	.FLT_10, @object
	.size	.FLT_10, 16
421

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_d_expm12_core_sse4.S  */