1/* Function expm1f vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
23 * exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
24 * expm1(x) = exp(x)-1 is then obtained via multi-precision computation
25 *
26 *
27 */
28
29/* Offsets for data table __svml_sexpm1_data_internal
30 */
31#define Expm1_HA_table 0
32#define poly_coeff 512
33#define Log2e 576
34#define L2H 592
35#define L2L 608
36#define ExpAddConst 624
37#define IndexMask 640
38#define ExpMask 656
39#define MOne 672
40#define AbsMask 688
41#define Threshold 704
42#define L2 720
43
44#include <sysdep.h>
45
46 .section .text.sse4, "ax", @progbits
47ENTRY(_ZGVbN4v_expm1f_sse4)
48 pushq %rbp
49 cfi_def_cfa_offset(16)
50 movq %rsp, %rbp
51 cfi_def_cfa(6, 16)
52 cfi_offset(6, -16)
53 andq $-32, %rsp
54 subq $64, %rsp
55 movaps %xmm0, %xmm4
56 movups Log2e+__svml_sexpm1_data_internal(%rip), %xmm9
57 lea __svml_sexpm1_data_internal(%rip), %r8
58 mulps %xmm0, %xmm9
59 movups .FLT_10(%rip), %xmm5
60 movups ExpAddConst+__svml_sexpm1_data_internal(%rip), %xmm2
61 addps %xmm5, %xmm9
62
63 /* argument reduction */
64 movups L2H+__svml_sexpm1_data_internal(%rip), %xmm6
65 subps %xmm5, %xmm9
66 mulps %xmm9, %xmm6
67 addps %xmm9, %xmm2
68
69 /* table lookup */
70 movdqu IndexMask+__svml_sexpm1_data_internal(%rip), %xmm12
71 subps %xmm6, %xmm4
72 pand %xmm2, %xmm12
73 movups L2L+__svml_sexpm1_data_internal(%rip), %xmm7
74 movups AbsMask+__svml_sexpm1_data_internal(%rip), %xmm3
75 pshufd $1, %xmm12, %xmm10
76 movaps %xmm3, %xmm8
77 mulps %xmm9, %xmm7
78 andps %xmm0, %xmm8
79 cmpnleps Threshold+__svml_sexpm1_data_internal(%rip), %xmm8
80 movd %xmm12, %edx
81 subps %xmm7, %xmm4
82 movd %xmm10, %ecx
83 movmskps %xmm8, %eax
84 pshufd $2, %xmm12, %xmm11
85 movaps %xmm4, %xmm7
86 pshufd $3, %xmm12, %xmm13
87 andnps %xmm0, %xmm3
88 movd %xmm11, %esi
89 movd %xmm13, %edi
90
91 /* polynomial */
92 movups poly_coeff+__svml_sexpm1_data_internal(%rip), %xmm8
93 movdqu ExpMask+__svml_sexpm1_data_internal(%rip), %xmm6
94 movslq %edx, %rdx
95 pand %xmm6, %xmm2
96 movslq %ecx, %rcx
97 pslld $14, %xmm2
98 movslq %esi, %rsi
99 movslq %edi, %rdi
100 movq (%r8, %rdx), %xmm1
101 movq (%r8, %rcx), %xmm14
102 movq (%r8, %rsi), %xmm5
103 movq (%r8, %rdi), %xmm15
104 unpcklps %xmm14, %xmm1
105 mulps %xmm4, %xmm8
106 movaps %xmm1, %xmm10
107 mulps %xmm4, %xmm7
108 addps poly_coeff+16+__svml_sexpm1_data_internal(%rip), %xmm8
109 unpcklps %xmm15, %xmm5
110 movlhps %xmm5, %xmm10
111 shufps $238, %xmm5, %xmm1
112 orps %xmm2, %xmm10
113
114 /* T-1 */
115 movups MOne+__svml_sexpm1_data_internal(%rip), %xmm9
116 mulps %xmm2, %xmm1
117 addps %xmm9, %xmm10
118 mulps %xmm7, %xmm8
119 addps %xmm1, %xmm10
120 addps %xmm8, %xmm4
121 movaps %xmm10, %xmm1
122 subps %xmm9, %xmm1
123 mulps %xmm1, %xmm4
124 addps %xmm4, %xmm10
125 orps %xmm3, %xmm10
126 testl %eax, %eax
127
128 /* Go to special inputs processing branch */
129 jne L(SPECIAL_VALUES_BRANCH)
130 # LOE rbx r12 r13 r14 r15 eax xmm0 xmm10
131
132 /* Restore registers
133 * and exit the function
134 */
135
136L(EXIT):
137 movaps %xmm10, %xmm0
138 movq %rbp, %rsp
139 popq %rbp
140 cfi_def_cfa(7, 8)
141 cfi_restore(6)
142 ret
143 cfi_def_cfa(6, 16)
144 cfi_offset(6, -16)
145
146 /* Branch to process
147 * special inputs
148 */
149
150L(SPECIAL_VALUES_BRANCH):
151 movups %xmm0, 32(%rsp)
152 movups %xmm10, 48(%rsp)
153 # LOE rbx r12 r13 r14 r15 eax
154
155 xorl %edx, %edx
156 movq %r12, 16(%rsp)
157 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
158 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
159 movl %edx, %r12d
160 movq %r13, 8(%rsp)
161 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
162 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
163 movl %eax, %r13d
164 movq %r14, (%rsp)
165 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
166 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
167 # LOE rbx r15 r12d r13d
168
169 /* Range mask
170 * bits check
171 */
172
173L(RANGEMASK_CHECK):
174 btl %r12d, %r13d
175
176 /* Call scalar math function */
177 jc L(SCALAR_MATH_CALL)
178 # LOE rbx r15 r12d r13d
179
180 /* Special inputs
181 * processing loop
182 */
183
184L(SPECIAL_VALUES_LOOP):
185 incl %r12d
186 cmpl $4, %r12d
187
188 /* Check bits in range mask */
189 jl L(RANGEMASK_CHECK)
190 # LOE rbx r15 r12d r13d
191
192 movq 16(%rsp), %r12
193 cfi_restore(12)
194 movq 8(%rsp), %r13
195 cfi_restore(13)
196 movq (%rsp), %r14
197 cfi_restore(14)
198 movups 48(%rsp), %xmm10
199
200 /* Go to exit */
201 jmp L(EXIT)
202 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -48; DW_OP_plus) */
203 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd0, 0xff, 0xff, 0xff, 0x22
204 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -56; DW_OP_plus) */
205 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22
206 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -64; DW_OP_plus) */
207 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22
208 # LOE rbx r12 r13 r14 r15 xmm10
209
210 /* Scalar math function call
211 * to process special input
212 */
213
214L(SCALAR_MATH_CALL):
215 movl %r12d, %r14d
216 movss 32(%rsp, %r14, 4), %xmm0
217 call expm1f@PLT
218 # LOE rbx r14 r15 r12d r13d xmm0
219
220 movss %xmm0, 48(%rsp, %r14, 4)
221
222 /* Process special inputs in loop */
223 jmp L(SPECIAL_VALUES_LOOP)
224 # LOE rbx r15 r12d r13d
225END(_ZGVbN4v_expm1f_sse4)
226
227 .section .rodata, "a"
228 .align 16
229
230#ifdef __svml_sexpm1_data_internal_typedef
231typedef unsigned int VUINT32;
232typedef struct {
233 __declspec(align(16)) VUINT32 Expm1_HA_table[(1<<7)][1];
234 __declspec(align(16)) VUINT32 poly_coeff[4][4][1];
235 __declspec(align(16)) VUINT32 Log2e[4][1];
236 __declspec(align(16)) VUINT32 L2H[4][1];
237 __declspec(align(16)) VUINT32 L2L[4][1];
238 __declspec(align(16)) VUINT32 ExpAddConst[4][1];
239 __declspec(align(16)) VUINT32 IndexMask[4][1];
240 __declspec(align(16)) VUINT32 ExpMask[4][1];
241 __declspec(align(16)) VUINT32 MOne[4][1];
242 __declspec(align(16)) VUINT32 AbsMask[4][1];
243 __declspec(align(16)) VUINT32 Threshold[4][1];
244 __declspec(align(16)) VUINT32 L2[4][1];
245} __svml_sexpm1_data_internal;
246#endif
247__svml_sexpm1_data_internal:
248 /* Expm1_HA_table */
249 .long 0x00000000, 0x00000000
250 .long 0x00016000, 0x391a3e78
251 .long 0x0002d000, 0xb89e59d5
252 .long 0x00044000, 0xb93ae78a
253 .long 0x0005b000, 0xb9279306
254 .long 0x00072000, 0xb79e6961
255 .long 0x0008a000, 0xb97e2fee
256 .long 0x000a1000, 0x391aaea9
257 .long 0x000b9000, 0x39383c7d
258 .long 0x000d2000, 0xb9241490
259 .long 0x000ea000, 0x39073169
260 .long 0x00103000, 0x386e218a
261 .long 0x0011c000, 0x38f4dceb
262 .long 0x00136000, 0xb93a9a1e
263 .long 0x0014f000, 0x391df520
264 .long 0x00169000, 0x3905a6e4
265 .long 0x00183000, 0x397e0a32
266 .long 0x0019e000, 0x370b2641
267 .long 0x001b9000, 0xb8b1918b
268 .long 0x001d4000, 0xb8132c6a
269 .long 0x001ef000, 0x39264c12
270 .long 0x0020b000, 0x37221f73
271 .long 0x00227000, 0x37060619
272 .long 0x00243000, 0x3922b5c1
273 .long 0x00260000, 0xb814ab27
274 .long 0x0027d000, 0xb89b12c6
275 .long 0x0029a000, 0x382d5a75
276 .long 0x002b8000, 0xb938c94b
277 .long 0x002d6000, 0xb97822b8
278 .long 0x002f4000, 0xb910ea53
279 .long 0x00312000, 0x38fd6075
280 .long 0x00331000, 0x38620955
281 .long 0x00350000, 0x391e667f
282 .long 0x00370000, 0xb89b8736
283 .long 0x00390000, 0xb90a1714
284 .long 0x003b0000, 0xb7a54ded
285 .long 0x003d1000, 0xb96b8c15
286 .long 0x003f1000, 0x397336cf
287 .long 0x00413000, 0xb8eccd66
288 .long 0x00434000, 0x39599b45
289 .long 0x00456000, 0x3965422b
290 .long 0x00479000, 0xb8a2cdd5
291 .long 0x0049c000, 0xb9484f32
292 .long 0x004bf000, 0xb8fac043
293 .long 0x004e2000, 0x391182a4
294 .long 0x00506000, 0x38ccf6bc
295 .long 0x0052b000, 0xb97c4dc2
296 .long 0x0054f000, 0x38d6aaf4
297 .long 0x00574000, 0x391f995b
298 .long 0x0059a000, 0xb8ba8f62
299 .long 0x005c0000, 0xb9090d05
300 .long 0x005e6000, 0x37f4825e
301 .long 0x0060d000, 0xb8c844f5
302 .long 0x00634000, 0xb76d1a83
303 .long 0x0065c000, 0xb95f2310
304 .long 0x00684000, 0xb952b5f8
305 .long 0x006ac000, 0x37c6e7dd
306 .long 0x006d5000, 0xb7cfe126
307 .long 0x006fe000, 0x3917337c
308 .long 0x00728000, 0x383b9e2d
309 .long 0x00752000, 0x392fa2a5
310 .long 0x0077d000, 0x37df730b
311 .long 0x007a8000, 0x38ecb6dd
312 .long 0x007d4000, 0xb879f986
313 /* poly_coeff[4] */
314 .align 16
315 .long 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF /* coeff3 */
316 .long 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F /* coeff2 */
317 /* 32 Byte Padding */
318 .zero 32
319 /* Log2e */
320 .align 16
321 .long 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B
322 /* L2H */
323 .align 16
324 .long 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000
325 /* L2L */
326 .align 16
327 .long 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083
328 /* ExpAddConst */
329 .align 16
330 .long 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00
331 /* IndexMask */
332 .align 16
333 .long 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8
334 /* ExpMask */
335 .align 16
336 .long 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00
337 /* MOne */
338 .align 16
339 .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
340 /* AbsMask */
341 .align 16
342 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
343 /* Threshold */
344 .align 16
345 .long 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B // 86.643394
346 /* L2 */
347 .align 16
348 .long 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218
349 .align 16
350 .type __svml_sexpm1_data_internal, @object
351 .size __svml_sexpm1_data_internal, .-__svml_sexpm1_data_internal
352 .align 16
353
354.FLT_10:
355 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000
356 .type .FLT_10, @object
357 .size .FLT_10, 16
358

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f4_core_sse4.S