1/* Function expm1f vectorized with AVX2.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
23 * exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
24 * expm1(x) = exp(x)-1 is then obtained via multi-precision computation
25 *
26 *
27 */
28
/* Offsets for data table __svml_sexpm1_data_internal.
   These are byte offsets from the start of the table and must stay in
   sync with the data layout emitted at the bottom of this file
   (64 8-byte table entries = 512 bytes, then 32-byte-wide vector
   constants).  */
#define Expm1_HA_table 0	/* 64 (Th, Tl) pairs: 2^(j/64) hi/lo parts.  */
#define poly_coeff 512		/* Polynomial coefficients c3, c2.  */
#define Log2e 640		/* 2^6/log(2) = 64*log2(e).  */
#define L2H 672			/* log(2)/64, high part.  */
#define L2L 704			/* log(2)/64, low (correction) part.  */
#define ExpAddConst 736		/* Shifter constant for argument reduction.  */
#define IndexMask 768		/* Mask extracting the table byte offset.  */
#define ExpMask 800		/* Mask extracting the scale (exponent) bits.  */
#define MOne 832		/* -1.0f broadcast.  */
#define AbsMask 864		/* 0x7fffffff: clears the sign bit.  */
#define Threshold 896		/* |x| above this goes to the scalar path.  */
#define L2 928			/* log(2)/64 single constant (unused here).  */
43
44#include <sysdep.h>
45
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_expm1f_avx2)
	/* SysV AMD64; in: ymm0 = 8 packed floats x; out: ymm0 = expm1f(x).
	   Set up an rbp frame and align the stack to 32 bytes: the
	   special-value path spills two ymm vectors (input at 32(%rsp),
	   result at 64(%rsp)) plus r12-r14 at 0..16(%rsp).  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$96, %rsp
	lea	__svml_sexpm1_data_internal(%rip), %rax	/* rax = table base.  */
	vmovaps	%ymm0, %ymm3	/* ymm3 = x, kept live for the scalar fallback.  */
	vmulps	Log2e+__svml_sexpm1_data_internal(%rip), %ymm3, %ymm4	/* x*2^6/log(2).  */

	/* argument reduction:
	   N = round(x*2^6/log(2)) (ymm8), R = x - N*L2H - N*L2L (ymm2),
	   where L2H+L2L = log(2)/2^6 in two parts.  */
	vmovups	L2H+__svml_sexpm1_data_internal(%rip), %ymm2
	vmovups	AbsMask+__svml_sexpm1_data_internal(%rip), %ymm5
	vroundps $0, %ymm4, %ymm8	/* N, rounded to nearest.  */
	vaddps	ExpAddConst+__svml_sexpm1_data_internal(%rip), %ymm8, %ymm0	/* biased int form of N.  */
	vfnmadd213ps %ymm3, %ymm8, %ymm2	/* ymm2 = x - N*L2H.  */

	/* table lookup: low bits of the biased N select one of 64 8-byte
	   (Th, Tl) entries; the remaining bits (ExpMask, shifted into the
	   exponent field below) provide the 2^(N/64) scaling.  */
	vandps	IndexMask+__svml_sexpm1_data_internal(%rip), %ymm0, %ymm9	/* byte offsets.  */
	vandps	%ymm5, %ymm3, %ymm6	/* ymm6 = |x|.  */
	vcmpnle_uqps Threshold+__svml_sexpm1_data_internal(%rip), %ymm6, %ymm7	/* lanes needing the scalar path (incl. NaN).  */
	vfnmadd231ps L2L+__svml_sexpm1_data_internal(%rip), %ymm8, %ymm2	/* ymm2 = R (fully reduced).  */
	vandps	ExpMask+__svml_sexpm1_data_internal(%rip), %ymm0, %ymm0
	vandnps	%ymm3, %ymm5, %ymm1	/* ymm1 = sign bit of x (restored at the end).  */
	vpslld	$14, %ymm0, %ymm0	/* scale bits moved into the exponent field.  */
	vmovmskps %ymm7, %edx	/* edx = special-lane mask.  */
	/* Emulated 8-lane gather: extract each dword index and load the
	   corresponding 8-byte table entry with scalar vmovq.  */
	vmovd	%xmm9, %ecx
	vextractf128 $1, %ymm9, %xmm10
	movslq	%ecx, %rcx
	vmovd	%xmm10, %r9d
	vpextrd	$1, %xmm9, %esi
	vpextrd	$2, %xmm9, %edi
	vpextrd	$3, %xmm9, %r8d
	vmovq	(%rax, %rcx), %xmm11
	vpextrd	$1, %xmm10, %r10d
	vpextrd	$2, %xmm10, %r11d
	vpextrd	$3, %xmm10, %ecx
	movslq	%esi, %rsi
	movslq	%edi, %rdi
	movslq	%r8d, %r8
	movslq	%r9d, %r9
	movslq	%r10d, %r10
	movslq	%r11d, %r11
	movslq	%ecx, %rcx
	vmovq	(%rax, %rsi), %xmm13
	vmovq	(%rax, %rdi), %xmm12
	vmovq	(%rax, %r8), %xmm14
	vmovq	(%rax, %r9), %xmm15
	vmovq	(%rax, %r10), %xmm5
	vmovq	(%rax, %r11), %xmm4
	vmovq	(%rax, %rcx), %xmm6
	/* Transpose the eight (Th, Tl) pairs: after the unpacks/inserts,
	   vunpcklps below yields all Th in ymm12 and vunpckhps (later)
	   yields all Tl in ymm4.  */
	vunpcklps %xmm12, %xmm11, %xmm7
	vunpcklps %xmm14, %xmm13, %xmm8
	vunpcklps %xmm4, %xmm15, %xmm15
	vunpcklps %xmm6, %xmm5, %xmm9
	vmulps	%ymm2, %ymm2, %ymm13	/* ymm13 = R^2.  */
	vinsertf128 $1, %xmm15, %ymm7, %ymm10
	vinsertf128 $1, %xmm9, %ymm8, %ymm11
	vunpcklps %ymm11, %ymm10, %ymm12	/* ymm12 = Th lanes.  */
	vorps	%ymm0, %ymm12, %ymm14	/* ymm14 = Th * 2^(N/64) (exponent bits or'ed in).  */

	/* polynomial: p = (c3*R + c2)*R^2 + R ~= exp(R) - 1.  */
	vmovups	poly_coeff+__svml_sexpm1_data_internal(%rip), %ymm12
	vfmadd213ps poly_coeff+32+__svml_sexpm1_data_internal(%rip), %ymm2, %ymm12
	vfmadd213ps %ymm2, %ymm13, %ymm12

	/* T-1: combine the scaled table value T = Th*2^s + Tl*2^s with the
	   polynomial in multi-precision:
	     ymm2 = Th*2^s - 1
	     ymm4 = Tl*2^s + (Th*2^s - 1) = T - 1
	     ymm0 = T
	     ymm0 = T*p + (T - 1) = expm1(x).  */
	vmovups	MOne+__svml_sexpm1_data_internal(%rip), %ymm13
	vaddps	%ymm13, %ymm14, %ymm2
	vunpckhps %ymm11, %ymm10, %ymm4	/* ymm4 = Tl lanes.  */
	vfmadd213ps %ymm2, %ymm0, %ymm4
	vsubps	%ymm13, %ymm4, %ymm0
	vfmadd213ps %ymm4, %ymm12, %ymm0
	vorps	%ymm1, %ymm0, %ymm0	/* reapply the sign of x (keeps -0.0f).  */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the frame CFI for the code below, which still runs
	   inside the rbp frame.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill input and fast-path result; special lanes are recomputed
	   in place in the 64(%rsp) result slot by scalar expm1f calls.  */
	vmovups	%ymm3, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0

	xorl	%eax, %eax	/* eax = lane counter, starts at 0.  */
	# LOE rbx r12 r13 r14 r15 eax edx

	/* About to call into C (expm1f): clear upper ymm state per ABI and
	   preserve callee-saved r12-r14, which carry loop state across the
	   calls.  The cfi_escape expressions describe the spill slots
	   relative to the 32-byte-aligned rsp.  */
	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d	/* r12d = current lane index.  */
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d	/* r13d = special-lane bit mask.  */
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d	/* CF = does lane r12d need the scalar call?  */

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d	/* 8 lanes total.  */

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes done: restore callee-saved registers, reload the
	   patched result vector and leave via the common exit.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
	/* Re-establish the spill-slot CFI for the code below.  */
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0	/* xmm0 = spilled x[lane].  */
	call	expm1f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 64(%rsp, %r14, 4)	/* overwrite result[lane].  */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_expm1f_avx2)
225
	.section .rodata, "a"
	.align	32

#ifdef __svml_sexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 Expm1_HA_table[(1<<7)][1];
	__declspec(align(32)) VUINT32 poly_coeff[4][8][1];
	__declspec(align(32)) VUINT32 Log2e[8][1];
	__declspec(align(32)) VUINT32 L2H[8][1];
	__declspec(align(32)) VUINT32 L2L[8][1];
	__declspec(align(32)) VUINT32 ExpAddConst[8][1];
	__declspec(align(32)) VUINT32 IndexMask[8][1];
	__declspec(align(32)) VUINT32 ExpMask[8][1];
	__declspec(align(32)) VUINT32 MOne[8][1];
	__declspec(align(32)) VUINT32 AbsMask[8][1];
	__declspec(align(32)) VUINT32 Threshold[8][1];
	__declspec(align(32)) VUINT32 L2[8][1];
} __svml_sexpm1_data_internal;
#endif
__svml_sexpm1_data_internal:
	/* Expm1_HA_table: 64 8-byte entries, one per table index j.
	   Each row appears to be a (high, low) two-part representation of
	   2^(j/64) used by the gather in the core loop — the first dword
	   holds high mantissa bits (or'ed with the computed exponent),
	   the second a small float correction term.  */
	.long	0x00000000, 0x00000000
	.long	0x00016000, 0x391a3e78
	.long	0x0002d000, 0xb89e59d5
	.long	0x00044000, 0xb93ae78a
	.long	0x0005b000, 0xb9279306
	.long	0x00072000, 0xb79e6961
	.long	0x0008a000, 0xb97e2fee
	.long	0x000a1000, 0x391aaea9
	.long	0x000b9000, 0x39383c7d
	.long	0x000d2000, 0xb9241490
	.long	0x000ea000, 0x39073169
	.long	0x00103000, 0x386e218a
	.long	0x0011c000, 0x38f4dceb
	.long	0x00136000, 0xb93a9a1e
	.long	0x0014f000, 0x391df520
	.long	0x00169000, 0x3905a6e4
	.long	0x00183000, 0x397e0a32
	.long	0x0019e000, 0x370b2641
	.long	0x001b9000, 0xb8b1918b
	.long	0x001d4000, 0xb8132c6a
	.long	0x001ef000, 0x39264c12
	.long	0x0020b000, 0x37221f73
	.long	0x00227000, 0x37060619
	.long	0x00243000, 0x3922b5c1
	.long	0x00260000, 0xb814ab27
	.long	0x0027d000, 0xb89b12c6
	.long	0x0029a000, 0x382d5a75
	.long	0x002b8000, 0xb938c94b
	.long	0x002d6000, 0xb97822b8
	.long	0x002f4000, 0xb910ea53
	.long	0x00312000, 0x38fd6075
	.long	0x00331000, 0x38620955
	.long	0x00350000, 0x391e667f
	.long	0x00370000, 0xb89b8736
	.long	0x00390000, 0xb90a1714
	.long	0x003b0000, 0xb7a54ded
	.long	0x003d1000, 0xb96b8c15
	.long	0x003f1000, 0x397336cf
	.long	0x00413000, 0xb8eccd66
	.long	0x00434000, 0x39599b45
	.long	0x00456000, 0x3965422b
	.long	0x00479000, 0xb8a2cdd5
	.long	0x0049c000, 0xb9484f32
	.long	0x004bf000, 0xb8fac043
	.long	0x004e2000, 0x391182a4
	.long	0x00506000, 0x38ccf6bc
	.long	0x0052b000, 0xb97c4dc2
	.long	0x0054f000, 0x38d6aaf4
	.long	0x00574000, 0x391f995b
	.long	0x0059a000, 0xb8ba8f62
	.long	0x005c0000, 0xb9090d05
	.long	0x005e6000, 0x37f4825e
	.long	0x0060d000, 0xb8c844f5
	.long	0x00634000, 0xb76d1a83
	.long	0x0065c000, 0xb95f2310
	.long	0x00684000, 0xb952b5f8
	.long	0x006ac000, 0x37c6e7dd
	.long	0x006d5000, 0xb7cfe126
	.long	0x006fe000, 0x3917337c
	.long	0x00728000, 0x383b9e2d
	.long	0x00752000, 0x392fa2a5
	.long	0x0077d000, 0x37df730b
	.long	0x007a8000, 0x38ecb6dd
	.long	0x007d4000, 0xb879f986
	/* poly_coeff[4]: minimax coefficients for exp(R)-1 ~= c3*R^3 + c2*R^2 + R.  */
	.align	32
	.long	0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF, 0x3e2AAABF /* coeff3 ~= 1/6 */
	.long	0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F, 0x3f00000F /* coeff2 ~= 1/2 */
	/* 64 Byte Padding */
	.zero	64
	/* Log2e: 2^6/log(2) ~= 92.332482 (k = 6, 64-entry table).  */
	.align	32
	.long	0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B, 0x42B8AA3B
	/* L2H: log(2)/2^6, high part.  */
	.align	32
	.long	0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000, 0x3c318000
	/* L2L: log(2)/2^6, low part (L2H + L2L ~= log(2)/64).  */
	.align	32
	.long	0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083, 0xb65e8083
	/* ExpAddConst: shifter added to N before the index/scale bit extraction.  */
	.align	32
	.long	0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00, 0x49f0fe00
	/* IndexMask: 0x1f8 keeps bits 3..8, an 8-byte-scaled index into the 64-entry table.  */
	.align	32
	.long	0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8, 0x000001f8
	/* ExpMask: bits 9..16; shifted left by 14 they land in the float exponent field.  */
	.align	32
	.long	0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00, 0x0001fe00
	/* MOne: -1.0f.  */
	.align	32
	.long	0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
	/* AbsMask: clears the IEEE-754 sign bit.  */
	.align	32
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* Threshold: |x| beyond this is handed to scalar expm1f.  */
	.align	32
	.long	0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B, 0x42AD496B // 86.643394
	/* L2: log(2)/2^6 as a single float (not referenced by this kernel).  */
	.align	32
	.long	0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218, 0x3cb17218
	.align	32
	.type	__svml_sexpm1_data_internal, @object
	.size	__svml_sexpm1_data_internal, .-__svml_sexpm1_data_internal
351

/* source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S */