1/* Function exp2f vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * exp2(x) = 2^n * T[j] * (1 + P(y))
23 * where
24 * x = m*(1/K) + y, y in [-1/K..1/K]
25 * m = n*K + j, where m, n, j are signed integers, j in [-K/2..K/2]
26 *
27 * values of 2^(j/K) are tabulated as T[j]
28 *
29 * P(y) is a minimax polynomial approximation of exp2(y)-1
30 * on the small interval [-1/K..1/K]
31 *
32 * Special cases:
33 *
34 * exp2(NaN) = NaN
35 * exp2(+INF) = +INF
36 * exp2(-INF) = 0
37 * exp2(x) = 1 for subnormals
38 * For IEEE float
39 * if x >= 128.0 then exp2f(x) overflows
40 * if x < -151.0 then exp2f(x) underflows
41 *
42 */
43
44/* Offsets for data table __svml_sexp2_data_internal
 * Each macro is a byte offset that the code adds to the table label to
 * address one entry; consecutive entries are 16 bytes apart, matching the
 * 16-byte packed loads and memory operands used by the kernel.
45 */
46#define _sShifter 0
47#define _sPC0 16
48#define _sPC1 32
49#define _sPC2 48
50#define _sPC3 64
51#define _sPC4 80
52#define _sPC5 96
53#define _sPC6 112
54#define _iAbsMask 128
55#define _iDomainRange 144
56
57#include <sysdep.h>
58
59 .section .text.sse4, "ax", @progbits
/*
 * _ZGVbN4v_exp2f_sse4: exp2 of four packed single-precision values.
 *
 * In:      %xmm0 = x, four packed floats
 * Out:     %xmm0 = four packed results
 * Scratch: %xmm1-%xmm5, %edx and flags on the fast path.  The slow path
 *          also uses %eax and calls exp2f, so anything exp2f is allowed
 *          to clobber may change there.  %r12-%r14 are used on the slow
 *          path but are saved and restored.
 *
 * Stack frame (72 bytes below the return address):
 *    0(%rsp)  saved %r14        (slow path only)
 *    8(%rsp)  saved %r13        (slow path only)
 *   16(%rsp)  saved %r12        (slow path only)
 *   32(%rsp)  copy of the input vector  (slow path only)
 *   48(%rsp)  result vector             (slow path only)
 *
 * Computation carried out by the instructions below, per lane:
 *   s = x + _sShifter
 *   n = s - _sShifter                    (x rounded to an integer value)
 *   r = x - n
 *   p = PC0 + r*(PC1 + r*(PC2 + r*(PC3 + r*(PC4 + r*(PC5 + r*PC6)))))
 *   result bits = bits(p) + (bits(s) << 23)
 * No table lookup T[j] is performed in this routine; the only constants
 * read are _sShifter, _sPC0.._sPC6, _iAbsMask and _iDomainRange.
 *
 * Lanes where (bits(x) & _iAbsMask) > _iDomainRange, compared as signed
 * 32-bit integers, are recomputed one at a time with scalar exp2f.
 */
60ENTRY(_ZGVbN4v_exp2f_sse4)
61 subq $72, %rsp /* reserve the 72-byte frame described above */
62 cfi_def_cfa_offset(80)
63
64 /* Load _sShifter (first table entry); it is both added to x and subtracted again below */
65 movups __svml_sexp2_data_internal(%rip), %xmm1
66
67 /* Implementation: xmm5 will hold s = x + _sShifter */
68 movaps %xmm1, %xmm5
69
70 /* Polynomial: Horner evaluation starts from the top coefficient _sPC6 */
71 movups _sPC6+__svml_sexp2_data_internal(%rip), %xmm4
72 addps %xmm0, %xmm5 /* xmm5 = s = x + _sShifter */
73 movaps %xmm5, %xmm3 /* xmm3 = copy of s, used to recover n */
74
75 /* 2^N: shift the bits of s left by 23 so its low bits line up with the float exponent field */
76 pslld $23, %xmm5
77
78 /* Check for overflow/underflow: xmm2 = _iAbsMask, combined with x below */
79 movdqu _iAbsMask+__svml_sexp2_data_internal(%rip), %xmm2
80 subps %xmm1, %xmm3 /* xmm3 = n = s - _sShifter */
81
82 /* R: reduced argument r = x - n, built in xmm1 */
83 movaps %xmm0, %xmm1 /* xmm1 = x */
84 pand %xmm0, %xmm2 /* xmm2 = bits(x) & _iAbsMask */
85 pcmpgtd _iDomainRange+__svml_sexp2_data_internal(%rip), %xmm2 /* lane = all ones where xmm2 > _iDomainRange (signed dword compare) */
86 subps %xmm3, %xmm1 /* xmm1 = r = x - n */
87 movmskps %xmm2, %edx /* edx bit i = top bit of lane i of the compare result */
88 mulps %xmm1, %xmm4 /* xmm4 = PC6*r */
89 addps _sPC5+__svml_sexp2_data_internal(%rip), %xmm4 /* + PC5 */
90 mulps %xmm1, %xmm4 /* * r */
91 addps _sPC4+__svml_sexp2_data_internal(%rip), %xmm4 /* + PC4 */
92 mulps %xmm1, %xmm4 /* * r */
93 addps _sPC3+__svml_sexp2_data_internal(%rip), %xmm4 /* + PC3 */
94 mulps %xmm1, %xmm4 /* * r */
95 addps _sPC2+__svml_sexp2_data_internal(%rip), %xmm4 /* + PC2 */
96 mulps %xmm1, %xmm4 /* * r */
97 addps _sPC1+__svml_sexp2_data_internal(%rip), %xmm4 /* + PC1 */
98 mulps %xmm4, %xmm1 /* xmm1 = r * (PC1 + r*(...)) */
99 addps _sPC0+__svml_sexp2_data_internal(%rip), %xmm1 /* xmm1 = p = PC0 + r*(...) */
100
101 /* Reconstruction: integer-add the shifted bits of s onto the bits of p */
102 paddd %xmm5, %xmm1
103 testl %edx, %edx /* is any lane flagged by the range check? */
104
105 /* Go to special inputs processing branch */
106 jne L(SPECIAL_VALUES_BRANCH)
107 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
108
109 /* Restore registers
110 * and exit the function
111 */
112
113L(EXIT):
114 movaps %xmm1, %xmm0 /* return value = result vector held in xmm1 */
115 addq $72, %rsp /* release the frame */
116 cfi_def_cfa_offset(8)
117 ret
118 cfi_def_cfa_offset(80) /* CFI only: the code that follows runs with the 72-byte frame still allocated */
119
120 /* Branch to process
121 * special inputs
122 */
123
124L(SPECIAL_VALUES_BRANCH):
125 movups %xmm0, 32(%rsp) /* keep the input lanes for the scalar calls */
126 movups %xmm1, 48(%rsp) /* results computed so far; flagged lanes are overwritten below */
127 # LOE rbx rbp r12 r13 r14 r15 edx
128
129 xorl %eax, %eax
130 movq %r12, 16(%rsp) /* save r12 before using it as the lane counter */
131 cfi_offset(12, -64)
132 movl %eax, %r12d /* r12d = lane index, starting at 0 */
133 movq %r13, 8(%rsp) /* save r13 before using it for the lane mask */
134 cfi_offset(13, -72)
135 movl %edx, %r13d /* r13d = mask of flagged lanes from movmskps */
136 movq %r14, (%rsp) /* save r14; it is used as the index register around the call */
137 cfi_offset(14, -80)
138 # LOE rbx rbp r15 r12d r13d
139
140 /* Range mask
141 * bits check
142 */
143
144L(RANGEMASK_CHECK):
145 btl %r12d, %r13d /* CF = mask bit for the current lane */
146
147 /* Call scalar math function */
148 jc L(SCALAR_MATH_CALL)
149 # LOE rbx rbp r15 r12d r13d
150
151 /* Special inputs
152 * processing loop
153 */
154
155L(SPECIAL_VALUES_LOOP):
156 incl %r12d /* advance to the next lane */
157 cmpl $4, %r12d /* lanes 0..3 are examined */
158
159 /* Check bits in range mask */
160 jl L(RANGEMASK_CHECK)
161 # LOE rbx rbp r15 r12d r13d
162
163 movq 16(%rsp), %r12 /* all lanes done: restore the saved registers */
164 cfi_restore(12)
165 movq 8(%rsp), %r13
166 cfi_restore(13)
167 movq (%rsp), %r14
168 cfi_restore(14)
169 movups 48(%rsp), %xmm1 /* reload the patched result vector; L(EXIT) copies it to xmm0 */
170
171 /* Go to exit */
172 jmp L(EXIT)
173 cfi_offset(12, -64) /* CFI only: the code below is entered with r12-r14 still saved in the frame */
174 cfi_offset(13, -72)
175 cfi_offset(14, -80)
176 # LOE rbx rbp r12 r13 r14 r15 xmm1
177
178 /* Scalar math function call
179 * to process special input
180 */
181
182L(SCALAR_MATH_CALL):
183 movl %r12d, %r14d /* r14 = lane index (32-bit move clears the upper half) for addressing */
184 movss 32(%rsp, %r14, 4), %xmm0 /* xmm0 = input value of this lane from the saved copy */
185 call exp2f@PLT
186 # LOE rbx rbp r14 r15 r12d r13d xmm0
187
188 movss %xmm0, 48(%rsp, %r14, 4) /* store the scalar result into this lane of the result vector */
189
190 /* Process special inputs in loop */
191 jmp L(SPECIAL_VALUES_LOOP)
192 # LOE rbx rbp r15 r12d r13d
193END(_ZGVbN4v_exp2f_sse4)
194
195 .section .rodata, "a"
196 .align 16
197
/*
 * Constant table read by _ZGVbN4v_exp2f_sse4.
 *
 * Every entry is four identical 32-bit words (one per vector lane), i.e.
 * 16 bytes, and entries are laid out back to back in the order
 * _sShifter, _sPC0 .. _sPC6, _iAbsMask, _iDomainRange.  The order and the
 * 16-byte size must stay in step with the offset macros of the same
 * names defined earlier in this file.
 *
 * The typedef below is a C-style description of that layout.  It is only
 * passed through when __svml_sexp2_data_internal_typedef is defined,
 * which the visible part of this file never does.
 */
198#ifdef __svml_sexp2_data_internal_typedef
199typedef unsigned int VUINT32;
200typedef struct {
201 __declspec(align(16)) VUINT32 _sShifter[4][1];
202 __declspec(align(16)) VUINT32 _sPC0[4][1];
203 __declspec(align(16)) VUINT32 _sPC1[4][1];
204 __declspec(align(16)) VUINT32 _sPC2[4][1];
205 __declspec(align(16)) VUINT32 _sPC3[4][1];
206 __declspec(align(16)) VUINT32 _sPC4[4][1];
207 __declspec(align(16)) VUINT32 _sPC5[4][1];
208 __declspec(align(16)) VUINT32 _sPC6[4][1];
209 __declspec(align(16)) VUINT32 _iAbsMask[4][1];
210 __declspec(align(16)) VUINT32 _iDomainRange[4][1];
211} __svml_sexp2_data_internal;
212#endif
213__svml_sexp2_data_internal:
214 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter = 1.5 * 2^23 as an IEEE single */
215 .align 16
216 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC0 = 1.0, constant term of the polynomial */
217 .align 16
218 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* _sPC1, approximately 0.693147 (ln 2) */
219 .align 16
220 .long 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef /* _sPC2 */
221 .align 16
222 .long 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf /* _sPC3 */
223 .align 16
224 .long 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c /* _sPC4 */
225 .align 16
226 .long 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51 /* _sPC5 */
227 .align 16
228 .long 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c /* _sPC6, highest-degree coefficient */
229 //common
230 .align 16
231 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask: every bit except the sign bit */
232 .align 16
233 .long 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* _iDomainRange=126.0 */
234 .align 16
235 .type __svml_sexp2_data_internal, @object
236 .size __svml_sexp2_data_internal, .-__svml_sexp2_data_internal
237

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f4_core_sse4.S