1/* Function sinhf vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute sinh(x) as (exp(x)-exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * sinh(NaN) = quiet NaN, and raise invalid exception
29 * sinh(INF) = that INF
30 * sinh(x) = x for subnormals
31 * sinh(x) overflows for large |x| (beyond roughly MAXLOG+log(2)); lanes with |x| above _iDomainRange are recomputed by scalar sinhf
32 *
33 */
34
35/* Offsets for data table __svml_ssinh_data_internal
36 */
/*
 * Each macro is the byte offset of one row of __svml_ssinh_data_internal.
 * Every row in the table is four .long values followed by .align 16, so
 * consecutive rows are 16 bytes apart and the offsets run 0, 16, ..., 192.
 * The order here must match the order of the rows in the table.
 *
 *   _sInvLn2       multiplier applied to |x| before adding the shifter
 *   _sLn2hi/_sLn2lo two-part constant subtracted (times N) to form R
 *   _sSign         mask used to isolate the sign bit of x
 *   _sShifter      constant added then subtracted to obtain N
 *   _iDomainRange  integer threshold for the scalar fallback test
 *   _sPC1.._sPC6   polynomial coefficients
 *   _iHalf         bit pattern whose exponent is adjusted by +/-N
 */
37#define _sInvLn2 0
38#define _sLn2hi 16
39#define _sLn2lo 32
40#define _sSign 48
41#define _sShifter 64
42#define _iDomainRange 80
43#define _sPC1 96
44#define _sPC2 112
45#define _sPC3 128
46#define _sPC4 144
47#define _sPC5 160
48#define _sPC6 176
49#define _iHalf 192
50
51#include <sysdep.h>
52
/*
 * _ZGVbN4v_sinhf_sse4 -- sinhf on four packed single-precision lanes.
 *
 * In:   xmm0 = x (four floats)
 * Out:  xmm0 = sinh(x) for each lane
 *
 * Fast path: strip the sign, reduce |x| = N*ln2 + R, evaluate two short
 * polynomials in R, scale them by powers of two that are built from N
 * with integer arithmetic on the float exponent field, and finally OR the
 * sign back in.  Lanes whose |x| bit pattern compares greater than
 * _iDomainRange are recomputed one at a time by calling scalar sinhf.
 *
 * Scratch registers written: xmm1-xmm14, edx (and eax on the fallback
 * path).  r12, r13 and r14 are used on the fallback path only and are
 * saved/restored around it.
 *
 * Stack frame (72 bytes; only touched on the fallback path):
 *    0(%rsp)  saved r14
 *    8(%rsp)  saved r13
 *   16(%rsp)  saved r12
 *   32(%rsp)  copy of the input vector
 *   48(%rsp)  result vector, patched lane by lane
 */
53 .section .text.sse4, "ax", @progbits
54ENTRY(_ZGVbN4v_sinhf_sse4)
/* Reserve the frame.  With rsp = 8 (mod 16) on entry (SysV ABI), rsp is
   16-byte aligned afterwards, which is what the later call needs.  */
55 subq $72, %rsp
56 cfi_def_cfa_offset(80)
57
58 /*
59 * Implementation
60 * Abs argument
61 */
/* xmm14 = sign bits of x (kept until the final orps); xmm10 = copy.  */
62 movups _sSign+__svml_ssinh_data_internal(%rip), %xmm14
63 andps %xmm0, %xmm14
64 movaps %xmm14, %xmm10
65
66 /*
67 * Load argument
68 * dM = x/log(2) + RShifter
69 */
/* The pxor clears the sign: xmm10 = x ^ sign(x) = |x|.
   xmm7 = |x| * _sInvLn2.  */
70 movups _sInvLn2+__svml_ssinh_data_internal(%rip), %xmm7
71 pxor %xmm0, %xmm10
72 mulps %xmm10, %xmm7
73
74 /*
75 * Check for overflow\underflow
76 * MORE faster than GE?
77 */
/* xmm1 = |x|, held back for the integer range compare further down.  */
78 movaps %xmm10, %xmm1
79 movups _sShifter+__svml_ssinh_data_internal(%rip), %xmm2
80
81 /* sR = sX - sN*Log2_hi */
82 movups _sLn2hi+__svml_ssinh_data_internal(%rip), %xmm3
/* xmm7 = sM = |x|*InvLn2 + shifter.  With the shifter value in the table
   (0x4b400000) the integer N ends up in the low mantissa bits of sM.  */
83 addps %xmm2, %xmm7
84
85 /*
86 * R
87 * sN = sM - RShifter
88 */
89 movaps %xmm7, %xmm4
90
91 /*
92 * G1, G2 2^N, 2^(-N)
93 * iM now is an EXP(2^N)
94 */
/* Integer shift of each dword: the low bits of sM (holding N) move up to
   the exponent field position; the shifter's own bits are shifted out.  */
95 pslld $23, %xmm7
96
97 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
98 movups _sLn2lo+__svml_ssinh_data_internal(%rip), %xmm5
/* xmm4 = sN (N as a float); xmm3 = sN*Ln2hi; xmm5 = sN*Ln2lo;
   xmm10 = |x| - sN*Ln2hi.  */
99 subps %xmm2, %xmm4
100 mulps %xmm4, %xmm3
101 mulps %xmm4, %xmm5
102 subps %xmm3, %xmm10
103
104 /*
105 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
106 * sSinh_r = (a3+r^2*a5)
107 */
108 movups _sPC5+__svml_ssinh_data_internal(%rip), %xmm8
/* xmm10 = sR, the reduced argument.  */
109 subps %xmm5, %xmm10
110
111 /* sR2 = sR^2 */
112 movaps %xmm10, %xmm12
113 mulps %xmm10, %xmm12
114
115 /*
116 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
117 * sOut = (a4 +a6*sR2)
118 */
/* Two independent chains are interleaved from here on:
     xmm8: odd-coefficient chain  (_sPC5, _sPC3) -> sinh(R) correction
     xmm9: even-coefficient chain (_sPC6, _sPC4, _sPC2) -> sOut  */
119 movups _sPC6+__svml_ssinh_data_internal(%rip), %xmm9
120 mulps %xmm12, %xmm8
121 mulps %xmm12, %xmm9
122 addps _sPC3+__svml_ssinh_data_internal(%rip), %xmm8
123 addps _sPC4+__svml_ssinh_data_internal(%rip), %xmm9
124
125 /* sSinh_r = r^2*(a3+r^2*a5) */
126 mulps %xmm12, %xmm8
127
128 /* sOut = a2+sR2*(a4+a6*sR2) */
129 mulps %xmm12, %xmm9
130
131 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
132 mulps %xmm10, %xmm8
133 addps _sPC2+__svml_ssinh_data_internal(%rip), %xmm9
/* xmm10 = sR + sR*sR2*(PC3 + sR2*PC5), the sinh(R) approximation.  */
134 addps %xmm8, %xmm10
135
136 /* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
137 mulps %xmm9, %xmm12
/* Integer add/sub of (N << 23) on the _iHalf bit pattern adjusts only its
   exponent: xmm13 = _iHalf + (N<<23), xmm6 = _iHalf - (N<<23), i.e. the
   2^(N-1) and 2^(-N-1) terms named in the comments below.  */
138 movdqu _iHalf+__svml_ssinh_data_internal(%rip), %xmm6
139 movdqa %xmm6, %xmm13
140 psubd %xmm7, %xmm6
141 paddd %xmm7, %xmm13
142
143 /* sG1 = 2^(N-1)+2^(-N-1) */
144 movdqa %xmm13, %xmm11
145
146 /* sG2 = 2^(N-1)-2^(-N-1) */
/* After these two instructions: xmm13 = sG2, xmm11 = sG1.  */
147 subps %xmm6, %xmm13
148 addps %xmm6, %xmm11
149
150 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
151 mulps %xmm13, %xmm12
152
153 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
154 mulps %xmm10, %xmm11
/* Range test: signed 32-bit integer compare of the |x| bit patterns
   against _iDomainRange; each lane of xmm1 becomes all-ones where |x| is
   greater.  movmskps then packs the four lane results into bits 0..3 of
   edx.  */
155 pcmpgtd _iDomainRange+__svml_ssinh_data_internal(%rip), %xmm1
156 addps %xmm11, %xmm12
157 movmskps %xmm1, %edx
158
159 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
160 addps %xmm12, %xmm13
161
162 /* Ret H */
/* Merge the sign bits saved in xmm14 with the result computed for |x|;
   xmm14 now holds the fast-path result.  edx != 0 means at least one lane
   failed the range test.  */
163 orps %xmm13, %xmm14
164 testl %edx, %edx
165
166 /* Go to special inputs processing branch */
167 jne L(SPECIAL_VALUES_BRANCH)
168 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm14
169
170 /* Restore registers
171 * and exit the function
172 */
173
174L(EXIT):
/* Common exit: return xmm14 in xmm0 and release the 72-byte frame.  */
175 movaps %xmm14, %xmm0
176 addq $72, %rsp
177 cfi_def_cfa_offset(8)
178 ret
/* CFI state for the code below, which is reached by the jne above while
   the frame is still allocated.  */
179 cfi_def_cfa_offset(80)
180
181 /* Branch to process
182 * special inputs
183 */
184
185L(SPECIAL_VALUES_BRANCH):
/* Spill the untouched input (xmm0) and the fast-path result (xmm14) so
   individual lanes can be read and overwritten through memory.  */
186 movups %xmm0, 32(%rsp)
187 movups %xmm14, 48(%rsp)
188 # LOE rbx rbp r12 r13 r14 r15 edx
189
/* Save r12/r13/r14 in the frame before reusing them; they carry the loop
   state across the sinhf call.  r12d = lane index (starts at 0),
   r13d = range mask from edx.  */
190 xorl %eax, %eax
191 movq %r12, 16(%rsp)
192 cfi_offset(12, -64)
193 movl %eax, %r12d
194 movq %r13, 8(%rsp)
195 cfi_offset(13, -72)
196 movl %edx, %r13d
197 movq %r14, (%rsp)
198 cfi_offset(14, -80)
199 # LOE rbx rbp r15 r12d r13d
200
201 /* Range mask
202 * bits check
203 */
204
205L(RANGEMASK_CHECK):
/* CF = bit r12d of the mask; set means this lane needs the scalar call.  */
206 btl %r12d, %r13d
207
208 /* Call scalar math function */
209 jc L(SCALAR_MATH_CALL)
210 # LOE rbx rbp r15 r12d r13d
211
212 /* Special inputs
213 * processing loop
214 */
215
216L(SPECIAL_VALUES_LOOP):
/* Advance to the next lane; continue while the index is below 4.  */
217 incl %r12d
218 cmpl $4, %r12d
219
220 /* Check bits in range mask */
221 jl L(RANGEMASK_CHECK)
222 # LOE rbx rbp r15 r12d r13d
223
/* All four lanes visited: restore the saved registers and reload the
   (possibly patched) result vector for the common exit.  */
224 movq 16(%rsp), %r12
225 cfi_restore(12)
226 movq 8(%rsp), %r13
227 cfi_restore(13)
228 movq (%rsp), %r14
229 cfi_restore(14)
230 movups 48(%rsp), %xmm14
231
232 /* Go to exit */
233 jmp L(EXIT)
/* CFI state for the code below, which runs while r12-r14 are still saved
   in the frame.  */
234 cfi_offset(12, -64)
235 cfi_offset(13, -72)
236 cfi_offset(14, -80)
237 # LOE rbx rbp r12 r13 r14 r15 xmm14
238
239 /* Scalar math function call
240 * to process special input
241 */
242
243L(SCALAR_MATH_CALL):
/* The 32-bit move zero-extends the lane index into r14 so it can be used
   as a 64-bit index register.  Load lane r14 of the saved input, call
   scalar sinhf, and overwrite the same lane of the saved result.  */
244 movl %r12d, %r14d
245 movss 32(%rsp, %r14, 4), %xmm0
246 call sinhf@PLT
247 # LOE rbx rbp r14 r15 r12d r13d xmm0
248
249 movss %xmm0, 48(%rsp, %r14, 4)
250
251 /* Process special inputs in loop */
252 jmp L(SPECIAL_VALUES_LOOP)
253 # LOE rbx rbp r15 r12d r13d
254END(_ZGVbN4v_sinhf_sse4)
255
256 .section .rodata, "a"
257 .align 16
258
/*
 * Constant table for the vector sinhf code.  Each row is one 32-bit value
 * repeated four times (one copy per SIMD lane) and is followed by
 * .align 16, so rows sit 16 bytes apart in the order given by the offset
 * macros (_sInvLn2 = 0 ... _iHalf = 192).
 *
 * Approximate values of the bit patterns read as IEEE single precision:
 *   _sInvLn2       0x3FB8AA3B  ~ 1.442695  (1/ln 2)
 *   _sLn2hi        0x3F317000  ~ 0.693115  (high part of ln 2)
 *   _sLn2lo        0x3805FDF4  ~ 3.19e-5   (low part of ln 2)
 *   _sSign         0x80000000  sign-bit mask
 *   _sShifter      0x4b400000  = 1.5 * 2^23
 *   _iDomainRange  0x42AEAC4E  ~ 87.3365, compared as an integer
 *   _sPC1          0x3F800000  = 1.0
 *   _sPC2          0x3f000000  = 0.5
 *   _sPC3          0x3e2aaa57  ~ 1/6
 *   _sPC4          0x3d2aaa72  ~ 1/24
 *   _sPC5          0x3c091461  ~ 0.00837 (near 1/120)
 *   _sPC6          0x3ab6a8a3  ~ 0.00139 (near 1/720)
 *   _iHalf         0x3f000000  bit pattern of 0.5f, used with integer ops
 *
 * NOTE(review): the typedef below is C, not assembly; it is guarded by a
 * macro that the visible source never defines, so it appears to serve
 * only as a description of the layout -- confirm before relying on it.
 */
259#ifdef __svml_ssinh_data_internal_typedef
260typedef unsigned int VUINT32;
261typedef struct {
262 __declspec(align(16)) VUINT32 _sInvLn2[4][1];
263 __declspec(align(16)) VUINT32 _sLn2hi[4][1];
264 __declspec(align(16)) VUINT32 _sLn2lo[4][1];
265 __declspec(align(16)) VUINT32 _sSign[4][1];
266 __declspec(align(16)) VUINT32 _sShifter[4][1];
267 __declspec(align(16)) VUINT32 _iDomainRange[4][1];
268 __declspec(align(16)) VUINT32 _sPC1[4][1];
269 __declspec(align(16)) VUINT32 _sPC2[4][1];
270 __declspec(align(16)) VUINT32 _sPC3[4][1];
271 __declspec(align(16)) VUINT32 _sPC4[4][1];
272 __declspec(align(16)) VUINT32 _sPC5[4][1];
273 __declspec(align(16)) VUINT32 _sPC6[4][1];
274 __declspec(align(16)) VUINT32 _iHalf[4][1];
275} __svml_ssinh_data_internal;
276#endif
277__svml_ssinh_data_internal:
278 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
279 .align 16
280 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
281 .align 16
282 .long 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */
283 .align 16
284 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
285 .align 16
286 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
287 .align 16
288 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
289 .align 16
/* _sPC1 is not loaded by the function in this file; the leading
   coefficient 1 is applied implicitly (r + r*(...)).  */
290 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
291 .align 16
292 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
293 .align 16
294 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
295 .align 16
296 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
297 .align 16
298 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
299 .align 16
300 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
301 // Integer constants
302 .align 16
303 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
304 .align 16
/* Symbol type and size for the table (size = distance from the label to
   this point, including the trailing alignment padding).  */
305 .type __svml_ssinh_data_internal, @object
306 .size __svml_ssinh_data_internal, .-__svml_ssinh_data_internal
307

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf4_core_sse4.S */