/* Function coshf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
18
/*
 *   ALGORITHM DESCRIPTION:
 *
 *   Compute cosh(x) as (exp(x)+exp(-x))/2,
 *   where exp is calculated as
 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *   Special cases:
 *
 *   cosh(NaN)     = quiet NaN, and raise invalid exception
 *   cosh(+/-INF)  = +INF
 *   cosh(0)       = 1
 *   cosh(x) overflows (result is +INF) for |x| > MAXLOG+log(2)
 *
 */
34
/* Byte offsets into the constant table __svml_scosh_data_internal.
   Every table row is 16 bytes (one 32-bit value replicated in four
   lanes), so consecutive offsets differ by 16.  The order here must
   match the order of the .long rows in the table itself.  */
#define _sInvLn2	0
#define _sLn2hi		16
#define _sLn2lo		32
#define _sSign		48
#define _sShifter	64
#define _iDomainRange	80
#define _sPC1		96
#define _sPC2		112
#define _sPC3		128
#define _sPC4		144
#define _sPC5		160
#define _sPC6		176
#define _iHalf		192

#include <sysdep.h>
52
/* _ZGVbN4v_coshf_sse4 -- cosh on four packed single-precision lanes.

   In:    %xmm0 = x[0..3]
   Out:   %xmm0 = cosh (x[i]) in each lane

   Fast path (all lanes have |x| < _iDomainRange):
     N  = round (|x| / ln2),  r = |x| - N*ln2  (ln2 split into hi + lo)
     G1 = 2^(N-1) - 2^(-N-1), G2 = 2^(N-1) + 2^(-N-1)
     cosh (x) = G2 + G1*sinh (r) + G2*(cosh (r) - 1)
   with sinh (r) and cosh (r) - 1 evaluated as short polynomials in r.

   Slow path: every lane whose |x| bit pattern is >= _iDomainRange
   (this includes INF and NaN) is recomputed with the scalar coshf.

   Stack frame (72 bytes, CFA = %rsp + 80, so %rsp is 16-byte aligned
   at the call to coshf); used only on the slow path:
	 0(%rsp)  saved %r14
	 8(%rsp)  saved %r13
	16(%rsp)  saved %r12
	32(%rsp)  copy of the input vector
	48(%rsp)  result vector, patched lane by lane  */

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_coshf_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)

	/* sX = |x|: the sign mask is loaded here and and-not'ed with
	   the input below (cosh is an even function).  */
	movups	_sSign+__svml_scosh_data_internal(%rip), %xmm1

	/* sM = |x|/ln2 + Shifter.  Adding the shifter leaves the
	   rounded integer N in the low bits of the float.  */
	movups	_sInvLn2+__svml_scosh_data_internal(%rip), %xmm9
	andnps	%xmm0, %xmm1
	mulps	%xmm1, %xmm9

	/* Keep a copy of |x| for the overflow/special-input check.  */
	movaps	%xmm1, %xmm3
	movups	_sShifter+__svml_scosh_data_internal(%rip), %xmm4
	movups	_sLn2hi+__svml_scosh_data_internal(%rip), %xmm5
	addps	%xmm4, %xmm9

	/* sN = sM - Shifter: N as a float, used for range reduction.  */
	movaps	%xmm9, %xmm6

	/* iM = N << 23: move N into the exponent field so that integer
	   add/subtract on the bits of 0.5 yields 2^(N-1) and 2^(-N-1).  */
	pslld	$23, %xmm9
	movups	_sLn2lo+__svml_scosh_data_internal(%rip), %xmm7
	subps	%xmm4, %xmm6

	/* sN*Ln2hi (subtracted from sX below).  */
	mulps	%xmm6, %xmm5

	/* sN*Ln2lo (subtracted from sX below).  */
	mulps	%xmm6, %xmm7

	/* Special-input mask: integer compare of the |x| bit pattern
	   against _iDomainRange; "greater" OR "equal" gives
	   |x| >= _iDomainRange.  %xmm1 still holds |x| here.  */
	movdqu	_iDomainRange+__svml_scosh_data_internal(%rip), %xmm2
	pcmpgtd	%xmm2, %xmm3
	pcmpeqd	%xmm1, %xmm2

	/* sinh(r) = r + r*(r^2*(a3 + r^2*a5));  load a5.  */
	movups	_sPC5+__svml_scosh_data_internal(%rip), %xmm10
	por	%xmm2, %xmm3

	/* cosh(r) - 1 = r^2*(a2 + r^2*(a4 + r^2*a6));  load a6.  */
	movups	_sPC6+__svml_scosh_data_internal(%rip), %xmm11
	subps	%xmm5, %xmm1

	/* edx = one bit per lane that needs the scalar fallback.  */
	movmskps %xmm3, %edx
	movdqu	_iHalf+__svml_scosh_data_internal(%rip), %xmm8

	/* sR = (sX - sN*Ln2hi) - sN*Ln2lo.  */
	subps	%xmm7, %xmm1

	/* sR2 = sR^2; the integer ops building 2^(N-1) (in %xmm2) and
	   2^(-N-1) (in %xmm8) are interleaved with the polynomial.  */
	movaps	%xmm1, %xmm13
	movdqa	%xmm8, %xmm2
	mulps	%xmm1, %xmm13
	paddd	%xmm9, %xmm2
	mulps	%xmm13, %xmm10
	psubd	%xmm9, %xmm8
	mulps	%xmm13, %xmm11

	/* %xmm10 = a3 + sR2*a5,  %xmm11 = a4 + sR2*a6.  */
	addps	_sPC3+__svml_scosh_data_internal(%rip), %xmm10
	addps	_sPC4+__svml_scosh_data_internal(%rip), %xmm11

	/* %xmm10 = sR2*(a3 + sR2*a5).  */
	mulps	%xmm13, %xmm10

	/* %xmm11 = sR2*(a4 + sR2*a6).  */
	mulps	%xmm13, %xmm11

	/* sSinh_r = sR + sR*(sR2*(a3 + sR2*a5)), left in %xmm1.  */
	mulps	%xmm1, %xmm10
	addps	_sPC2+__svml_scosh_data_internal(%rip), %xmm11
	addps	%xmm10, %xmm1

	/* sOut = sR2*(a2 + sR2*(a4 + sR2*a6)), i.e. cosh(sR) - 1.  */
	mulps	%xmm11, %xmm13

	/* sG1 = 2^(N-1) - 2^(-N-1)  (finished by the subps below).  */
	movdqa	%xmm2, %xmm12

	/* sG2 = 2^(N-1) + 2^(-N-1).  */
	addps	%xmm8, %xmm2
	subps	%xmm8, %xmm12

	/* sOut = sG2*(cosh(sR) - 1).  */
	mulps	%xmm2, %xmm13

	/* sOut = sG1*sinh(sR) + sG2*(cosh(sR) - 1).  */
	mulps	%xmm1, %xmm12
	addps	%xmm12, %xmm13

	/* cosh(X) = sG2 + sG1*sinh(sR) + sG2*(cosh(sR) - 1).  */
	addps	%xmm13, %xmm2

	/* Any lane flagged as special?  */
	testl	%edx, %edx

	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm2

	/* Common exit: move the result into the return register,
	   release the frame and return.  */

L(EXIT):
	movaps	%xmm2, %xmm0
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* Code below runs with the frame still allocated.  */
	cfi_def_cfa_offset(80)

	/* Special inputs: spill the input and the fast-path result so
	   individual lanes can be read and overwritten in memory.  */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm0, 32(%rsp)
	movups	%xmm2, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* Save the callee-saved registers used by the loop:
	   r12d = lane index (starts at 0), r13d = lane mask,
	   r14 = lane index kept across the scalar call.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Test the mask bit of the current lane.  */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Bit set: this lane needs the scalar math function.  */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Advance to the next lane; loop over all four lanes.  */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* More lanes to check.  */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	/* Done: restore callee-saved registers and reload the patched
	   result vector for the common exit.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm2

	/* Go to exit.  */
	jmp	L(EXIT)
	/* The code below is reached only from inside the loop, where
	   r12-r14 are still saved in the frame.  */
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm2

	/* Scalar fallback for one lane: load x[i] from the spilled
	   input, call coshf, and store the result over lane i of the
	   spilled result vector.  The 32-bit move zero-extends the
	   lane index into r14 for use in the address.  */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0
	call	coshf@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movss	%xmm0, 48(%rsp, %r14, 4)

	/* Process special inputs in loop.  */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_coshf_sse4)
252
/* Constant table for _ZGVbN4v_coshf_sse4.  Each row is 16 bytes: one
   32-bit value replicated in all four lanes and aligned to 16 so it
   can be used directly as an SSE memory operand.  The row order must
   match the byte offsets #defined at the top of this file.  */
	.section .rodata, "a"
	.align	16

/* Layout of the table written as a C struct, for reference only; the
   guarding macro is not defined anywhere in this file.  */
#ifdef __svml_scosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _sInvLn2[4][1];
	__declspec(align(16)) VUINT32 _sLn2hi[4][1];
	__declspec(align(16)) VUINT32 _sLn2lo[4][1];
	__declspec(align(16)) VUINT32 _sSign[4][1];
	__declspec(align(16)) VUINT32 _sShifter[4][1];
	__declspec(align(16)) VUINT32 _iDomainRange[4][1];
	__declspec(align(16)) VUINT32 _sPC1[4][1];
	__declspec(align(16)) VUINT32 _sPC2[4][1];
	__declspec(align(16)) VUINT32 _sPC3[4][1];
	__declspec(align(16)) VUINT32 _sPC4[4][1];
	__declspec(align(16)) VUINT32 _sPC5[4][1];
	__declspec(align(16)) VUINT32 _sPC6[4][1];
	__declspec(align(16)) VUINT32 _iHalf[4][1];
} __svml_scosh_data_internal;
#endif
__svml_scosh_data_internal:
	/* 1/ln(2); k=0.  */
	.long	0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */
	.align	16
	/* High part of ln(2) (low mantissa bits zero).  */
	.long	0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
	.align	16
	/* Low part of ln(2): ln(2) - _sLn2hi.  */
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
	.align	16
	/* Sign-bit mask, used to form |x|.  */
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
	.align	16
	/* 1.5 * 2^23: adding it rounds a float to an integer held in
	   the low mantissa bits.  */
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	.align	16
	/* Fast-path limit (about 87.34 as a float); lanes with
	   |x| >= this value go to the scalar fallback.  */
	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
	.align	16
	/* a1 = 1.0; not referenced by the code in this file.  */
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
	.align	16
	/* a2 = 0.5.  */
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
	.align	16
	/* a3, close to 1/6.  */
	.long	0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
	.align	16
	/* a4, close to 1/24.  */
	.long	0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
	.align	16
	/* a5, near 1/120.  */
	.long	0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
	.align	16
	/* a6, near 1/720.  */
	.long	0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
	/* Integer constants.  */
	.align	16
	/* Bit pattern of 0.5f; N << 23 is added to / subtracted from it
	   as an integer to build 2^(N-1) and 2^(-N-1).  */
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
	.align	16
	.type	__svml_scosh_data_internal, @object
	.size	__svml_scosh_data_internal, .-__svml_scosh_data_internal
304

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_coshf4_core_sse4.S */