/* Function sinhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute sinh(x) as (exp(x)-exp(-x))/2,
 *   where exp is calculated as
 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *   Special cases:
 *
 *   sinh(NaN) = quiet NaN, and raises the invalid exception
 *   sinh(INF) = that INF
 *   sinh(x) = x for subnormal x
 *   sinh(x) overflows for |x| above roughly MAXLOG + log(2)
 *
 */
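
/*
 * Illustrative scalar model of the main path below (not assembled, not part
 * of the build): the shifter/exponent-field tricks are replaced by
 * nearbyintf/ldexpf and the table coefficients by plain Taylor terms;
 * out-of-range, NaN and subnormal handling is omitted (those lanes go to
 * the scalar sinhf in the code below).  Function and variable names here
 * are illustrative only.
 *
 *   #include <math.h>
 *
 *   static float
 *   sinhf_sketch (float x)
 *   {
 *     const float inv_ln2 = 1.4426950408889634f;
 *     const float ln2_hi = 0x1.62e000p-1f, ln2_lo = 0x1.0bfbe8p-15f;
 *     const float a2 = 1.0f / 2, a3 = 1.0f / 6, a4 = 1.0f / 24;
 *     const float a5 = 1.0f / 120, a6 = 1.0f / 720;
 *     float ax = fabsf (x);
 *     float n = nearbyintf (ax * inv_ln2);            // N = rint(|x|/ln2)
 *     float r = (ax - n * ln2_hi) - n * ln2_lo;       // reduced argument
 *     float r2 = r * r;
 *     float g1 = ldexpf (1.0f, (int) n - 1) + ldexpf (1.0f, -(int) n - 1);
 *     float g2 = ldexpf (1.0f, (int) n - 1) - ldexpf (1.0f, -(int) n - 1);
 *     float sinh_r = r + r * (r2 * (a3 + r2 * a5));   // ~ sinh(r)
 *     float cosh_t = r2 * (a2 + r2 * (a4 + r2 * a6)); // ~ cosh(r) - 1
 *     // sinh(|x|) = sinh(N*ln2)*cosh(r) + cosh(N*ln2)*sinh(r)
 *     //           = g2 + g1*sinh_r + g2*cosh_t, with g1 = cosh(N*ln2),
 *     //             g2 = sinh(N*ln2)
 *     return copysignf (g2 + g1 * sinh_r + g2 * cosh_t, x);
 *   }
 */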

/* Offsets for data table __svml_ssinh_data_internal
 */
#define _sInvLn2                        0
#define _sLn2hi                         32
#define _sLn2lo                         64
#define _sSign                          96
#define _sShifter                       128
#define _iDomainRange                   160
#define _sPC1                           192
#define _sPC2                           224
#define _sPC3                           256
#define _sPC4                           288
#define _sPC5                           320
#define _sPC6                           352
#define _iHalf                          384
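
/* Each field of the table is one 32-byte YMM constant (the same 32-bit
   value broadcast to all 8 lanes), so the offsets advance in steps of 32.  */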

#include <sysdep.h>

        .section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_sinhf_avx2)
        pushq   %rbp
        cfi_def_cfa_offset(16)
        movq    %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq    $-32, %rsp
        subq    $96, %rsp
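        /* The 96-byte, 32-byte-aligned frame is only touched on the slow
         * path: (%rsp)/8(%rsp)/16(%rsp) save %r14/%r13/%r12, 32(%rsp) holds
         * the input vector and 64(%rsp) the result vector for per-lane
         * fixups.
         */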
        vmovups _sInvLn2+__svml_ssinh_data_internal(%rip), %ymm7
        vmovups _sShifter+__svml_ssinh_data_internal(%rip), %ymm4
        vmovups _sLn2hi+__svml_ssinh_data_internal(%rip), %ymm5

        /*
         * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2))
         * sOut = (a4 + a6*sR2)
         */
        vmovups _sPC6+__svml_ssinh_data_internal(%rip), %ymm14

        /*
         * sinh(r) = r*(1 + r^2*(a3 + r^2*a5)) = r + r*(r^2*(a3 + r^2*a5))
         *           (the a7 term of the full expansion is not used here)
         * sSinh_r = (a3 + r^2*a5)
         */
        vmovups _sPC5+__svml_ssinh_data_internal(%rip), %ymm12
        vmovups _iHalf+__svml_ssinh_data_internal(%rip), %ymm8
        vmovaps %ymm0, %ymm2

        /*
         * Implementation
         * Abs argument
         */
        vandps  _sSign+__svml_ssinh_data_internal(%rip), %ymm2, %ymm1
        vxorps  %ymm2, %ymm1, %ymm0

        /*
         * Load argument
         * dM = x/log(2) + RShifter
         */
        vfmadd213ps %ymm4, %ymm0, %ymm7
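        /* The shifter (1.5*2^23) makes the FMA do the rounding: at that
         * magnitude one ulp equals 1.0, so the low mantissa bits of sM now
         * hold N = rint(|x|/ln2).  sN = sM - RShifter below recovers N as a
         * float, and the raw bits of sM feed the 2^N construction.
         */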

        /*
         * R
         * sN = sM - RShifter
         */
        vsubps  %ymm4, %ymm7, %ymm6

        /*
         * G1, G2 2^N, 2^(-N)
         * iM now is an EXP(2^N)
         */
        vpslld  $23, %ymm7, %ymm9

        /*
         * Check for overflow/underflow
         * (GT used; is it faster than GE?)
         */
        vpcmpgtd _iDomainRange+__svml_ssinh_data_internal(%rip), %ymm0, %ymm3
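        /* _iDomainRange holds the bits of ~87.33f; since |x| is positive its
         * bit pattern compares correctly as a signed integer, so this flags
         * lanes (including NaN/Inf) that must take the scalar fallback path.
         */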

        /* sR = sX - sN*Log2_hi */
        vfnmadd231ps %ymm5, %ymm6, %ymm0
        vpaddd  %ymm9, %ymm8, %ymm10
        vpsubd  %ymm9, %ymm8, %ymm11
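        /* ymm9 is N << 23, i.e. N placed in the exponent field.  Integer
         * add/sub against the bits of 0.5f (_iHalf) shifts its biased
         * exponent by +/-N, giving 2^(N-1) in ymm10 and 2^(-N-1) in ymm11.
         */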

        /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
        vfnmadd231ps _sLn2lo+__svml_ssinh_data_internal(%rip), %ymm6, %ymm0

        /* sR2 = sR^2 */
        vmulps  %ymm0, %ymm0, %ymm13
        vfmadd213ps _sPC4+__svml_ssinh_data_internal(%rip), %ymm13, %ymm14
        vfmadd213ps _sPC3+__svml_ssinh_data_internal(%rip), %ymm13, %ymm12

        /* sOut = a2+sR2*(a4+a6*sR2) */
        vfmadd213ps _sPC2+__svml_ssinh_data_internal(%rip), %ymm13, %ymm14

        /* sSinh_r = r^2*(a3+r^2*a5) */
        vmulps  %ymm12, %ymm13, %ymm12

        /* sOut = sR2*(a2+sR2*(a4+a6*sR2)) */
        vmulps  %ymm14, %ymm13, %ymm15

        /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
        vfmadd213ps %ymm0, %ymm0, %ymm12
        vmovmskps %ymm3, %edx

        /* sG1 = 2^(N-1)+2^(-N-1) */
        vaddps  %ymm11, %ymm10, %ymm3

        /* sG2 = 2^(N-1)-2^(-N-1) */
        vsubps  %ymm11, %ymm10, %ymm10
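        /* sG1 = cosh(N*ln2) and sG2 = sinh(N*ln2) (the /2 is already folded
         * in via _iHalf), so sinh(|x|) = sG2*cosh(sR) + sG1*sinh(sR), which
         * is assembled below from the two polynomial parts.
         */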

        /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
        vmulps  %ymm15, %ymm10, %ymm0

        /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
        vfmadd213ps %ymm0, %ymm12, %ymm3

        /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
        vaddps  %ymm3, %ymm10, %ymm4

        /* Ret H (reapply the sign of x) */
        vorps   %ymm4, %ymm1, %ymm0
        testl   %edx, %edx

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx ymm0 ymm2

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        movq    %rbp, %rsp
        popq    %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)

        /* Branch to process
         * special inputs
         */

L(SPECIAL_VALUES_BRANCH):
        vmovups %ymm2, 32(%rsp)
        vmovups %ymm0, 64(%rsp)
        # LOE rbx r12 r13 r14 r15 edx ymm0
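        /* Slow path: the original input is saved at 32(%rsp) and the
         * fast-path result at 64(%rsp); edx carries one mask bit per lane.
         * The loop below scans bits 0..7 and re-evaluates every flagged
         * lane with the scalar sinhf.
         */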

        xorl    %eax, %eax
        # LOE rbx r12 r13 r14 r15 eax edx

        vzeroupper
        movq    %r12, 16(%rsp)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $8, %r12d

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        vmovups 64(%rsp), %ymm0

        /* Go to exit */
        jmp     L(EXIT)
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 ymm0

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        vmovss  32(%rsp, %r14, 4), %xmm0
        call    sinhf@PLT
        # LOE rbx r14 r15 r12d r13d xmm0

        vmovss  %xmm0, 64(%rsp, %r14, 4)
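        /* The scalar result overwrites lane %r14d of the saved vector
         * result; lanes that passed the range check keep their fast-path
         * values and are reloaded as one vector at the join point above.
         */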

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVdN8v_sinhf_avx2)

        .section .rodata, "a"
        .align  32

#ifdef __svml_ssinh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(32)) VUINT32 _sInvLn2[8][1];
        __declspec(align(32)) VUINT32 _sLn2hi[8][1];
        __declspec(align(32)) VUINT32 _sLn2lo[8][1];
        __declspec(align(32)) VUINT32 _sSign[8][1];
        __declspec(align(32)) VUINT32 _sShifter[8][1];
        __declspec(align(32)) VUINT32 _iDomainRange[8][1];
        __declspec(align(32)) VUINT32 _sPC1[8][1];
        __declspec(align(32)) VUINT32 _sPC2[8][1];
        __declspec(align(32)) VUINT32 _sPC3[8][1];
        __declspec(align(32)) VUINT32 _sPC4[8][1];
        __declspec(align(32)) VUINT32 _sPC5[8][1];
        __declspec(align(32)) VUINT32 _sPC6[8][1];
        __declspec(align(32)) VUINT32 _iHalf[8][1];
} __svml_ssinh_data_internal;
#endif
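/* Approximate decoded values of the 8-lane broadcast constants below:
   _sInvLn2 ~ 1.4426950 (1/ln2); _sLn2hi ~ 0.69311523 and _sLn2lo ~ 3.19e-05
   (two-piece ln2); _sSign = sign-bit mask; _sShifter = 1.5*2^23;
   _iDomainRange ~ 87.33f (scalar-fallback threshold); _sPC1.._sPC6 close to
   1, 1/2!, 1/3!, 1/4!, 1/5!, 1/6!; _iHalf = 0.5f.  */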
__svml_ssinh_data_internal:
        .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
        .align 32
        .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
        .align 32
        .long 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */
        .align 32
        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
        .align 32
        .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
        .align 32
        .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
        .align 32
        .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
        .align 32
        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
        .align 32
        .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
        .align 32
        .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
        .align 32
        .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
        .align 32
        .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
        // Integer constants
        .align 32
        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
        .align 32
        .type __svml_ssinh_data_internal, @object
        .size __svml_ssinh_data_internal, .-__svml_ssinh_data_internal

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S