1/* Function sinhf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute sinh(x) as (exp(x)-exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * sinh(NaN) = quiet NaN, and raise invalid exception
29 * sinh(INF) = that INF
30 * sinh(x) = x for subnormals
31 * sinh(x) overflows for big x and returns MAXLOG+log(2)
32 *
33 */
34
35/* Offsets for data table __svml_ssinh_data_internal
36 */
37#define _sInvLn2 0
38#define _sLn2hi 64
39#define _sLn2lo 128
40#define _sSign 192
41#define _sShifter 256
42#define _iDomainRange 320
43#define _sPC1 384
44#define _sPC2 448
45#define _sPC3 512
46#define _sPC4 576
47#define _sPC5 640
48#define _sPC6 704
49#define _iHalf 768
50
51#include <sysdep.h>
52
53 .section .text.evex512, "ax", @progbits
54ENTRY(_ZGVeN16v_sinhf_skx)
55 pushq %rbp
56 cfi_def_cfa_offset(16)
57 movq %rsp, %rbp
58 cfi_def_cfa(6, 16)
59 cfi_offset(6, -16)
60 andq $-64, %rsp
61 subq $192, %rsp
62 vmovaps %zmm0, %zmm5
63
64 /*
65 * Implementation
66 * Abs argument
67 */
68 vandps _sSign+__svml_ssinh_data_internal(%rip), %zmm5, %zmm4
69
70 /*
71 * Check for overflow\underflow
72 * MORE faster than GE?
73 */
74 vpternlogd $255, %zmm6, %zmm6, %zmm6
75 vmovups _sShifter+__svml_ssinh_data_internal(%rip), %zmm7
76
77 /*
78 * Load argument
79 * dM = x/log(2) + RShifter
80 */
81 vmovups _sInvLn2+__svml_ssinh_data_internal(%rip), %zmm11
82 vmovups _sLn2hi+__svml_ssinh_data_internal(%rip), %zmm8
83 vmovups _sLn2lo+__svml_ssinh_data_internal(%rip), %zmm10
84 vmovups _iHalf+__svml_ssinh_data_internal(%rip), %zmm12
85 vmovups _sPC5+__svml_ssinh_data_internal(%rip), %zmm0
86 vmovups _sPC6+__svml_ssinh_data_internal(%rip), %zmm3
87
88 /* x^2 */
89 vmovups _sPC2+__svml_ssinh_data_internal(%rip), %zmm2
90 vxorps %zmm5, %zmm4, %zmm1
91 vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm11
92 vpcmpd $2, _iDomainRange+__svml_ssinh_data_internal(%rip), %zmm1, %k1
93
94 /*
95 * G1, G2 2^N, 2^(-N)
96 * iM now is an EXP(2^N)
97 */
98 vpslld $23, %zmm11, %zmm13
99
100 /*
101 * R
102 * sN = sM - RShifter
103 */
104 vsubps {rn-sae}, %zmm7, %zmm11, %zmm9
105 vpaddd %zmm13, %zmm12, %zmm14
106 vpsubd %zmm13, %zmm12, %zmm15
107
108 /* sG1 = 2^(N-1)+2^(-N-1) */
109 vaddps {rn-sae}, %zmm15, %zmm14, %zmm7
110 vpandnd %zmm1, %zmm1, %zmm6{%k1}
111
112 /* sR = sX - sN*Log2_hi */
113 vfnmadd231ps {rn-sae}, %zmm8, %zmm9, %zmm1
114 vptestmd %zmm6, %zmm6, %k0
115
116 /* sG2 = 2^(N-1)-2^(-N-1) */
117 vsubps {rn-sae}, %zmm15, %zmm14, %zmm8
118
119 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
120 vfnmadd231ps {rn-sae}, %zmm10, %zmm9, %zmm1
121
122 /*
123 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
124 * sSinh_r = (a3+r^2*a5)
125 */
126 vmovups _sPC3+__svml_ssinh_data_internal(%rip), %zmm14
127 kmovw %k0, %edx
128
129 /* sR2 = sR^2 */
130 vmulps {rn-sae}, %zmm1, %zmm1, %zmm6
131 vfmadd231ps {rn-sae}, %zmm6, %zmm0, %zmm14
132
133 /* sSinh_r = r^2*(a3+r^2*a5) */
134 vmulps {rn-sae}, %zmm6, %zmm14, %zmm0
135
136 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
137 vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm0
138
139 /*
140 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
141 * sOut = (a4 +a6*sR2)
142 */
143 vmovups _sPC4+__svml_ssinh_data_internal(%rip), %zmm1
144 vfmadd231ps {rn-sae}, %zmm6, %zmm3, %zmm1
145
146 /* sOut = a2+sR2*(a4+a6*sR2) */
147 vfmadd213ps {rn-sae}, %zmm2, %zmm6, %zmm1
148
149 /* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
150 vmulps {rn-sae}, %zmm6, %zmm1, %zmm2
151
152 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
153 vmulps {rn-sae}, %zmm8, %zmm2, %zmm3
154
155 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
156 vfmadd213ps {rn-sae}, %zmm3, %zmm0, %zmm7
157
158 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
159 vaddps {rn-sae}, %zmm8, %zmm7, %zmm9
160
161 /* Ret H */
162 vorps %zmm9, %zmm4, %zmm0
163 testl %edx, %edx
164
165 /* Go to special inputs processing branch */
166 jne L(SPECIAL_VALUES_BRANCH)
167 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm5
168
169 /* Restore registers
170 * and exit the function
171 */
172
173L(EXIT):
174 movq %rbp, %rsp
175 popq %rbp
176 cfi_def_cfa(7, 8)
177 cfi_restore(6)
178 ret
179 cfi_def_cfa(6, 16)
180 cfi_offset(6, -16)
181
182 /* Branch to process
183 * special inputs
184 */
185
186L(SPECIAL_VALUES_BRANCH):
187 vmovups %zmm5, 64(%rsp)
188 vmovups %zmm0, 128(%rsp)
189 # LOE rbx r12 r13 r14 r15 edx zmm0
190
191 xorl %eax, %eax
192 # LOE rbx r12 r13 r14 r15 eax edx
193
194 vzeroupper
195 movq %r12, 16(%rsp)
196 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
197 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
198 movl %eax, %r12d
199 movq %r13, 8(%rsp)
200 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
201 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
202 movl %edx, %r13d
203 movq %r14, (%rsp)
204 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
205 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
206 # LOE rbx r15 r12d r13d
207
208 /* Range mask
209 * bits check
210 */
211
212L(RANGEMASK_CHECK):
213 btl %r12d, %r13d
214
215 /* Call scalar math function */
216 jc L(SCALAR_MATH_CALL)
217 # LOE rbx r15 r12d r13d
218
219 /* Special inputs
220 * processing loop
221 */
222
223L(SPECIAL_VALUES_LOOP):
224 incl %r12d
225 cmpl $16, %r12d
226
227 /* Check bits in range mask */
228 jl L(RANGEMASK_CHECK)
229 # LOE rbx r15 r12d r13d
230
231 movq 16(%rsp), %r12
232 cfi_restore(12)
233 movq 8(%rsp), %r13
234 cfi_restore(13)
235 movq (%rsp), %r14
236 cfi_restore(14)
237 vmovups 128(%rsp), %zmm0
238
239 /* Go to exit */
240 jmp L(EXIT)
241 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
242 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
243 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
244 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
245 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
246 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
247 # LOE rbx r12 r13 r14 r15 zmm0
248
249 /* Scalar math function call
250 * to process special input
251 */
252
253L(SCALAR_MATH_CALL):
254 movl %r12d, %r14d
255 vmovss 64(%rsp, %r14, 4), %xmm0
256 call sinhf@PLT
257 # LOE rbx r14 r15 r12d r13d xmm0
258
259 vmovss %xmm0, 128(%rsp, %r14, 4)
260
261 /* Process special inputs in loop */
262 jmp L(SPECIAL_VALUES_LOOP)
263 # LOE rbx r15 r12d r13d
264END(_ZGVeN16v_sinhf_skx)
265
266 .section .rodata, "a"
267 .align 64
268
269#ifdef __svml_ssinh_data_internal_typedef
270typedef unsigned int VUINT32;
271typedef struct {
272 __declspec(align(64)) VUINT32 _sInvLn2[16][1];
273 __declspec(align(64)) VUINT32 _sLn2hi[16][1];
274 __declspec(align(64)) VUINT32 _sLn2lo[16][1];
275 __declspec(align(64)) VUINT32 _sSign[16][1];
276 __declspec(align(64)) VUINT32 _sShifter[16][1];
277 __declspec(align(64)) VUINT32 _iDomainRange[16][1];
278 __declspec(align(64)) VUINT32 _sPC1[16][1];
279 __declspec(align(64)) VUINT32 _sPC2[16][1];
280 __declspec(align(64)) VUINT32 _sPC3[16][1];
281 __declspec(align(64)) VUINT32 _sPC4[16][1];
282 __declspec(align(64)) VUINT32 _sPC5[16][1];
283 __declspec(align(64)) VUINT32 _sPC6[16][1];
284 __declspec(align(64)) VUINT32 _iHalf[16][1];
285} __svml_ssinh_data_internal;
286#endif
287__svml_ssinh_data_internal:
288 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
289 .align 64
290 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
291 .align 64
292 .long 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4, 0x3805FDF4 /* _sLn2lo */
293 .align 64
294 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
295 .align 64
296 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
297 .align 64
298 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
299 .align 64
300 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
301 .align 64
302 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
303 .align 64
304 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
305 .align 64
306 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
307 .align 64
308 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
309 .align 64
310 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
311 // Integer constants
312 .align 64
313 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
314 .align 64
315 .type __svml_ssinh_data_internal, @object
316 .size __svml_ssinh_data_internal, .-__svml_ssinh_data_internal
317

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S