/* Function coshf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute cosh(x) as (exp(x)+exp(-x))/2,
 *   where exp is calculated as
 *   exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *   Special cases:
 *
 *   cosh(NaN) = quiet NaN, and raise invalid exception
 *   cosh(INF) = +INF (for either sign of the argument)
 *   cosh(0)   = 1
 *   cosh(x) overflows for big x, i.e. once |x| exceeds MAXLOG+log(2)
 *
 */

/* Byte offsets of the rows of the __svml_scosh_data_internal table.
   Every row holds eight identical 32-bit lanes and is aligned to 32
   bytes, so consecutive offsets differ by 32 and a row can be loaded
   into a YMM register with a single vmovups.  The order here must
   match the order of the .long rows in the table definition.  */
#define _sInvLn2			0	/* 1/ln(2) */
#define _sLn2hi				32	/* high part of ln(2) */
#define _sLn2lo				64	/* low part of ln(2) */
#define _sSign				96	/* sign-bit mask */
#define _sShifter			128	/* right-shifter used for rounding */
#define _iDomainRange			160	/* |x| threshold for the slow path */
#define _sPC1				192	/* polynomial coefficients a1..a6 */
#define _sPC2				224
#define _sPC3				256
#define _sPC4				288
#define _sPC5				320
#define _sPC6				352
#define _iHalf				384	/* bit pattern of 0.5f */

#include <sysdep.h>

/* _ZGVdN8v_coshf_avx2: coshf on eight packed single-precision lanes.
 *
 *   In:     ymm0 = x[0..7]
 *   Out:    ymm0 = cosh (x[0..7])
 *   Frame:  rbp-based; rsp is realigned to 32 bytes and 96 bytes are
 *           reserved, used only by the special-value path:
 *              0(%rsp)  saved r14
 *              8(%rsp)  saved r13
 *             16(%rsp)  saved r12
 *             32(%rsp)  copy of the input vector
 *             64(%rsp)  result vector (patched lane by lane)
 *
 * Main path, evaluated for every lane:
 *   N      = round (|x| / ln2)               (via the right-shifter)
 *   r      = |x| - N*ln2                     (ln2 split into hi + lo)
 *   G1     = 2^(N-1) - 2^(-N-1)
 *   G2     = 2^(N-1) + 2^(-N-1)
 *   result = G2 + G1*sinh(r) + G2*(cosh(r) - 1)
 * with sinh(r) and cosh(r)-1 taken from short polynomials in r^2.
 *
 * Any lane with |x| >= _iDomainRange (bitwise compare, so Inf and NaN
 * are included) is recomputed afterwards by calling scalar coshf.
 */
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_coshf_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* Realign to 32 bytes and reserve the 96-byte scratch area.  */
	andq	$-32, %rsp
	subq	$96, %rsp
	vmovups	_sSign+__svml_scosh_data_internal(%rip), %ymm2
	vmovups	_sShifter+__svml_scosh_data_internal(%rip), %ymm7

	/*
	 * Constants for the argument reduction:
	 * sM = |x|/log(2) + RShifter
	 */
	vmovups	_sInvLn2+__svml_scosh_data_internal(%rip), %ymm10
	vmovups	_sLn2hi+__svml_scosh_data_internal(%rip), %ymm8
	vmovups	_iDomainRange+__svml_scosh_data_internal(%rip), %ymm3

	/*
	 * sinh(r) is approximated as r + r*(r^2*(a3 + r^2*a5));
	 * ymm15 starts out holding a5.
	 */
	vmovups	_sPC5+__svml_scosh_data_internal(%rip), %ymm15
	vmovups	_iHalf+__svml_scosh_data_internal(%rip), %ymm11

	/* Keep the untouched input in ymm1 for the special-value path.  */
	vmovaps	%ymm0, %ymm1

	/* ymm0 = |x|  (input with the _sSign bits cleared).  */
	vandnps	%ymm1, %ymm2, %ymm0

	/* ymm10 = sM = |x| * InvLn2 + RShifter.  */
	vfmadd213ps %ymm7, %ymm0, %ymm10

	/* ymm9 = sN = sM - RShifter, i.e. N as a float.  */
	vsubps	%ymm7, %ymm10, %ymm9

	/*
	 * ymm12 = bit pattern of sM shifted left by 23, which moves the
	 * integer N held in the low mantissa bits of sM into the float
	 * exponent field.
	 */
	vpslld	$23, %ymm10, %ymm12

	/*
	 * Range check on the raw bits of |x|, compared as signed 32-bit
	 * integers.  The sign bit is already clear, so this orders like
	 * the float values, and Inf/NaN patterns compare greater too.
	 * ymm4 = (|x| > DomainRange), ymm5 = (|x| == DomainRange).
	 */
	vpcmpgtd %ymm3, %ymm0, %ymm4
	vpcmpeqd %ymm3, %ymm0, %ymm5

	/* sR = |x| - sN*Log2_hi */
	vfnmadd231ps %ymm8, %ymm9, %ymm0

	/* ymm13 = bits(0.5) + (N << 23) = 2^(N-1) */
	vpaddd	%ymm12, %ymm11, %ymm13

	/* ymm14 = bits(0.5) - (N << 23) = 2^(-N-1) */
	vpsubd	%ymm12, %ymm11, %ymm14

	/* ymm6 = per-lane mask for |x| >= DomainRange.  */
	vpor	%ymm5, %ymm4, %ymm6

	/* sR = (|x| - sN*Log2_hi) - sN*Log2_lo */
	vfnmadd231ps _sLn2lo+__svml_scosh_data_internal(%rip), %ymm9, %ymm0

	/* sG1 = 2^(N-1)-2^(-N-1) */
	vsubps	%ymm14, %ymm13, %ymm4

	/* sG2 = 2^(N-1)+2^(-N-1) */
	vaddps	%ymm14, %ymm13, %ymm3

	/* sR2 = sR^2 */
	vmulps	%ymm0, %ymm0, %ymm2

	/* ymm15 = a3 + sR2*a5 */
	vfmadd213ps _sPC3+__svml_scosh_data_internal(%rip), %ymm2, %ymm15

	/* ymm13 = sR2*(a3 + sR2*a5) */
	vmulps	%ymm15, %ymm2, %ymm13

	/* sSinh_r = sR + sR*(sR2*(a3 + sR2*a5)) */
	vfmadd213ps %ymm0, %ymm0, %ymm13

	/*
	 * cosh(X) = sG2 + sG1*sinh(sR) + sG2*sR2*(a2+sR2*(a4+a6*sR2))
	 * sOut = a4 + a6*sR2   (ymm0 is reused; sR is no longer needed)
	 */
	vmovups	_sPC6+__svml_scosh_data_internal(%rip), %ymm0
	vfmadd213ps _sPC4+__svml_scosh_data_internal(%rip), %ymm2, %ymm0

	/* sOut = a2+sR2*(a4+a6*sR2) */
	vfmadd213ps _sPC2+__svml_scosh_data_internal(%rip), %ymm2, %ymm0

	/* sOut = sR2*(a2+sR2*(a4+a6*sR2)) */
	vmulps	%ymm0, %ymm2, %ymm15

	/* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
	vmulps	%ymm15, %ymm3, %ymm14

	/* sOut = sG1*sinh(sR)+sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
	vfmadd213ps %ymm14, %ymm13, %ymm4

	/* edx = one bit per lane that needs the scalar fallback.  */
	vmovmskps %ymm6, %edx

	/* sOut = sG2 + sG1*sinh(sR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)) */
	vaddps	%ymm4, %ymm3, %ymm0

	/* Any out-of-range lane?  */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1

	/* Tear down the frame and return the result in ymm0.  */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* The code below still runs inside the rbp frame, so re-state
	   the CFA rules that were in effect before the epilogue.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/*
	 * Special-value path: spill the input and the vector result,
	 * then walk the 8-bit lane mask and recompute each flagged lane
	 * with scalar coshf.
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%ymm1, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear the upper YMM halves before scalar code is called.  */
	vzeroupper

	/* r12, r13 and r14 carry the loop state across the calls, so
	   their incoming values are saved in the frame first.  */
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* r12d = lane index, starting at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* r13d = lane mask produced by vmovmskps.  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* CF = bit r12d of the lane mask.  */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Advance to the next lane; stop after lane 7.  */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes handled: restore the saved registers and reload
	   the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
	/* The scalar-call code below is reached with r12-r14 still
	   saved in the frame, so re-state their CFI expressions.  */
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/*
	 * Recompute one lane: load input element r12d from the spilled
	 * input, call coshf, and store the result over the same lane of
	 * the spilled result vector.
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends, so r14 is a valid 64-bit index.  */
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0
	call	coshf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 64(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_coshf_avx2)

/* Constant table read by _ZGVdN8v_coshf_avx2.  Each row is one 32-bit
   value replicated across eight lanes and aligned to 32 bytes; the
   row order must match the _s* / _i* offset macros at the top of the
   file.  The typedef under the #ifdef only describes the layout and is
   not assembled unless that macro is defined.  */
	.section .rodata, "a"
	.align	32

#ifdef __svml_scosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 _sInvLn2[8][1];
	__declspec(align(32)) VUINT32 _sLn2hi[8][1];
	__declspec(align(32)) VUINT32 _sLn2lo[8][1];
	__declspec(align(32)) VUINT32 _sSign[8][1];
	__declspec(align(32)) VUINT32 _sShifter[8][1];
	__declspec(align(32)) VUINT32 _iDomainRange[8][1];
	__declspec(align(32)) VUINT32 _sPC1[8][1];
	__declspec(align(32)) VUINT32 _sPC2[8][1];
	__declspec(align(32)) VUINT32 _sPC3[8][1];
	__declspec(align(32)) VUINT32 _sPC4[8][1];
	__declspec(align(32)) VUINT32 _sPC5[8][1];
	__declspec(align(32)) VUINT32 _sPC6[8][1];
	__declspec(align(32)) VUINT32 _iHalf[8][1];
} __svml_scosh_data_internal;
#endif
__svml_scosh_data_internal:
	/* 1/ln(2) ~= 1.442695 */
	.long	0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
	.align	32
	/* High part of ln(2) = 0.693115234375 */
	.long	0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
	.align	32
	/* Low part of ln(2) ~= 3.1946e-5, so that hi + lo ~= ln(2) */
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
	.align	32
	/* Sign-bit mask, cleared from the input to form |x| */
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
	.align	32
	/* Right-shifter 1.5 * 2^23 = 12582912.0 */
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	.align	32
	/* ~= 87.3365; lanes with |x| at or above this use scalar coshf */
	.long	0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
	.align	32
	/* a1 = 1.0 (not loaded by the AVX2 kernel in this file) */
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
	.align	32
	/* a2 = 0.5 */
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
	.align	32
	/* a3, close to 1/6 */
	.long	0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
	.align	32
	/* a4, close to 1/24 */
	.long	0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
	.align	32
	/* a5, close to 1/120 */
	.long	0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
	.align	32
	/* a6, close to 1/720 */
	.long	0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
	// Integer constants
	.align	32
	/* Bit pattern of 0.5f, used with integer add/sub to build 2^(+-N-1) */
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
	.align	32
	.type	__svml_scosh_data_internal, @object
	.size	__svml_scosh_data_internal, .-__svml_scosh_data_internal


/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S */