1/* Function coshf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
30 * cosh(0) = 1
31 * cosh(x) overflows for big x and returns MAXLOG+log(2)
32 *
33 */
34
35/* Offsets for data table __svml_scosh_data_internal
36 */
37#define _sExp_tbl_PH 0
38#define _sExp_tbl_NH 128
39#define _sShifter_UISA 256
40#define _iDomainRange_UISA 320
41#define _sPC1_UISA 384
42#define _sPC2_UISA 448
43#define _sPC3_UISA 512
44#define _sInvLn2 576
45#define _sLn2hi 640
46#define _sLn2lo 704
47#define _sSign 768
48#define _iExpMask 832
49#define _sShifter 896
50#define _iDomainRange 960
51#define _sPC1 1024
52#define _sPC2 1088
53#define _sPC3 1152
54
55#include <sysdep.h>
56
57 .section .text.evex512, "ax", @progbits
58ENTRY(_ZGVeN16v_coshf_skx)
59 pushq %rbp
60 cfi_def_cfa_offset(16)
61 movq %rsp, %rbp
62 cfi_def_cfa(6, 16)
63 cfi_offset(6, -16)
64 andq $-64, %rsp
65 subq $192, %rsp
66 vmovups _sSign+__svml_scosh_data_internal(%rip), %zmm4
67 vmovups _sShifter_UISA+__svml_scosh_data_internal(%rip), %zmm6
68
69 /*
70 * Load argument
71 * dM = x/log(2) + RShifter
72 */
73 vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %zmm10
74 vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %zmm7
75 vmovups _sLn2lo+__svml_scosh_data_internal(%rip), %zmm9
76
77 /* */
78 vmovups _sPC3_UISA+__svml_scosh_data_internal(%rip), %zmm2
79
80 /* x^2 */
81 vmovups _sPC2_UISA+__svml_scosh_data_internal(%rip), %zmm3
82
83 /* G1, G2 2^N, 2^(-N) */
84 vmovups __svml_scosh_data_internal(%rip), %zmm12
85 vmovups _sExp_tbl_NH+__svml_scosh_data_internal(%rip), %zmm13
86
87 /*
88 * Implementation
89 * Abs argument
90 */
91 vandnps %zmm0, %zmm4, %zmm1
92
93 /* Check for overflow\underflow */
94 vpternlogd $255, %zmm5, %zmm5, %zmm5
95 vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm10
96 vpcmpd $1, _iDomainRange_UISA+__svml_scosh_data_internal(%rip), %zmm1, %k1
97
98 /* iM now is an EXP(2^N) */
99 vpslld $18, %zmm10, %zmm11
100
101 /*
102 * R
103 * sN = sM - RShifter
104 */
105 vsubps {rn-sae}, %zmm6, %zmm10, %zmm8
106 vpermt2ps _sExp_tbl_PH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm12
107 vpermt2ps _sExp_tbl_NH+64+__svml_scosh_data_internal(%rip), %zmm10, %zmm13
108 vpandnd %zmm1, %zmm1, %zmm5{%k1}
109
110 /* sR = sX - sN*Log2_hi */
111 vfnmadd231ps {rn-sae}, %zmm7, %zmm8, %zmm1
112 vptestmd %zmm5, %zmm5, %k0
113
114 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
115 vfnmadd231ps {rn-sae}, %zmm9, %zmm8, %zmm1
116 kmovw %k0, %edx
117 vmulps {rn-sae}, %zmm1, %zmm1, %zmm4
118 vmulps {rn-sae}, %zmm4, %zmm2, %zmm2
119
120 /* sSinh_r = r + r*(r^2*(a3)) */
121 vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm2
122
123 /* sOut = r^2*(a2) */
124 vmulps {rn-sae}, %zmm4, %zmm3, %zmm1
125 vpandd _iExpMask+__svml_scosh_data_internal(%rip), %zmm11, %zmm14
126 vpaddd %zmm14, %zmm12, %zmm15
127 vpsubd %zmm14, %zmm13, %zmm10
128
129 /* sG2 = 2^N*Th + 2^(-N)*T_h */
130 vaddps {rn-sae}, %zmm10, %zmm15, %zmm5
131
132 /* sG1 = 2^N*Th - 2^(-N)*T_h */
133 vsubps {rn-sae}, %zmm10, %zmm15, %zmm6
134
135 /* res = sG1*(r + r*(r^2*(a3))) + sG2*(1+r^2*(a2)) */
136 vfmadd213ps {rn-sae}, %zmm5, %zmm5, %zmm1
137 vfmadd213ps {rn-sae}, %zmm1, %zmm2, %zmm6
138 testl %edx, %edx
139
140 /* Go to special inputs processing branch */
141 jne L(SPECIAL_VALUES_BRANCH)
142 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
143
144 /* Restore registers
145 * and exit the function
146 */
147
148L(EXIT):
149 vmovaps %zmm6, %zmm0
150 movq %rbp, %rsp
151 popq %rbp
152 cfi_def_cfa(7, 8)
153 cfi_restore(6)
154 ret
155 cfi_def_cfa(6, 16)
156 cfi_offset(6, -16)
157
158 /* Branch to process
159 * special inputs
160 */
161
162L(SPECIAL_VALUES_BRANCH):
163 vmovups %zmm0, 64(%rsp)
164 vmovups %zmm6, 128(%rsp)
165 # LOE rbx r12 r13 r14 r15 edx zmm6
166
167 xorl %eax, %eax
168 # LOE rbx r12 r13 r14 r15 eax edx
169
170 vzeroupper
171 movq %r12, 16(%rsp)
172 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
173 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
174 movl %eax, %r12d
175 movq %r13, 8(%rsp)
176 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
177 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
178 movl %edx, %r13d
179 movq %r14, (%rsp)
180 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
181 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
182 # LOE rbx r15 r12d r13d
183
184 /* Range mask
185 * bits check
186 */
187
188L(RANGEMASK_CHECK):
189 btl %r12d, %r13d
190
191 /* Call scalar math function */
192 jc L(SCALAR_MATH_CALL)
193 # LOE rbx r15 r12d r13d
194
195 /* Special inputs
196 * processing loop
197 */
198
199L(SPECIAL_VALUES_LOOP):
200 incl %r12d
201 cmpl $16, %r12d
202
203 /* Check bits in range mask */
204 jl L(RANGEMASK_CHECK)
205 # LOE rbx r15 r12d r13d
206
207 movq 16(%rsp), %r12
208 cfi_restore(12)
209 movq 8(%rsp), %r13
210 cfi_restore(13)
211 movq (%rsp), %r14
212 cfi_restore(14)
213 vmovups 128(%rsp), %zmm6
214
215 /* Go to exit */
216 jmp L(EXIT)
217 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
218 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
219 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
220 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
221 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
222 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
223 # LOE rbx r12 r13 r14 r15 zmm6
224
225 /* Scalar math function call
226 * to process special input
227 */
228
229L(SCALAR_MATH_CALL):
230 movl %r12d, %r14d
231 vmovss 64(%rsp, %r14, 4), %xmm0
232 call coshf@PLT
233 # LOE rbx r14 r15 r12d r13d xmm0
234
235 vmovss %xmm0, 128(%rsp, %r14, 4)
236
237 /* Process special inputs in loop */
238 jmp L(SPECIAL_VALUES_LOOP)
239 # LOE rbx r15 r12d r13d
240END(_ZGVeN16v_coshf_skx)
241
242 .section .rodata, "a"
243 .align 64
244
245#ifdef __svml_scosh_data_internal_typedef
246typedef unsigned int VUINT32;
247typedef struct {
248 __declspec(align(64)) VUINT32 _sExp_tbl_PH[32][1];
249 __declspec(align(64)) VUINT32 _sExp_tbl_NH[32][1];
250 __declspec(align(64)) VUINT32 _sShifter_UISA[16][1];
251 __declspec(align(64)) VUINT32 _iDomainRange_UISA[16][1];
252 __declspec(align(64)) VUINT32 _sPC1_UISA[16][1];
253 __declspec(align(64)) VUINT32 _sPC2_UISA[16][1];
254 __declspec(align(64)) VUINT32 _sPC3_UISA[16][1];
255 __declspec(align(64)) VUINT32 _sInvLn2[16][1];
256 __declspec(align(64)) VUINT32 _sLn2hi[16][1];
257 __declspec(align(64)) VUINT32 _sLn2lo[16][1];
258 __declspec(align(64)) VUINT32 _sSign[16][1];
259 __declspec(align(64)) VUINT32 _iExpMask[16][1];
260 __declspec(align(64)) VUINT32 _sShifter[16][1];
261 __declspec(align(64)) VUINT32 _iDomainRange[16][1];
262 __declspec(align(64)) VUINT32 _sPC1[16][1];
263 __declspec(align(64)) VUINT32 _sPC2[16][1];
264 __declspec(align(64)) VUINT32 _sPC3[16][1];
265} __svml_scosh_data_internal;
266#endif
267__svml_scosh_data_internal:
268 /* _sExp_tbl_PH 2^(i/32-1), i=0..31 */
269 .long 0x3f000000, 0x3f02cd87, 0x3f05aac3, 0x3f08980f
270 .long 0x3f0b95c2, 0x3f0ea43a, 0x3f11c3d3, 0x3f14f4f0
271 .long 0x3f1837f0, 0x3f1b8d3a, 0x3f1ef532, 0x3f227043
272 .long 0x3f25fed7, 0x3f29a15b, 0x3f2d583f, 0x3f3123f6
273 .long 0x3f3504f3, 0x3f38fbaf, 0x3f3d08a4, 0x3f412c4d
274 .long 0x3f45672a, 0x3f49b9be, 0x3f4e248c, 0x3f52a81e
275 .long 0x3f5744fd, 0x3f5bfbb8, 0x3f60ccdf, 0x3f65b907
276 .long 0x3f6ac0c7, 0x3f6fe4ba, 0x3f75257d, 0x3f7a83b3
277 /* _sExp_tbl_NH 2^(-i/32-1), i=0..31 */
278 .align 64
279 .long 0x3f000000, 0x3efa83b3, 0x3ef5257d, 0x3eefe4ba
280 .long 0x3eeac0c7, 0x3ee5b907, 0x3ee0ccdf, 0x3edbfbb8
281 .long 0x3ed744fd, 0x3ed2a81e, 0x3ece248c, 0x3ec9b9be
282 .long 0x3ec5672a, 0x3ec12c4d, 0x3ebd08a4, 0x3eb8fbaf
283 .long 0x3eb504f3, 0x3eb123f6, 0x3ead583f, 0x3ea9a15b
284 .long 0x3ea5fed7, 0x3ea27043, 0x3e9ef532, 0x3e9b8d3a
285 .long 0x3e9837f0, 0x3e94f4f0, 0x3e91c3d3, 0x3e8ea43a
286 .long 0x3e8b95c2, 0x3e88980f, 0x3e85aac3, 0x3e82cd87
287 .align 64
288 .long 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000, 0x48c00000 /* 1.5*2^18 _sShifter_UISA */
289 .align 64
290 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange_UISA */
291 .align 64
292 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1_UISA=1 */
293 .align 64
294 .long 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f, 0x3f00010f /* _sPC2_UISA */
295 .align 64
296 .long 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd, 0x3e2aaacd /* _sPC3_UISA */
297 .align 64
298 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
299 .align 64
300 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
301 .align 64
302 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
303 .align 64
304 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
305 .align 64
306 .long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 /* _iExpMask */
307 .align 64
308 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
309 .align 64
310 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
311 .align 64
312 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
313 .align 64
314 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
315 .align 64
316 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
317 .align 64
318 .type __svml_scosh_data_internal, @object
319 .size __svml_scosh_data_internal, .-__svml_scosh_data_internal
320

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S