1/* Function cosh vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
25 *
26 * Special cases:
27 *
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
30 * cosh(0) = 1
31 * cosh(x) overflows for big x and returns MAXLOG+log(2)
32 *
33 */
34
35/* Offsets for data table __svml_dcosh_data_internal
36 */
37#define _dTp_h 0
38#define _dTn_h 128
39#define _dbShifter_UISA 256
40#define _dPC2_UISA 320
41#define _dPC3_UISA 384
42#define _dPC4_UISA 448
43#define _dPC5_UISA 512
44#define _dPC6_UISA 576
45#define _dPC7_UISA 640
46#define _dbInvLn2 704
47#define _dbLn2hi 768
48#define _dbLn2lo 832
49#define _dbShifter 896
50#define _dPC2 960
51#define _dPC3 1024
52#define _dPC4 1088
53#define _lExpMask 1152
54#define _dSign 1216
55#define _iDomainRange 1280
56
57#include <sysdep.h>
58
59 .section .text.evex512, "ax", @progbits
60ENTRY(_ZGVeN8v_cosh_skx)
61 pushq %rbp
62 cfi_def_cfa_offset(16)
63 movq %rsp, %rbp
64 cfi_def_cfa(6, 16)
65 cfi_offset(6, -16)
66 andq $-64, %rsp
67 subq $192, %rsp
68 vmovups _dSign+__svml_dcosh_data_internal(%rip), %zmm11
69 vmovups _dbShifter_UISA+__svml_dcosh_data_internal(%rip), %zmm15
70
71 /*
72 * Load argument
73 * dM = x*2^K/log(2) + RShifter
74 */
75 vmovups _dbInvLn2+__svml_dcosh_data_internal(%rip), %zmm4
76 vmovups _dbLn2hi+__svml_dcosh_data_internal(%rip), %zmm2
77 vmovups _dbLn2lo+__svml_dcosh_data_internal(%rip), %zmm3
78 vmovups _dPC7_UISA+__svml_dcosh_data_internal(%rip), %zmm8
79 vmovups _dPC6_UISA+__svml_dcosh_data_internal(%rip), %zmm9
80 vmovups _dPC2_UISA+__svml_dcosh_data_internal(%rip), %zmm7
81 vmovups _dPC3_UISA+__svml_dcosh_data_internal(%rip), %zmm6
82 vmovaps %zmm0, %zmm10
83
84 /* Abs argument */
85 vandnpd %zmm10, %zmm11, %zmm5
86
87 /* Index and lookup */
88 vmovups __svml_dcosh_data_internal(%rip), %zmm11
89 vmovups _dTn_h+__svml_dcosh_data_internal(%rip), %zmm0
90 vfmadd213pd {rn-sae}, %zmm15, %zmm5, %zmm4
91
92 /*
93 * Check for overflow\underflow
94 *
95 */
96 vpsrlq $32, %zmm5, %zmm12
97
98 /* dN = dM - RShifter */
99 vsubpd {rn-sae}, %zmm15, %zmm4, %zmm1
100 vpmovqd %zmm12, %ymm13
101 vpermt2pd _dTn_h+64+__svml_dcosh_data_internal(%rip), %zmm4, %zmm0
102 vpermt2pd _dTp_h+64+__svml_dcosh_data_internal(%rip), %zmm4, %zmm11
103
104 /* dR = dX - dN*Log2_hi/2^K */
105 vfnmadd231pd {rn-sae}, %zmm2, %zmm1, %zmm5
106
107 /*
108 * poly(r) = Gmjp(1 + a2*r^2 + a4*r^4) + Gmjn*(r+ a3*r^3 +a5*r^5) =
109 * = Gmjp_h +Gmjp_l+ Gmjp*r^2*(a2 + a4*r^2) + Gmjn*(r+ r^3*(a3 +a5*r^2)
110 */
111 vmovups _dPC5_UISA+__svml_dcosh_data_internal(%rip), %zmm12
112 vpsllq $48, %zmm4, %zmm2
113
114 /* dR = dX - dN*Log2_hi/2^K */
115 vfnmadd231pd {rn-sae}, %zmm3, %zmm1, %zmm5
116 vmulpd {rn-sae}, %zmm5, %zmm5, %zmm1
117 vfmadd231pd {rn-sae}, %zmm1, %zmm8, %zmm12
118 vmovups _dPC4_UISA+__svml_dcosh_data_internal(%rip), %zmm8
119 vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm12
120 vfmadd231pd {rn-sae}, %zmm1, %zmm9, %zmm8
121 vfmadd213pd {rn-sae}, %zmm7, %zmm1, %zmm8
122 vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %ymm13, %ymm14
123 vmovmskps %ymm14, %edx
124
125 /* dOut=r^2*(a2 + a4*r^2) */
126 vmulpd {rn-sae}, %zmm1, %zmm8, %zmm6
127
128 /* lM now is an EXP(2^N) */
129 vpandq _lExpMask+__svml_dcosh_data_internal(%rip), %zmm2, %zmm3
130 vpaddq %zmm3, %zmm11, %zmm4
131 vpsubq %zmm3, %zmm0, %zmm0
132 vsubpd {rn-sae}, %zmm0, %zmm4, %zmm14
133 vaddpd {rn-sae}, %zmm0, %zmm4, %zmm13
134
135 /* dM=r^2*(a3 +a5*r^2) */
136 vmulpd {rn-sae}, %zmm1, %zmm12, %zmm0
137 vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm6
138
139 /* dM= r + r^3*(a3 +a5*r^2) */
140 vfmadd213pd {rn-sae}, %zmm5, %zmm5, %zmm0
141 vfmadd213pd {rn-sae}, %zmm6, %zmm14, %zmm0
142 testl %edx, %edx
143
144 /* Go to special inputs processing branch */
145 jne L(SPECIAL_VALUES_BRANCH)
146 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm10
147
148 /* Restore registers
149 * and exit the function
150 */
151
152L(EXIT):
153 movq %rbp, %rsp
154 popq %rbp
155 cfi_def_cfa(7, 8)
156 cfi_restore(6)
157 ret
158 cfi_def_cfa(6, 16)
159 cfi_offset(6, -16)
160
161 /* Branch to process
162 * special inputs
163 */
164
165L(SPECIAL_VALUES_BRANCH):
166 vmovups %zmm10, 64(%rsp)
167 vmovups %zmm0, 128(%rsp)
168 # LOE rbx r12 r13 r14 r15 edx zmm0
169
170 xorl %eax, %eax
171 # LOE rbx r12 r13 r14 r15 eax edx
172
173 vzeroupper
174 movq %r12, 16(%rsp)
175 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
176 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
177 movl %eax, %r12d
178 movq %r13, 8(%rsp)
179 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
180 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
181 movl %edx, %r13d
182 movq %r14, (%rsp)
183 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
184 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
185 # LOE rbx r15 r12d r13d
186
187 /* Range mask
188 * bits check
189 */
190
191L(RANGEMASK_CHECK):
192 btl %r12d, %r13d
193
194 /* Call scalar math function */
195 jc L(SCALAR_MATH_CALL)
196 # LOE rbx r15 r12d r13d
197
198 /* Special inputs
199 * processing loop
200 */
201
202L(SPECIAL_VALUES_LOOP):
203 incl %r12d
204 cmpl $8, %r12d
205
206 /* Check bits in range mask */
207 jl L(RANGEMASK_CHECK)
208 # LOE rbx r15 r12d r13d
209
210 movq 16(%rsp), %r12
211 cfi_restore(12)
212 movq 8(%rsp), %r13
213 cfi_restore(13)
214 movq (%rsp), %r14
215 cfi_restore(14)
216 vmovups 128(%rsp), %zmm0
217
218 /* Go to exit */
219 jmp L(EXIT)
220 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
221 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
222 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
223 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
224 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
225 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
226 # LOE rbx r12 r13 r14 r15 zmm0
227
228 /* Scalar math function call
229 * to process special input
230 */
231
232L(SCALAR_MATH_CALL):
233 movl %r12d, %r14d
234 vmovsd 64(%rsp, %r14, 8), %xmm0
235 call cosh@PLT
236 # LOE rbx r14 r15 r12d r13d xmm0
237
238 vmovsd %xmm0, 128(%rsp, %r14, 8)
239
240 /* Process special inputs in loop */
241 jmp L(SPECIAL_VALUES_LOOP)
242 # LOE rbx r15 r12d r13d
243END(_ZGVeN8v_cosh_skx)
244
245 .section .rodata, "a"
246 .align 64
247
248#ifdef __svml_dcosh_data_internal_typedef
249typedef unsigned int VUINT32;
250typedef struct {
251 __declspec(align(64)) VUINT32 _dTp_h[(1<<4)][2];
252 __declspec(align(64)) VUINT32 _dTn_h[(1<<4)][2];
253 __declspec(align(64)) VUINT32 _dbShifter_UISA[8][2];
254 __declspec(align(64)) VUINT32 _dPC2_UISA[8][2];
255 __declspec(align(64)) VUINT32 _dPC3_UISA[8][2];
256 __declspec(align(64)) VUINT32 _dPC4_UISA[8][2];
257 __declspec(align(64)) VUINT32 _dPC5_UISA[8][2];
258 __declspec(align(64)) VUINT32 _dPC6_UISA[8][2];
259 __declspec(align(64)) VUINT32 _dPC7_UISA[8][2];
260 __declspec(align(64)) VUINT32 _dbInvLn2[8][2];
261 __declspec(align(64)) VUINT32 _dbLn2hi[8][2];
262 __declspec(align(64)) VUINT32 _dbLn2lo[8][2];
263 __declspec(align(64)) VUINT32 _dbShifter[8][2];
264 __declspec(align(64)) VUINT32 _dPC2[8][2];
265 __declspec(align(64)) VUINT32 _dPC3[8][2];
266 __declspec(align(64)) VUINT32 _dPC4[8][2];
267 __declspec(align(64)) VUINT32 _lExpMask[8][2];
268 __declspec(align(64)) VUINT32 _dSign[8][2]; // 0x8000000000000000
269 __declspec(align(64)) VUINT32 _iDomainRange[16][1];
270} __svml_dcosh_data_internal;
271#endif
272__svml_dcosh_data_internal:
273 /* _dTp_h */
274 .quad 0x3fe0000000000000, 0x3fe0b5586cf9890f, 0x3fe172b83c7d517b, 0x3fe2387a6e756238
275 .quad 0x3fe306fe0a31b715, 0x3fe3dea64c123422, 0x3fe4bfdad5362a27, 0x3fe5ab07dd485429
276 .quad 0x3fe6a09e667f3bcd, 0x3fe7a11473eb0187, 0x3fe8ace5422aa0db, 0x3fe9c49182a3f090
277 .quad 0x3feae89f995ad3ad, 0x3fec199bdd85529c, 0x3fed5818dcfba487, 0x3feea4afa2a490da
278 /* dTn_h */
279 .align 64
280 .quad 0x3fe0000000000000, 0x3fdea4afa2a490da, 0x3fdd5818dcfba487, 0x3fdc199bdd85529c
281 .quad 0x3fdae89f995ad3ad, 0x3fd9c49182a3f090, 0x3fd8ace5422aa0db, 0x3fd7a11473eb0187
282 .quad 0x3fd6a09e667f3bcd, 0x3fd5ab07dd485429, 0x3fd4bfdad5362a27, 0x3fd3dea64c123422
283 .quad 0x3fd306fe0a31b715, 0x3fd2387a6e756238, 0x3fd172b83c7d517b, 0x3fd0b5586cf9890f
284 .align 64
285 .quad 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000 /* _dbShifter_UISA */
286 .align 64
287 .quad 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004 /* _dPC2_UISA */
288 .align 64
289 .quad 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543 /* _dPC3_UISA */
290 .align 64
291 .quad 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37 /* _dPC4_UISA */
292 .align 64
293 .quad 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c /* _dPC5_UISA */
294 .align 64
295 .quad 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116 /* _dPC6_UISA */
296 .align 64
297 .quad 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da /* _dPC7_UISA */
298 /* _dbT */
299 .align 64
300 .quad 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */
301 .align 64
302 .quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi */
303 .align 64
304 .quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo */
305 .align 64
306 .quad 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter */
307 .align 64
308 .quad 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD /* _dPC2 */
309 .align 64
310 .quad 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14 /* _dPC3 */
311 .align 64
312 .quad 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299 /* _dPC4 */
313 .align 64
314 .quad 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask */
315 .align 64
316 .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign */
317 .align 64
318 .long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp */
319 .align 64
320 .type __svml_dcosh_data_internal, @object
321 .size __svml_dcosh_data_internal, .-__svml_dcosh_data_internal
322

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S