1 | /* Function cosh vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * Compute cosh(x) as (exp(x)+exp(-x))/2, |
23 | * where exp is calculated as |
24 | * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r) |
25 | * |
26 | * Special cases: |
27 | * |
28 | * cosh(NaN) = quiet NaN, and raise invalid exception |
29 | * cosh(INF) = that INF |
30 | * cosh(0) = 1 |
31 | * cosh(x) overflows for big x and returns MAXLOG+log(2) |
32 | * |
33 | */ |
34 | |
35 | /* Offsets for data table __svml_dcosh_data_internal |
36 | */ |
37 | #define _dTp_h 0 |
38 | #define _dTn_h 128 |
39 | #define _dbShifter_UISA 256 |
40 | #define _dPC2_UISA 320 |
41 | #define _dPC3_UISA 384 |
42 | #define _dPC4_UISA 448 |
43 | #define _dPC5_UISA 512 |
44 | #define _dPC6_UISA 576 |
45 | #define _dPC7_UISA 640 |
46 | #define _dbInvLn2 704 |
47 | #define _dbLn2hi 768 |
48 | #define _dbLn2lo 832 |
49 | #define _dbShifter 896 |
50 | #define _dPC2 960 |
51 | #define _dPC3 1024 |
52 | #define _dPC4 1088 |
53 | #define _lExpMask 1152 |
54 | #define _dSign 1216 |
55 | #define _iDomainRange 1280 |
56 | |
57 | #include <sysdep.h> |
58 | |
59 | .section .text.evex512, "ax" , @progbits |
60 | ENTRY(_ZGVeN8v_cosh_skx) |
61 | pushq %rbp |
62 | cfi_def_cfa_offset(16) |
63 | movq %rsp, %rbp |
64 | cfi_def_cfa(6, 16) |
65 | cfi_offset(6, -16) |
66 | andq $-64, %rsp |
67 | subq $192, %rsp |
68 | vmovups _dSign+__svml_dcosh_data_internal(%rip), %zmm11 |
69 | vmovups _dbShifter_UISA+__svml_dcosh_data_internal(%rip), %zmm15 |
70 | |
71 | /* |
72 | * Load argument |
73 | * dM = x*2^K/log(2) + RShifter |
74 | */ |
75 | vmovups _dbInvLn2+__svml_dcosh_data_internal(%rip), %zmm4 |
76 | vmovups _dbLn2hi+__svml_dcosh_data_internal(%rip), %zmm2 |
77 | vmovups _dbLn2lo+__svml_dcosh_data_internal(%rip), %zmm3 |
78 | vmovups _dPC7_UISA+__svml_dcosh_data_internal(%rip), %zmm8 |
79 | vmovups _dPC6_UISA+__svml_dcosh_data_internal(%rip), %zmm9 |
80 | vmovups _dPC2_UISA+__svml_dcosh_data_internal(%rip), %zmm7 |
81 | vmovups _dPC3_UISA+__svml_dcosh_data_internal(%rip), %zmm6 |
82 | vmovaps %zmm0, %zmm10 |
83 | |
84 | /* Abs argument */ |
85 | vandnpd %zmm10, %zmm11, %zmm5 |
86 | |
87 | /* Index and lookup */ |
88 | vmovups __svml_dcosh_data_internal(%rip), %zmm11 |
89 | vmovups _dTn_h+__svml_dcosh_data_internal(%rip), %zmm0 |
90 | vfmadd213pd {rn-sae}, %zmm15, %zmm5, %zmm4 |
91 | |
92 | /* |
93 | * Check for overflow\underflow |
94 | * |
95 | */ |
96 | vpsrlq $32, %zmm5, %zmm12 |
97 | |
98 | /* dN = dM - RShifter */ |
99 | vsubpd {rn-sae}, %zmm15, %zmm4, %zmm1 |
100 | vpmovqd %zmm12, %ymm13 |
101 | vpermt2pd _dTn_h+64+__svml_dcosh_data_internal(%rip), %zmm4, %zmm0 |
102 | vpermt2pd _dTp_h+64+__svml_dcosh_data_internal(%rip), %zmm4, %zmm11 |
103 | |
104 | /* dR = dX - dN*Log2_hi/2^K */ |
105 | vfnmadd231pd {rn-sae}, %zmm2, %zmm1, %zmm5 |
106 | |
107 | /* |
108 | * poly(r) = Gmjp(1 + a2*r^2 + a4*r^4) + Gmjn*(r+ a3*r^3 +a5*r^5) = |
109 | * = Gmjp_h +Gmjp_l+ Gmjp*r^2*(a2 + a4*r^2) + Gmjn*(r+ r^3*(a3 +a5*r^2) |
110 | */ |
111 | vmovups _dPC5_UISA+__svml_dcosh_data_internal(%rip), %zmm12 |
112 | vpsllq $48, %zmm4, %zmm2 |
113 | |
114 | /* dR = dX - dN*Log2_hi/2^K */ |
115 | vfnmadd231pd {rn-sae}, %zmm3, %zmm1, %zmm5 |
116 | vmulpd {rn-sae}, %zmm5, %zmm5, %zmm1 |
117 | vfmadd231pd {rn-sae}, %zmm1, %zmm8, %zmm12 |
118 | vmovups _dPC4_UISA+__svml_dcosh_data_internal(%rip), %zmm8 |
119 | vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm12 |
120 | vfmadd231pd {rn-sae}, %zmm1, %zmm9, %zmm8 |
121 | vfmadd213pd {rn-sae}, %zmm7, %zmm1, %zmm8 |
122 | vpcmpgtd _iDomainRange+__svml_dcosh_data_internal(%rip), %ymm13, %ymm14 |
123 | vmovmskps %ymm14, %edx |
124 | |
125 | /* dOut=r^2*(a2 + a4*r^2) */ |
126 | vmulpd {rn-sae}, %zmm1, %zmm8, %zmm6 |
127 | |
128 | /* lM now is an EXP(2^N) */ |
129 | vpandq _lExpMask+__svml_dcosh_data_internal(%rip), %zmm2, %zmm3 |
130 | vpaddq %zmm3, %zmm11, %zmm4 |
131 | vpsubq %zmm3, %zmm0, %zmm0 |
132 | vsubpd {rn-sae}, %zmm0, %zmm4, %zmm14 |
133 | vaddpd {rn-sae}, %zmm0, %zmm4, %zmm13 |
134 | |
135 | /* dM=r^2*(a3 +a5*r^2) */ |
136 | vmulpd {rn-sae}, %zmm1, %zmm12, %zmm0 |
137 | vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm6 |
138 | |
139 | /* dM= r + r^3*(a3 +a5*r^2) */ |
140 | vfmadd213pd {rn-sae}, %zmm5, %zmm5, %zmm0 |
141 | vfmadd213pd {rn-sae}, %zmm6, %zmm14, %zmm0 |
142 | testl %edx, %edx |
143 | |
144 | /* Go to special inputs processing branch */ |
145 | jne L(SPECIAL_VALUES_BRANCH) |
146 | # LOE rbx r12 r13 r14 r15 edx zmm0 zmm10 |
147 | |
148 | /* Restore registers |
149 | * and exit the function |
150 | */ |
151 | |
152 | L(EXIT): |
153 | movq %rbp, %rsp |
154 | popq %rbp |
155 | cfi_def_cfa(7, 8) |
156 | cfi_restore(6) |
157 | ret |
158 | cfi_def_cfa(6, 16) |
159 | cfi_offset(6, -16) |
160 | |
161 | /* Branch to process |
162 | * special inputs |
163 | */ |
164 | |
165 | L(SPECIAL_VALUES_BRANCH): |
166 | vmovups %zmm10, 64(%rsp) |
167 | vmovups %zmm0, 128(%rsp) |
168 | # LOE rbx r12 r13 r14 r15 edx zmm0 |
169 | |
170 | xorl %eax, %eax |
171 | # LOE rbx r12 r13 r14 r15 eax edx |
172 | |
173 | vzeroupper |
174 | movq %r12, 16(%rsp) |
175 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ |
176 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 |
177 | movl %eax, %r12d |
178 | movq %r13, 8(%rsp) |
179 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ |
180 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 |
181 | movl %edx, %r13d |
182 | movq %r14, (%rsp) |
183 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ |
184 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 |
185 | # LOE rbx r15 r12d r13d |
186 | |
187 | /* Range mask |
188 | * bits check |
189 | */ |
190 | |
191 | L(RANGEMASK_CHECK): |
192 | btl %r12d, %r13d |
193 | |
194 | /* Call scalar math function */ |
195 | jc L(SCALAR_MATH_CALL) |
196 | # LOE rbx r15 r12d r13d |
197 | |
198 | /* Special inputs |
199 | * processing loop |
200 | */ |
201 | |
202 | L(SPECIAL_VALUES_LOOP): |
203 | incl %r12d |
204 | cmpl $8, %r12d |
205 | |
206 | /* Check bits in range mask */ |
207 | jl L(RANGEMASK_CHECK) |
208 | # LOE rbx r15 r12d r13d |
209 | |
210 | movq 16(%rsp), %r12 |
211 | cfi_restore(12) |
212 | movq 8(%rsp), %r13 |
213 | cfi_restore(13) |
214 | movq (%rsp), %r14 |
215 | cfi_restore(14) |
216 | vmovups 128(%rsp), %zmm0 |
217 | |
218 | /* Go to exit */ |
219 | jmp L(EXIT) |
220 | /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ |
221 | .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 |
222 | /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ |
223 | .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 |
224 | /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ |
225 | .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 |
226 | # LOE rbx r12 r13 r14 r15 zmm0 |
227 | |
228 | /* Scalar math function call |
229 | * to process special input |
230 | */ |
231 | |
232 | L(SCALAR_MATH_CALL): |
233 | movl %r12d, %r14d |
234 | vmovsd 64(%rsp, %r14, 8), %xmm0 |
235 | call cosh@PLT |
236 | # LOE rbx r14 r15 r12d r13d xmm0 |
237 | |
238 | vmovsd %xmm0, 128(%rsp, %r14, 8) |
239 | |
240 | /* Process special inputs in loop */ |
241 | jmp L(SPECIAL_VALUES_LOOP) |
242 | # LOE rbx r15 r12d r13d |
243 | END(_ZGVeN8v_cosh_skx) |
244 | |
245 | .section .rodata, "a" |
246 | .align 64 |
247 | |
248 | #ifdef __svml_dcosh_data_internal_typedef |
249 | typedef unsigned int VUINT32; |
250 | typedef struct { |
251 | __declspec(align(64)) VUINT32 _dTp_h[(1<<4)][2]; |
252 | __declspec(align(64)) VUINT32 _dTn_h[(1<<4)][2]; |
253 | __declspec(align(64)) VUINT32 _dbShifter_UISA[8][2]; |
254 | __declspec(align(64)) VUINT32 _dPC2_UISA[8][2]; |
255 | __declspec(align(64)) VUINT32 _dPC3_UISA[8][2]; |
256 | __declspec(align(64)) VUINT32 _dPC4_UISA[8][2]; |
257 | __declspec(align(64)) VUINT32 _dPC5_UISA[8][2]; |
258 | __declspec(align(64)) VUINT32 _dPC6_UISA[8][2]; |
259 | __declspec(align(64)) VUINT32 _dPC7_UISA[8][2]; |
260 | __declspec(align(64)) VUINT32 _dbInvLn2[8][2]; |
261 | __declspec(align(64)) VUINT32 _dbLn2hi[8][2]; |
262 | __declspec(align(64)) VUINT32 _dbLn2lo[8][2]; |
263 | __declspec(align(64)) VUINT32 _dbShifter[8][2]; |
264 | __declspec(align(64)) VUINT32 _dPC2[8][2]; |
265 | __declspec(align(64)) VUINT32 _dPC3[8][2]; |
266 | __declspec(align(64)) VUINT32 _dPC4[8][2]; |
267 | __declspec(align(64)) VUINT32 _lExpMask[8][2]; |
268 | __declspec(align(64)) VUINT32 _dSign[8][2]; // 0x8000000000000000 |
269 | __declspec(align(64)) VUINT32 _iDomainRange[16][1]; |
270 | } __svml_dcosh_data_internal; |
271 | #endif |
272 | __svml_dcosh_data_internal: |
273 | /* _dTp_h */ |
274 | .quad 0x3fe0000000000000, 0x3fe0b5586cf9890f, 0x3fe172b83c7d517b, 0x3fe2387a6e756238 |
275 | .quad 0x3fe306fe0a31b715, 0x3fe3dea64c123422, 0x3fe4bfdad5362a27, 0x3fe5ab07dd485429 |
276 | .quad 0x3fe6a09e667f3bcd, 0x3fe7a11473eb0187, 0x3fe8ace5422aa0db, 0x3fe9c49182a3f090 |
277 | .quad 0x3feae89f995ad3ad, 0x3fec199bdd85529c, 0x3fed5818dcfba487, 0x3feea4afa2a490da |
278 | /* dTn_h */ |
279 | .align 64 |
280 | .quad 0x3fe0000000000000, 0x3fdea4afa2a490da, 0x3fdd5818dcfba487, 0x3fdc199bdd85529c |
281 | .quad 0x3fdae89f995ad3ad, 0x3fd9c49182a3f090, 0x3fd8ace5422aa0db, 0x3fd7a11473eb0187 |
282 | .quad 0x3fd6a09e667f3bcd, 0x3fd5ab07dd485429, 0x3fd4bfdad5362a27, 0x3fd3dea64c123422 |
283 | .quad 0x3fd306fe0a31b715, 0x3fd2387a6e756238, 0x3fd172b83c7d517b, 0x3fd0b5586cf9890f |
284 | .align 64 |
285 | .quad 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000 /* _dbShifter_UISA */ |
286 | .align 64 |
287 | .quad 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004, 0x3fe0000000000004 /* _dPC2_UISA */ |
288 | .align 64 |
289 | .quad 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543, 0x3fc5555555555543 /* _dPC3_UISA */ |
290 | .align 64 |
291 | .quad 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37, 0x3fa5555555484f37 /* _dPC4_UISA */ |
292 | .align 64 |
293 | .quad 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c, 0x3f81111111286a0c /* _dPC5_UISA */ |
294 | .align 64 |
295 | .quad 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116, 0x3f56c183da08f116 /* _dPC6_UISA */ |
296 | .align 64 |
297 | .quad 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da, 0x3f2a018d76da03da /* _dPC7_UISA */ |
298 | /* _dbT */ |
299 | .align 64 |
300 | .quad 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe, 0x3ff71547652b82fe /* _dbInvLn2 = 1/log(2) */ |
301 | .align 64 |
302 | .quad 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000, 0x3FE62E42FEFC0000 /* _dbLn2hi = log(2) hi */ |
303 | .align 64 |
304 | .quad 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899, 0xBDAC610CA86C3899 /* _dbLn2lo = log(2) lo */ |
305 | .align 64 |
306 | .quad 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000, 0x42B8000000000000 /* _dbShifter */ |
307 | .align 64 |
308 | .quad 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD, 0x3FDFFFFFFFFFFDBD /* _dPC2 */ |
309 | .align 64 |
310 | .quad 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14, 0x3FC5555570813E14 /* _dPC3 */ |
311 | .align 64 |
312 | .quad 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299, 0x3FA55555CF16D299 /* _dPC4 */ |
313 | .align 64 |
314 | .quad 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 /* _lExpMask */ |
315 | .align 64 |
316 | .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign */ |
317 | .align 64 |
318 | .long 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99, 0x40861d99 /* _iDomainRange 0x40861d9ac12a3e85 =(1021*2^K-0.5)*log(2)/2^K -needed for quick exp */ |
319 | .align 64 |
320 | .type __svml_dcosh_data_internal, @object |
321 | .size __svml_dcosh_data_internal, .-__svml_dcosh_data_internal |
322 | |