/* Function log1p vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
 *    Get short reciprocal approximation Rcp ~ 1/xh
 *    R = (Rcp*xh - 1.0) + Rcp*xl
 *    log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
 *       log(Rcp) is tabulated
 *
 */
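
/*
 * For exposition, the reduction above can be modeled in scalar C roughly
 * as follows.  This is a sketch only: log1p_model is an invented name,
 * the libm helpers stand in for vgetmantpd/vgetexppd/vrcp14pd/
 * vrndscalepd, and the direct log (rcp) call plus a degree-3 series
 * stand in for the 16-entry table and degree-9 polynomial used below.
 *
 *   #include <math.h>
 *
 *   static double
 *   log1p_model (double x)       // assumes x > -1, finite
 *   {
 *     // 1+x = hi + lo exactly (Fast2Sum; |max(1,x)| >= |min(1,x)|)
 *     double hi = fmax (1.0, x) + fmin (1.0, x);
 *     double lo = fmin (1.0, x) - (hi - fmax (1.0, x));
 *     // hi = 2^k * m with m in [1, 2)
 *     int k;
 *     double m = 2.0 * frexp (hi, &k);
 *     k--;
 *     // short reciprocal of m, rounded to 4 fractional bits
 *     double rcp = nearbyint (16.0 / m) / 16.0;
 *     // reduced argument R = (Rcp*xh - 1.0) + Rcp*xl, |R| small
 *     double r = (rcp * m - 1.0) + rcp * ldexp (lo, -k);
 *     // log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
 *     return k * M_LN2 - log (rcp) + (r - 0.5 * r * r + r * r * r / 3.0);
 *   }
 */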

/* Offsets (in bytes) for data table __svml_dlog1p_data_internal_avx512;
   each field below is 64-byte aligned, matching the typedef at the end
   of this file.  */
#define Log_tbl	0
#define One	128
#define SgnMask	192
#define C075	256
#define poly_coeff9	320
#define poly_coeff8	384
#define poly_coeff7	448
#define poly_coeff6	512
#define poly_coeff5	576
#define poly_coeff4	640
#define poly_coeff3	704
#define poly_coeff2	768
#define L2	832

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_log1p_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	One+__svml_dlog1p_data_internal_avx512(%rip), %zmm7
	vmovups	SgnMask+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
	vmovaps	%zmm0, %zmm9
	vaddpd	{rn-sae}, %zmm9, %zmm7, %zmm11
	vandpd	%zmm14, %zmm9, %zmm8

	/* compute 1+x as high, low parts */
	vmaxpd	{sae}, %zmm9, %zmm7, %zmm10
	vminpd	{sae}, %zmm9, %zmm7, %zmm12

	/* GetMant(x), normalized to [1, 2) for x>=0, NaN for x<0 */
	vgetmantpd $8, {sae}, %zmm11, %zmm6

	/* GetExp(x) */
	vgetexppd {sae}, %zmm11, %zmm5
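
	/* (1+x) - max(1, x): the Fast2Sum error term; combined with
	   min(1, x) below, it gives the exact low part xl of 1+x */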
	vsubpd	{rn-sae}, %zmm10, %zmm11, %zmm13

	/* DblRcp ~ 1/Mantissa */
	vrcp14pd %zmm6, %zmm15

	/* Start polynomial evaluation */
	vmovups	poly_coeff9+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
	vmovups	poly_coeff7+__svml_dlog1p_data_internal_avx512(%rip), %zmm11

	/* Xl */
	vsubpd	{rn-sae}, %zmm13, %zmm12, %zmm2
	vxorpd	%zmm14, %zmm5, %zmm3

	/* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
	vrndscalepd $88, {sae}, %zmm15, %zmm4
	vmovups	poly_coeff5+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
	vmovups	poly_coeff6+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
	vmovups	poly_coeff3+__svml_dlog1p_data_internal_avx512(%rip), %zmm13

	/* Xl*2^(-Expon) */
	vscalefpd {rn-sae}, %zmm3, %zmm2, %zmm1

	/* Reduced argument: R = DblRcp*(Mantissa+Xl) - 1 */
	vfmsub213pd {rn-sae}, %zmm7, %zmm4, %zmm6
	vmovups	__svml_dlog1p_data_internal_avx512(%rip), %zmm3

	/*
	 * Table lookup
	 * Prepare exponent correction: DblRcp<0.75?
	 */
	vmovups	C075+__svml_dlog1p_data_internal_avx512(%rip), %zmm2

	/* Prepare table index */
	vpsrlq	$48, %zmm4, %zmm0
	vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm6
	vmovups	poly_coeff8+__svml_dlog1p_data_internal_avx512(%rip), %zmm1
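
	/* k1: lanes with DblRcp < 0.75 (predicate 17 = LT_OQ), which
	   take the exponent correction below; k0: lanes where R is NaN
	   (predicate 4 = NEQ_UQ), e.g. x <= -1 or x = NaN, which are
	   reprocessed in the special-values path */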
	vcmppd	$17, {sae}, %zmm2, %zmm4, %k1
	vcmppd	$4, {sae}, %zmm6, %zmm6, %k0
	vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
	vmovups	poly_coeff4+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
	vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
	vmovups	L2+__svml_dlog1p_data_internal_avx512(%rip), %zmm4
	vpermt2pd Log_tbl+64+__svml_dlog1p_data_internal_avx512(%rip), %zmm0, %zmm3

	/* add 1 to Expon if DblRcp<0.75 */
	vaddpd	{rn-sae}, %zmm7, %zmm5, %zmm5{%k1}

	/* R^2 */
	vmulpd	{rn-sae}, %zmm6, %zmm6, %zmm0
	vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm10
	vmovups	poly_coeff2+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
	vmulpd	{rn-sae}, %zmm0, %zmm0, %zmm15
	vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12
	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1
	kmovw	%k0, %edx
	vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm10

	/* polynomial */
	vfmadd213pd {rn-sae}, %zmm10, %zmm15, %zmm1
	vfmadd213pd {rn-sae}, %zmm6, %zmm0, %zmm1
	vaddpd	{rn-sae}, %zmm1, %zmm3, %zmm6
	vfmadd213pd {rn-sae}, %zmm6, %zmm4, %zmm5
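
	/* merge in the sign of x (kept in zmm8) so that log1p(-0.0)
	   returns -0.0 */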
	vorpd	%zmm8, %zmm5, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm9

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

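	/*
	 * The fallback below is, in effect (illustrative C only; the
	 * names mask, in and out are invented here):
	 *
	 *   // mask (edx) has bit i set if lane i hit a special input
	 *   for (int i = 0; i < 8; i++)
	 *     if (mask & (1 << i))
	 *       out[i] = log1p (in[i]);   // scalar libm call
	 */
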
L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm9, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
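	/* pass lane r14 of the saved input to the scalar log1p below,
	   which gives the special value its usual libm treatment */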
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	log1p@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_log1p_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_dlog1p_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Log_tbl[16][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 SgnMask[8][2];
	__declspec(align(64)) VUINT32 C075[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog1p_data_internal_avx512;
#endif
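
/*
 * Table note: Log_tbl[j] effectively holds -log (Rcp) for the rounded
 * reciprocal indexed by j; log (2) is folded out when Rcp < 0.75,
 * matching the lanes whose exponent is bumped by 1 above.  For
 * instance, entry 8 (0x3fd269621134db92 ~ 0.2877) is -log (0.75), and
 * entry 1 (0xbfaf0a30c01162a6 ~ -0.0606) is -log (0.53125) - log (2).
 */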
__svml_dlog1p_data_internal_avx512:
	/* Log_tbl */
	.quad	0x0000000000000000
	.quad	0xbfaf0a30c01162a6
	.quad	0xbfbe27076e2af2e6
	.quad	0xbfc5ff3070a793d4
	.quad	0xbfcc8ff7c79a9a22
	.quad	0xbfd1675cababa60e
	.quad	0xbfd4618bc21c5ec2
	.quad	0xbfd739d7f6bbd007
	.quad	0x3fd269621134db92
	.quad	0x3fcf991c6cb3b379
	.quad	0x3fca93ed3c8ad9e3
	.quad	0x3fc5bf406b543db2
	.quad	0x3fc1178e8227e47c
	.quad	0x3fb9335e5d594989
	.quad	0x3fb08598b59e3a07
	.quad	0x3fa0415d89e74444
	/* One */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* SgnMask */
	.align	64
	.quad	0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
	/* C075 0.75 */
	.align	64
	.quad	0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
	/* poly_coeff9 */
	.align	64
	.quad	0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70
	/* poly_coeff8 */
	.align	64
	.quad	0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62
	/* poly_coeff7 */
	.align	64
	.quad	0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF
	/* poly_coeff6 */
	.align	64
	.quad	0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06
	/* poly_coeff5 */
	.align	64
	.quad	0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C
	/* poly_coeff4 */
	.align	64
	.quad	0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD
	/* poly_coeff3 */
	.align	64
	.quad	0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466
	/* poly_coeff2 */
	.align	64
	.quad	0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6
	/* L2 = log(2) */
	.align	64
	.quad	0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF
	.align	64
	.type	__svml_dlog1p_data_internal_avx512, @object
	.size	__svml_dlog1p_data_internal_avx512, .-__svml_dlog1p_data_internal_avx512