/* Function exp2 vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Double precision mantissa represented as: 1.b1b2b3 ... b52
 *   Constant for double precision: S = 2^48 x 1.5
 *
 *   2^X = 2^Xo x 2^{X-Xo}
 *   2^X = 2^K x 2^fo x 2^{X-Xo}
 *   2^X = 2^K x 2^fo x 2^r
 *
 *   2^K  --> Manual scaling
 *   2^fo --> Table lookup
 *   r    --> 1 + poly (r = X - Xo)
 *
 *   Xo = K + fo
 *   Xo = K + 0.x1x2x3x4
 *
 *   r = X - Xo
 *     = Vreduce(X, imm)
 *     = X - VRndScale(X, imm),  where Xo = VRndScale(X, imm)
 *
 *   Rnd(S + X) = S + Xo,  where S is selected as S = 2^48 x 1.5
 *   S + X = S + floor(X) + 0.x1x2x3x4
 *   Rnd(S + X) = Rnd(2^48 x 1.5 + X)
 *   (Note: 2^exp x 1.b1b2b3 ... b52, 2^{exp-52} = 2^-4 for exp = 48)
 *
 *   exp2(x) = 2^K x 2^fo x (1 + poly(r)), where 2^r = 1 + poly(r)
 *
 *   Scale back:
 *   dest = src1 x 2^floor(src2)
 */
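
/* For reference, a scalar C sketch of the algorithm above.  It is an
   illustration only, not part of the build: frac_power and c1..c6 are
   hypothetical stand-ins for the Frac_PowerD0 and poly_coeff1..6
   constants defined at the end of this file (their definitions are
   elided here), floor() stands in for VREDUCE/VRNDSCALE, and the
   special-input path for large |x| is ignored.

   #include <math.h>

   extern const double frac_power[16];          // 2^(i/16), i = 0..15
   extern const double c1, c2, c3, c4, c5, c6;  // poly_coeff1..6

   static double
   exp2_sketch (double x)
   {
     double x16 = floor (x * 16.0);  // fixed-point form: 16*K + idx
     double r = x - x16 / 16.0;      // reduced argument, r in [0, 2^-4)
     int idx = (int) x16 & 0xF;      // table index bits 0.b1b2b3b4
     double t = frac_power[idx];     // table value 2^(0.b1b2b3b4)
     // 2^r = 1 + r*p(r); p evaluated by Horner's scheme (the FMA chain).
     double p = c1 + r * (c2 + r * (c3 + r * (c4 + r * (c5 + r * c6))));
     // T + T*r*p(r), then scale by 2^K = 2^floor(x) as vscalefpd does.
     return scalbn (t + (t * r) * p, (int) floor (x));
   }  */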

/* Offsets for data table __svml_dexp2_data_internal_avx512.  */
#define Frac_PowerD0	0
#define poly_coeff1	128
#define poly_coeff2	192
#define poly_coeff3	256
#define poly_coeff4	320
#define poly_coeff5	384
#define poly_coeff6	448
#define add_const	512
#define AbsMask	576
#define Threshold	640
#define _lIndexMask	704
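
/* The offsets are byte distances into __svml_dexp2_data_internal_avx512
   below: Frac_PowerD0 holds 16 doubles (128 bytes), and every later
   field is one 64-byte vector of 8 broadcast doubles, hence the steps
   of 64.  */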

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_exp2_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	poly_coeff5+__svml_dexp2_data_internal_avx512(%rip), %zmm14
	vmovups	poly_coeff6+__svml_dexp2_data_internal_avx512(%rip), %zmm6

	/*
	 * Reduced argument: r = x - RD(x * 2^4) / 2^4, in [0, 2^-4)
	 * (imm $65 selects 4 fixed fraction bits with round-down),
	 * where VREDUCE is available
	 */
	vreducepd	$65, {sae}, %zmm0, %zmm10
	vmovups	poly_coeff4+__svml_dexp2_data_internal_avx512(%rip), %zmm7
	vmovups	add_const+__svml_dexp2_data_internal_avx512(%rip), %zmm3
	vmovups	poly_coeff3+__svml_dexp2_data_internal_avx512(%rip), %zmm8
	vmovups	__svml_dexp2_data_internal_avx512(%rip), %zmm13

	/* c6*r + c5 */
	vfmadd231pd	{rn-sae}, %zmm10, %zmm6, %zmm14
	vmovups	poly_coeff2+__svml_dexp2_data_internal_avx512(%rip), %zmm9
	vmovups	Threshold+__svml_dexp2_data_internal_avx512(%rip), %zmm2

	/*
	 * HA
	 * Variables and constants
	 * Load constants and vector(s)
	 */
	vmovups	poly_coeff1+__svml_dexp2_data_internal_avx512(%rip), %zmm11

	/* c6*r^2 + c5*r + c4 */
	vfmadd213pd	{rn-sae}, %zmm7, %zmm10, %zmm14

	/*
	 * Integer form of K+0.b1b2b3b4 in lower bits - call K_plus_f0
	 * Mantissa of normalized double precision FP: 1.b1b2...b52
	 */
	vaddpd	{rd-sae}, %zmm3, %zmm0, %zmm4
	vandpd	AbsMask+__svml_dexp2_data_internal_avx512(%rip), %zmm0, %zmm1
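
	/* The vaddpd above is a shifter trick; a C sketch (the memcpy bit
	   access and function name are illustrative, and plain C addition
	   rounds to nearest where the asm specifies {rd-sae} round-down):
	   adding S = 1.5 * 2^48 forces the sum's ulp to 2^-4, so the low
	   mantissa bits of S + x hold floor(x * 16) = 16*K + idx, and
	   _lIndexMask (0xF) extracts the 4-bit table index idx.

	   #include <stdint.h>
	   #include <string.h>

	   static unsigned
	   table_index (double x)
	   {
	     double s = 0x1.8p48 + x;  // add_const = 2^48 x 1.5
	     uint64_t bits;
	     memcpy (&bits, &s, sizeof bits);
	     return bits & 0xF;        // cf. vpandq with _lIndexMask
	   }  */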

	/* c6*r^3 + c5*r^2 + c4*r + c3 */
	vfmadd213pd	{rn-sae}, %zmm8, %zmm10, %zmm14
	vcmppd	$29, {sae}, %zmm2, %zmm1, %k0

	/* c6*r^4 + c5*r^3 + c4*r^2 + c3*r + c2 */
	vfmadd213pd	{rn-sae}, %zmm9, %zmm10, %zmm14
	kmovw	%k0, %edx

	/* c6*r^5 + c5*r^4 + c4*r^3 + c3*r^2 + c2*r + c1 */
	vfmadd213pd	{rn-sae}, %zmm11, %zmm10, %zmm14

	/* Table value: 2^(0.b1b2b3b4) */
	vpandq	_lIndexMask+__svml_dexp2_data_internal_avx512(%rip), %zmm4, %zmm5
	vpermt2pd	Frac_PowerD0+64+__svml_dexp2_data_internal_avx512(%rip), %zmm5, %zmm13
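
	/* vpermt2pd reads zmm13 (Frac_PowerD0[0..7]) and the 64-byte
	   memory operand (Frac_PowerD0[8..15]) as a single 16-entry
	   table and selects, per lane, the entry named by the 4 index
	   bits kept in zmm5.  */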

	/* T*r */
	vmulpd	{rn-sae}, %zmm10, %zmm13, %zmm12

	/* T + (T*r*(c6*r^5 + c5*r^4 + c4*r^3 + c3*r^2 + c2*r + c1)) */
	vfmadd213pd	{rn-sae}, %zmm13, %zmm12, %zmm14

	/* Scaling placed at the end to avoid accuracy loss when T*r*scale underflows */
	vscalefpd	{rn-sae}, %zmm0, %zmm14, %zmm1
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps	%zmm1, %zmm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm0, 64(%rsp)
	vmovups	%zmm1, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm1

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm1

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm1

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	vmovsd	64(%rsp, %r14, 8), %xmm0
	call	exp2@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd	%xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
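
	/* The special-input path, as a C sketch (lane_in/lane_out are
	   illustrative names): each set bit in the range mask marks a
	   lane flagged by the Threshold compare; that lane is recomputed
	   with the scalar exp2 and patched into the vector result.

	   #include <math.h>

	   static void
	   fixup_lanes (const double lane_in[8], double lane_out[8],
			unsigned mask)
	   {
	     for (unsigned i = 0; i < 8; i++)      // L(SPECIAL_VALUES_LOOP)
	       if (mask & (1u << i))               // btl %r12d, %r13d
	         lane_out[i] = exp2 (lane_in[i]);  // call exp2@PLT
	   }  */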
END(_ZGVeN8v_exp2_skx)

	.section .rodata, "a"
	.align 64

#ifdef __svml_dexp2_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Frac_PowerD0[16][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 add_const[8][2];
	__declspec(align(64)) VUINT32 AbsMask[8][2];
	__declspec(align(64)) VUINT32 Threshold[8][2];
	__declspec(align(64)) VUINT32 _lIndexMask[8][2];
} __svml_dexp2_data_internal_avx512;
#endif
__svml_dexp2_data_internal_avx512:
	/* Frac_PowerD0 */
	.quad	0x3FF0000000000000
	.quad	0x3FF0B5586CF9890F
	.quad	0x3FF172B83C7D517B
	.quad	0x3FF2387A6E756238
	.quad	0x3FF306FE0A31B715
	.quad	0x3FF3DEA64C123422
	.quad	0x3FF4BFDAD5362A27
	.quad	0x3FF5AB07DD485429
	.quad	0x3FF6A09E667F3BCD
	.quad	0x3FF7A11473EB0187
	.quad	0x3FF8ACE5422AA0DB
	.quad	0x3FF9C49182A3F090
	.quad	0x3FFAE89F995AD3AD
	.quad	0x3FFC199BDD85529C
	.quad	0x3FFD5818DCFBA487
	.quad	0x3FFEA4AFA2A490DA
	.align 64
	.quad	0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B, 0x3FE62E42FEFA398B /* == poly_coeff1 == */
	.align 64
	.quad	0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A, 0x3FCEBFBDFF84555A /* == poly_coeff2 == */
	.align 64
	.quad	0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9, 0x3FAC6B08D4AD86B9 /* == poly_coeff3 == */
	.align 64
	.quad	0x3F83B2AD1B172252, 0x3F83B2AD1B172252, 0x3F83B2AD1B172252, 0x3F83B2AD1B172252, 0x3F83B2AD1B172252, 0x3F83B2AD1B172252, 0x3F83B2AD1B172252, 0x3F83B2AD1B172252 /* == poly_coeff4 == */
	.align 64
	.quad	0x3F55D7472713CD19, 0x3F55D7472713CD19, 0x3F55D7472713CD19, 0x3F55D7472713CD19, 0x3F55D7472713CD19, 0x3F55D7472713CD19, 0x3F55D7472713CD19, 0x3F55D7472713CD19 /* == poly_coeff5 == */
	.align 64
	.quad	0x3F24A1D7F526371B, 0x3F24A1D7F526371B, 0x3F24A1D7F526371B, 0x3F24A1D7F526371B, 0x3F24A1D7F526371B, 0x3F24A1D7F526371B, 0x3F24A1D7F526371B, 0x3F24A1D7F526371B /* == poly_coeff6 == */
	.align 64
	.quad	0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000, 0x42F8000000000000 /* add_const */
	.align 64
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* AbsMask */
	.align 64
	.quad	0x408fefff00000000, 0x408fefff00000000, 0x408fefff00000000, 0x408fefff00000000, 0x408fefff00000000, 0x408fefff00000000, 0x408fefff00000000, 0x408fefff00000000 /* Threshold */
	.align 64
	.quad	0x000000000000000F, 0x000000000000000F, 0x000000000000000F, 0x000000000000000F, 0x000000000000000F, 0x000000000000000F, 0x000000000000000F, 0x000000000000000F /* _lIndexMask */
	.align 64
	.type	__svml_dexp2_data_internal_avx512, @object
	.size	__svml_dexp2_data_internal_avx512, .-__svml_dexp2_data_internal_avx512