1/* Function exp2f vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * Single precision mantissa represented as: 1.b1b2b3 ... b23
23 * Constant for single precision: S = 2^19 x 1.5
24 *
25 * 2^X = 2^Xo x 2^{X-Xo}
26 * 2^X = 2^K x 2^fo x 2^{X-Xo}
27 * 2^X = 2^K x 2^fo x 2^r
28 *
29 * 2^K --> Manual scaling
30 * 2^fo --> Table lookup
31 * r --> 1 + poly (r = X - Xo)
32 *
33 * Xo = K + fo
34 * Xo = K + 0.x1x2x3x4
35 *
36 * r = X - Xo
37 * = Vreduce(X, imm)
38 * = X - VRndScale(X, imm), where Xo = VRndScale(X, imm)
39 *
40 * Rnd(S + X) = S + Xo, where S is selected as S = 2^19 x 1.5
41 * S + X = S + floor(X) + 0.x1x2x3x4
42 * Rnd(S + X) = Rnd(2^19 x 1.5 + X)
43 * (Note: 2^exp x 1.b1b2b3 ... b23, 2^{exp-23} = 2^-4 for exp=19)
44 *
45 * exp2(x) = 2^K x 2^fo x (1 + poly(r)), where 2^r = 1 + poly(r)
46 *
47 * Scale back:
48 * dest = src1 x 2^floor(src2)
49 *
50 *
51 */
52
/* Byte offsets for data table __svml_sexp2_data_internal_avx512
   (defined at the bottom of this file).  Each field is one 512-bit
   vector of 16 single-precision values, hence consecutive offsets
   step by 64 bytes.  */
#define Frac_PowerS0 0
#define poly_coeff1 64
#define poly_coeff2 128
#define poly_coeff3 192
#define add_const 256
#define AbsMask 320
#define Threshold 384
62
63#include <sysdep.h>
64
	.section .text.evex512, "ax", @progbits
/* _ZGVeN16v_exp2f_skx: vector-ABI entry point (16 unmasked float
   lanes).  Computes 2^x lane-wise for the 16 floats in %zmm0 and
   returns the results in %zmm0.  Lanes with |x| > Threshold (126.0)
   are redone one at a time through the scalar exp2f in the
   special-values path below, so overflow/underflow/NaN behavior
   matches the scalar routine.  */
ENTRY(_ZGVeN16v_exp2f_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	/* Align stack to 64 bytes and reserve 192 bytes: 64-byte slots
	   at 64(%rsp)/128(%rsp) spill %zmm0/%zmm1 around scalar calls,
	   and 0..23(%rsp) saves %r12-%r14 (special path only).  */
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups add_const+__svml_sexp2_data_internal_avx512(%rip), %zmm3

	/*
	 * Reduced argument r = x - VRndScale(x, imm)
	 * where VREDUCE is available.
	 * imm $65 == 0x41: keep the fraction below 2^-4 granularity,
	 * matching the 4 table-index bits 0.b1b2b3b4.
	 */
	vreduceps $65, {sae}, %zmm0, %zmm6
	vmovups poly_coeff3+__svml_sexp2_data_internal_avx512(%rip), %zmm5
	vmovups poly_coeff2+__svml_sexp2_data_internal_avx512(%rip), %zmm10
	vmovups Threshold+__svml_sexp2_data_internal_avx512(%rip), %zmm2

	/*
	 *
	 * HA
	 * Variables and constants
	 * Load constants and vector(s)
	 */
	vmovups poly_coeff1+__svml_sexp2_data_internal_avx512(%rip), %zmm7

	/*
	 * Integer form of K+0.b1b2b3b4 in lower bits - call K_plus_f0.
	 * Adding S = 2^19 x 1.5 with round-down ({rd-sae}) pushes
	 * floor(x) and the top 4 fraction bits into the low mantissa
	 * bits of the normalized single precision FP: 1.b1b2...b23.
	 */
	vaddps	{rd-sae}, %zmm3, %zmm0, %zmm4
	vandps	AbsMask+__svml_sexp2_data_internal_avx512(%rip), %zmm0, %zmm1

	/* c3*r + c2 */
	vfmadd231ps {rn-sae}, %zmm6, %zmm5, %zmm10

	/* k0 = mask of lanes with |x| > Threshold (predicate 30 is
	   _CMP_GT_OQ); those lanes take the scalar fallback.  */
	vcmpps	$30, {sae}, %zmm2, %zmm1, %k0

	/* c3*r^2 + c2*r + c1 */
	vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm10

	/* Table value: T = 2^(0.b1b2b3b4); VPERMPS with a 16-entry
	   table uses only the low 4 bits of each dword index.  */
	vpermps __svml_sexp2_data_internal_avx512(%rip), %zmm4, %zmm9
	kmovw	%k0, %edx

	/* T*r */
	vmulps	{rn-sae}, %zmm6, %zmm9, %zmm8

	/* T + (T*r*(c3*r^2 + c2*r + c1)) == T*2^r */
	vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm10

	/* Scale back by 2^floor(x).  Scaling placed at the end to avoid
	   accuracy loss when T*r*scale underflows.  */
	vscalefps {rn-sae}, %zmm0, %zmm10, %zmm1
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps %zmm1, %zmm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill original arguments (64(%rsp)) and vector results
	   (128(%rsp)) so single lanes can be patched below.  */
	vmovups %zmm0, 64(%rsp)
	vmovups %zmm1, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm1

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* %r12d = current lane index, starts at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* %r13d = 16-bit mask of special-value lanes.  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function if this lane's mask bit is set.  */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All 16 lanes handled: restore the saved callee-saved
	   registers and reload the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups 128(%rsp), %zmm1

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm1

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	/* Load lane %r14 of the spilled argument vector, call the
	   scalar exp2f, and write the result back into the spilled
	   result vector.  */
	vmovss	64(%rsp, %r14, 4), %xmm0
	call	exp2f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process next special input in loop.  */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_exp2f_skx)
222
	.section .rodata, "a"
	.align	64

#ifdef __svml_sexp2_data_internal_avx512_typedef
/* C-style description of the table layout (documentation only: the
   guard macro is never defined).  Each member is one 64-byte vector
   of 16 32-bit values, matching the #define offsets at the top of
   this file.  */
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 Frac_PowerS0[16][1];
	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 add_const[16][1];
	__declspec(align(64)) VUINT32 AbsMask[16][1];
	__declspec(align(64)) VUINT32 Threshold[16][1];
} __svml_sexp2_data_internal_avx512;
#endif
__svml_sexp2_data_internal_avx512:
	/* Frac_PowerS0: lookup table 2^(j/16) for j = 0..15
	   (e.g. entry 8 is 0x3FB504F3 = sqrt(2)).  */
	.long	0x3F800000
	.long	0x3F85AAC3
	.long	0x3F8B95C2
	.long	0x3F91C3D3
	.long	0x3F9837F0
	.long	0x3F9EF532
	.long	0x3FA5FED7
	.long	0x3FAD583F
	.long	0x3FB504F3
	.long	0x3FBD08A4
	.long	0x3FC5672A
	.long	0x3FCE248C
	.long	0x3FD744FD
	.long	0x3FE0CCDF
	.long	0x3FEAC0C7
	.long	0x3FF5257D
	.align	64
	.long	0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222, 0x3F317222 /* == poly_coeff1 == ~0.6931472 (~ln 2) */
	.align	64
	.long	0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B, 0x3E75F16B /* == poly_coeff2 == ~0.2402265 (~ln^2(2)/2) */
	.align	64
	.long	0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA, 0x3D6854CA /* == poly_coeff3 == ~0.0567308 (~ln^3(2)/6) */
	.align	64
	.long	0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000, 0x49400000 /* add_const: shifter S = 786432.0 = 2^19 x 1.5 */
	.align	64
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* AbsMask: clears the float sign bit */
	.align	64
	.long	0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* Threshold=126.0 */
	.align	64
	.type	__svml_sexp2_data_internal_avx512, @object
	.size	__svml_sexp2_data_internal_avx512, .-__svml_sexp2_data_internal_avx512
271

/* source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S */