/* Function exp2f vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 *   ALGORITHM DESCRIPTION:
 *
 *   exp2(x) = 2^n * T[j] * (1 + P(y))
 *   where
 *        x = m*(1/K) + y,    y in [-1/K..1/K]
 *        m = n*K + j,        m, n, j are signed integers, j in [-K/2..K/2]
 *
 *        values of 2^(j/K) are tabulated in T[j]
 *
 *        P(y) is a minimax polynomial approximation of exp2(y)-1
 *        on the small interval [-1/K..1/K]
 *
 *   The code in this file performs no table lookup: x is rounded to an
 *   integer n, y = x - n, and the polynomial (whose constant term is 1.0)
 *   is evaluated at y and then scaled by 2^n.
 *
 *   Special cases:
 *
 *   exp2(NaN)  = NaN
 *   exp2(+INF) = +INF
 *   exp2(-INF) = 0
 *   exp2(x)    = 1 for subnormals
 *   For IEEE float
 *     if x >= 128.0 then exp2f(x) overflows
 *     if x < -151.0 then exp2f(x) underflows
 *
 */

/* Byte offsets of the rows of the data table __svml_sexp2_data_internal.
   Each row is 32 bytes: eight 32-bit lanes, i.e. one YMM register.  */
#define _sShifter			0
#define _sPC0				32
#define _sPC1				64
#define _sPC2				96
#define _sPC3				128
#define _sPC4				160
#define _sPC5				192
#define _sPC6				224
#define _iAbsMask			256
#define _iDomainRange			288

#include <sysdep.h>

/* _ZGVdN8v_exp2f_avx2 -- exp2f for eight packed single-precision values.
 *
 * In:     ymm0 = eight float inputs x
 * Out:    ymm0 = eight float results
 * Stack:  rbp frame; rsp is realigned to 32 bytes and 96 bytes are
 *         reserved.  Only the slow path uses them:
 *              0(%rsp)  saved r14
 *              8(%rsp)  saved r13
 *             16(%rsp)  saved r12
 *             32(%rsp)  copy of the input vector
 *             64(%rsp)  result vector, patched lane by lane
 *
 * Fast path: adding and then subtracting _sShifter yields x rounded to
 * an integer N, R = x - N, a degree-6 polynomial (_sPC6 .. _sPC0) is
 * evaluated at R in Horner form with FMA, and the low bits of
 * (x + _sShifter), shifted up into the exponent field, are integer-added
 * to the polynomial result to scale it by 2^N.
 *
 * Slow path: every lane whose |x| bit pattern, compared as a signed
 * 32-bit integer, is greater than _iDomainRange (the bits of 126.0) is
 * recomputed by calling the scalar exp2f.  The bit patterns of Inf and
 * NaN are larger than that bound, so those lanes go this way as well.
 */
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_exp2f_avx2)
	/* Set up the rbp frame, realign rsp to 32 bytes and reserve the
	   96-byte scratch area described above.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$96, %rsp
	vmovups	_sShifter+__svml_sexp2_data_internal(%rip), %ymm1

	/* Highest-order coefficient: initial Horner accumulator.  */
	vmovups	_sPC6+__svml_sexp2_data_internal(%rip), %ymm7

	/* Implementation:
	   ymm6 = x + shifter
	   ymm4 = ymm6 - shifter = N (x rounded to an integer).  */
	vaddps	%ymm1, %ymm0, %ymm6
	vsubps	%ymm1, %ymm6, %ymm4

	/* 2^N: shift the bits of (x + shifter) left by 23 so that its low
	   bits land in the float exponent field.  */
	vpslld	$23, %ymm6, %ymm8

	/* R = x - N  */
	vsubps	%ymm4, %ymm0, %ymm5

	/* Polynomial, Horner form: ymm7 = ymm7 * R + _sPCk, k = 5 .. 0.  */
	vfmadd213ps _sPC5+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
	vfmadd213ps _sPC4+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
	vfmadd213ps _sPC3+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
	vfmadd213ps _sPC2+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
	vfmadd213ps _sPC1+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
	vfmadd213ps _sPC0+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7

	/* Check for overflow/underflow: ymm2 = |x| bits, ymm3 = per-lane
	   mask of (ymm2 > _iDomainRange) as signed 32-bit integers, and
	   edx receives one bit per lane from the mask's sign bits.  */
	vandps	_iAbsMask+__svml_sexp2_data_internal(%rip), %ymm0, %ymm2
	vpcmpgtd _iDomainRange+__svml_sexp2_data_internal(%rip), %ymm2, %ymm3
	vmovmskps %ymm3, %edx

	/* Reconstruction: integer add of the shifted exponent bits to the
	   polynomial result.  */
	vpaddd	%ymm8, %ymm7, %ymm1
	testl	%edx, %edx

	/* Go to special inputs processing branch if any lane is flagged.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps	%ymm1, %ymm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the in-frame CFI state for the code that follows
	   the ret.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill the inputs and the fast-path results so that individual
	   lanes can be read and overwritten through memory.  */
	vmovups	%ymm0, 32(%rsp)
	vmovups	%ymm1, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm1

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	/* Clear the upper YMM halves before calling scalar code.  */
	vzeroupper
	/* r12d = lane index (starts at 0), r13d = lane mask, r14 = index
	   copy used for addressing around the call.  All three are saved
	   first and restored before leaving the slow path.  */
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = mask bit for the current lane.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All eight lanes visited: restore the saved registers and reload
	   the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	64(%rsp), %ymm1

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm1

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* The 32-bit move zero-extends the lane index into r14, which is
	   then used to address the lane before and after the call.  */
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0
	call	exp2f@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Overwrite this lane of the spilled result with the scalar value.  */
	vmovss	%xmm0, 64(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_exp2f_avx2)

/* Constant table for the AVX2 exp2f kernel.  Every row is 32-byte aligned
   and holds the same 32-bit value in all eight lanes; the row order must
   match the _sShifter .. _iDomainRange byte offsets (0, 32, .. 288).  */
	.section .rodata, "a"
	.align	32

/* C-style description of the table layout.  It is only emitted if
   __svml_sexp2_data_internal_typedef is defined; it is not assembly, so
   presumably that macro is never defined when this file is assembled.  */
#ifdef __svml_sexp2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 _sShifter[8][1];
	__declspec(align(32)) VUINT32 _sPC0[8][1];
	__declspec(align(32)) VUINT32 _sPC1[8][1];
	__declspec(align(32)) VUINT32 _sPC2[8][1];
	__declspec(align(32)) VUINT32 _sPC3[8][1];
	__declspec(align(32)) VUINT32 _sPC4[8][1];
	__declspec(align(32)) VUINT32 _sPC5[8][1];
	__declspec(align(32)) VUINT32 _sPC6[8][1];
	__declspec(align(32)) VUINT32 _iAbsMask[8][1];
	__declspec(align(32)) VUINT32 _iDomainRange[8][1];
} __svml_sexp2_data_internal;
#endif
__svml_sexp2_data_internal:
	/* 1.5 * 2^23: added to and subtracted from x to round it to an
	   integer.  */
	.long	0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
	/* Polynomial coefficients, constant term first.  _sPC0 is 1.0 and
	   _sPC1 is ln(2) rounded to float; the remaining values are close
	   to ln(2)^k / k!.  */
	.align	32
	.long	0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC0 */
	.align	32
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* _sPC1 */
	.align	32
	.long	0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef /* _sPC2 */
	.align	32
	.long	0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf /* _sPC3 */
	.align	32
	.long	0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c /* _sPC4 */
	.align	32
	.long	0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51 /* _sPC5 */
	.align	32
	.long	0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c /* _sPC6 */
	/* Common constants: mask that clears the sign bit, and the bit
	   pattern of the |x| bound above which a lane is handed to the
	   scalar routine.  */
	.align	32
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
	.align	32
	.long	0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* _iDomainRange=126.0 */
	.align	32
	.type	__svml_sexp2_data_internal, @object
	.size	__svml_sexp2_data_internal, .-__svml_sexp2_data_internal


/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S  */