/* Function expm1 vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   N = (int)(x*2^k/log(2.0)), R = x - N*log(2)/2^k
 *   exp(x) = 2^(N/2^k) * poly(R) is computed in high-low parts
 *   expm1(x) = exp(x)-1 is then obtained via multi-precision computation
 *
 */
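
/*
 * Illustrative scalar sketch of the reduction above (exposition only, not
 * part of the build; the helper name and the use of libm calls are
 * assumptions).  The vector code below uses k = 7, i.e. a 128-entry
 * 2^(j/128) table, and a two-word L2H/L2L split of log(2)/2^k instead of
 * the single constant used here:
 *
 *   #include <math.h>
 *
 *   static double
 *   expm1_sketch (double x)
 *   {
 *     const double scale = 128.0;                // 2^k with k = 7
 *     double n = nearbyint (x * scale / M_LN2);  // N = (int)(x*2^k/log(2.0))
 *     double r = x - n * (M_LN2 / scale);        // R = x - N*log(2)/2^k
 *     double p = r + r * r * (0.5 + r * (1.0 / 6.0 + r * (1.0 / 24.0)));
 *     double t = exp2 (n / scale);               // 2^(N/2^k), via table lookup below
 *     return (t - 1.0) + t * p;                  // expm1(x) = (T-1) + T*poly(R)
 *   }
 */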

/* Offsets for data table __svml_dexpm1_data_internal
 */
#define Expm1_HA_table 0
#define poly_coeff 2048
#define Log2e 2176
#define L2H 2208
#define L2L 2240
#define ExpAddConst 2272
#define IndexMask 2304
#define ExpMask 2336
#define MOne 2368
#define AbsMask 2400
#define Threshold 2432
#define L2 2464
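
/* The offsets mirror the layout of the data emitted at the end of this file:
   Expm1_HA_table holds 2^7 = 128 entries of two doubles (16 bytes each),
   i.e. 2048 bytes, followed by four 32-byte rows of polynomial coefficients
   (2048 + 4*32 = 2176 = Log2e); each remaining constant is a single 32-byte
   broadcast vector, hence the stride of 32 between the later offsets.  */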

#include <sysdep.h>

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN4v_expm1_avx2)
	pushq %rbp
	cfi_def_cfa_offset(16)
	movq %rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq $-32, %rsp
	subq $96, %rsp
	lea __svml_dexpm1_data_internal(%rip), %r8
	vmovapd %ymm0, %ymm3
	vmulpd Log2e+__svml_dexpm1_data_internal(%rip), %ymm3, %ymm4

	/* argument reduction */
	vmovupd L2H+__svml_dexpm1_data_internal(%rip), %ymm2
	vmovupd AbsMask+__svml_dexpm1_data_internal(%rip), %ymm5
	vroundpd $0, %ymm4, %ymm8
	vaddpd ExpAddConst+__svml_dexpm1_data_internal(%rip), %ymm8, %ymm0
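
	/* Adding the large ExpAddConst "shifter" constant to the rounded
	   N = RN(x*2^7/log(2)) leaves N in the low mantissa bits of ymm0:
	   IndexMask then extracts the byte offset of the 16-byte table entry
	   for (N mod 128), and ExpMask the bits that become the exponent of
	   the 2^floor(N/128) scale via the vpsllq $41 below.  */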
	vfnmadd213pd %ymm3, %ymm8, %ymm2

	/* table lookup */
	vandps IndexMask+__svml_dexpm1_data_internal(%rip), %ymm0, %ymm9
	vandpd %ymm5, %ymm3, %ymm6
	vcmpnle_uqpd Threshold+__svml_dexpm1_data_internal(%rip), %ymm6, %ymm7
	vfnmadd231pd L2L+__svml_dexpm1_data_internal(%rip), %ymm8, %ymm2
	vandnpd %ymm3, %ymm5, %ymm1
	vmovmskpd %ymm7, %eax
	vmovupd poly_coeff+64+__svml_dexpm1_data_internal(%rip), %ymm7
	vmulpd %ymm2, %ymm2, %ymm8
	vfmadd213pd poly_coeff+96+__svml_dexpm1_data_internal(%rip), %ymm2, %ymm7
	vandps ExpMask+__svml_dexpm1_data_internal(%rip), %ymm0, %ymm0
	vextractf128 $1, %ymm9, %xmm10
	vmovd %xmm9, %edx
	vmovd %xmm10, %esi
	vpextrd $2, %xmm9, %ecx
	vpextrd $2, %xmm10, %edi
	movslq %edx, %rdx
	movslq %ecx, %rcx
	movslq %esi, %rsi
	movslq %edi, %rdi
	vmovupd (%r8, %rdx), %xmm13
	vmovupd (%r8, %rcx), %xmm14
	vmovupd (%r8, %rsi), %xmm4
	vmovupd (%r8, %rdi), %xmm5
	vunpcklpd %xmm14, %xmm13, %xmm11
	vunpcklpd %xmm5, %xmm4, %xmm12
	vpsllq $41, %ymm0, %ymm10
	vunpckhpd %xmm14, %xmm13, %xmm15
	vunpckhpd %xmm5, %xmm4, %xmm13
	vinsertf128 $1, %xmm12, %ymm11, %ymm6
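
	/* ymm6 now holds the gathered high parts (first quad) of the four
	   table entries; their exponent field is filled in from ymm10 by the
	   vorpd below, while the low-order parts are combined into ymm14
	   further down.  */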

	/* polynomial */
	vmovupd poly_coeff+__svml_dexpm1_data_internal(%rip), %ymm12

	/* T-1 */
	vmovupd MOne+__svml_dexpm1_data_internal(%rip), %ymm11
	vfmadd213pd poly_coeff+32+__svml_dexpm1_data_internal(%rip), %ymm2, %ymm12
	vfmadd213pd %ymm7, %ymm8, %ymm12
	vorpd %ymm10, %ymm6, %ymm9
	vfmadd213pd %ymm2, %ymm8, %ymm12
	vaddpd %ymm11, %ymm9, %ymm2
	vinsertf128 $1, %xmm13, %ymm15, %ymm14

	/* Th1 = (Th-1) + Tl */
	vfmadd213pd %ymm2, %ymm10, %ymm14

	/* T = Th+Tl */
	vsubpd %ymm11, %ymm14, %ymm0
	vfmadd213pd %ymm14, %ymm12, %ymm0
	vorpd %ymm1, %ymm0, %ymm0
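
	/* ymm0 = (T - 1) + T*poly(R) = expm1(x).  OR-ing in the sign bits
	   extracted from x (ymm1) preserves the sign of zero results, so
	   expm1(-0.0) stays -0.0.  */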
	testl %eax, %eax

	/* Go to special inputs processing branch */
	jne L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 eax ymm0 ymm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq %rbp, %rsp
	popq %rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	vmovupd %ymm3, 32(%rsp)
	vmovupd %ymm0, 64(%rsp)
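
	/* The original input vector was saved at 32(%rsp) and the fast-path
	   result at 64(%rsp); every lane flagged in the range mask (eax) is
	   recomputed below with the scalar expm1 and patched into the saved
	   result, which is reloaded on the way back to the exit path.  */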
	# LOE rbx r12 r13 r14 r15 eax ymm0

	xorl %edx, %edx
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq %r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	movl %edx, %r12d
	movq %r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	movl %eax, %r13d
	movq %r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl %r12d, %r13d

	/* Call scalar math function */
	jc L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl %r12d
	cmpl $4, %r12d

	/* Check bits in range mask */
	jl L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq 16(%rsp), %r12
	cfi_restore(12)
	movq 8(%rsp), %r13
	cfi_restore(13)
	movq (%rsp), %r14
	cfi_restore(14)
	vmovupd 64(%rsp), %ymm0

	/* Go to exit */
	jmp L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	movl %r12d, %r14d
	vmovsd 32(%rsp, %r14, 8), %xmm0
	call expm1@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovsd %xmm0, 64(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN4v_expm1_avx2)

	.section .rodata, "a"
	.align 32

#ifdef __svml_dexpm1_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 Expm1_HA_table[(1<<8)][2];
	__declspec(align(32)) VUINT32 poly_coeff[4][4][2];
	__declspec(align(32)) VUINT32 Log2e[4][2];
	__declspec(align(32)) VUINT32 L2H[4][2];
	__declspec(align(32)) VUINT32 L2L[4][2];
	__declspec(align(32)) VUINT32 ExpAddConst[4][2];
	__declspec(align(32)) VUINT32 IndexMask[4][2];
	__declspec(align(32)) VUINT32 ExpMask[4][2];
	__declspec(align(32)) VUINT32 MOne[4][2];
	__declspec(align(32)) VUINT32 AbsMask[4][2];
	__declspec(align(32)) VUINT32 Threshold[4][2];
	__declspec(align(32)) VUINT32 L2[4][2];
} __svml_dexpm1_data_internal;
#endif
__svml_dexpm1_data_internal:
	/* Expm1_HA_table */
	.quad 0x0000000000000000, 0x0000000000000000
	.quad 0x0000163da8000000, 0x3e3fb33356d84a67
	.quad 0x00002c9a40000000, 0xbe3887f9f1190835
	.quad 0x00004315e8000000, 0x3e1b9fe12f5ce3e7
	.quad 0x000059b0d0000000, 0x3e48ac2ba1d73e2a
	.quad 0x0000706b28000000, 0x3e3ddf6ddc6dc404
	.quad 0x0000874518000000, 0x3e1d66f20230d7c9
	.quad 0x00009e3ec8000000, 0x3e46379c1a290f03
	.quad 0x0000b55870000000, 0xbe4833b784eb3a37
	.quad 0x0000cc9228000000, 0x3e4b923fba03db83
	.quad 0x0000e3ec30000000, 0x3e469e8d10103a17
	.quad 0x0000fb66b0000000, 0xbdb2ce50dcdf6e22
	.quad 0x00011301d0000000, 0x3df25b50a4ebbf1b
	.quad 0x00012abdc0000000, 0x3e1b0c72fee4aeb5
	.quad 0x0001429ab0000000, 0xbe356d2204cbefe7
	.quad 0x00015a98c8000000, 0x3e24b1ca24901aae
	.quad 0x000172b840000000, 0xbe4c15742919041c
	.quad 0x00018af938000000, 0x3e2191bd3777ee17
	.quad 0x0001a35be8000000, 0x3e4b7e5ba9e5b4c8
	.quad 0x0001bbe088000000, 0xbe4fdd19632a70c7
	.quad 0x0001d48730000000, 0x3e368b9aa7805b80
	.quad 0x0001ed5020000000, 0x3e47e6c8e5c40d00
	.quad 0x0002063b88000000, 0x3e18a3358ee3bac1
	.quad 0x00021f4990000000, 0x3e37ddc962552fd3
	.quad 0x0002387a70000000, 0xbe38a9dc7993e052
	.quad 0x000251ce50000000, 0xbe135670329f5521
	.quad 0x00026b4568000000, 0xbe40ec1916d42cc6
	.quad 0x000284dfe0000000, 0x3e3f5638096cf15d
	.quad 0x00029e9df8000000, 0xbe470108f69ed175
	.quad 0x0002b87fd0000000, 0x3e2b5b31ffbbd48d
	.quad 0x0002d285a8000000, 0xbe31bfcf4bff6e2b
	.quad 0x0002ecafa8000000, 0x3e33e2f5611ca0f4
	.quad 0x000306fe08000000, 0x3e418db8a96f46ad
	.quad 0x0003217100000000, 0xbe4d993e76563187
	.quad 0x00033c08b0000000, 0x3e4320b7fa64e431
	.quad 0x000356c560000000, 0xbe1b5803cdae772e
	.quad 0x000371a738000000, 0xbe28aac6ab1d7560
	.quad 0x00038cae70000000, 0xbe47d13cd3d2b1a8
	.quad 0x0003a7db38000000, 0xbe48d30048af21b7
	.quad 0x0003c32dc0000000, 0x3e489d47242000f9
	.quad 0x0003dea650000000, 0xbe4f6e5eee525f6f
	.quad 0x0003fa4508000000, 0xbe4a9bff22fa047f
	.quad 0x0004160a20000000, 0x3e3f72e29f84325c
	.quad 0x000431f5d8000000, 0x3e350a896dc70444
	.quad 0x00044e0860000000, 0x3e18624b40c4dbd0
	.quad 0x00046a41f0000000, 0xbe4717fd446d7686
	.quad 0x000486a2b8000000, 0xbe41f6197f61f2e2
	.quad 0x0004a32af0000000, 0x3e2afa7bcce5b17a
	.quad 0x0004bfdad8000000, 0xbe464eaec715e343
	.quad 0x0004dcb298000000, 0x3e3fddd0d63b36ef
	.quad 0x0004f9b278000000, 0xbe362d35952cc275
	.quad 0x000516daa0000000, 0x3e467b320e0897a9
	.quad 0x0005342b58000000, 0xbe362b07e20f57c4
	.quad 0x000551a4c8000000, 0x3e42ec9076297631
	.quad 0x00056f4738000000, 0xbe34ad8259913500
	.quad 0x00058d12d8000000, 0xbe4b41c016d6a1ea
	.quad 0x0005ab07e0000000, 0xbe45bd5eb539b67f
	.quad 0x0005c92688000000, 0x3e42ca35b80e258e
	.quad 0x0005e76f18000000, 0xbe4296f5bc8b20da
	.quad 0x000605e1b8000000, 0x3e376dc08b076f59
	.quad 0x0006247eb0000000, 0x3e0d2ac258f87d03
	.quad 0x0006434638000000, 0xbe4999e701c483c7
	.quad 0x0006623880000000, 0x3e42a91124893ecf
	.quad 0x00068155d8000000, 0xbe4d9ab467bf1d47
	.quad 0x0006a09e68000000, 0xbe380c4336f74d05
	.quad 0x0006c01278000000, 0xbe47a12a08944ab3
	.quad 0x0006dfb240000000, 0xbe4cd72e886ef8ea
	.quad 0x0006ff7df8000000, 0x3e3519483cf87e1b
	.quad 0x00071f75e8000000, 0x3e2d8bee7ba46e1e
	.quad 0x00073f9a48000000, 0x3e24b02e77ab934a
	.quad 0x00075feb58000000, 0xbe3bd98374091656
	.quad 0x0007806950000000, 0xbe00d1604f328fec
	.quad 0x0007a11470000000, 0x3e4f580c36bea881
	.quad 0x0007c1ed00000000, 0x3e330c1327c49334
	.quad 0x0007e2f338000000, 0xbe330b19defa2fd4
	.quad 0x0008042758000000, 0xbe4e0f2f724f90cc
	.quad 0x0008258998000000, 0x3e34cce128acf88b
	.quad 0x0008471a48000000, 0xbe3dc385331ad094
	.quad 0x000868d998000000, 0x3e4a2497640720ed
	.quad 0x00088ac7d8000000, 0x3e38a669966530bd
	.quad 0x0008ace540000000, 0x3e415506dadd3e2b
	.quad 0x0008cf3218000000, 0xbe34abb7410d55e3
	.quad 0x0008f1ae98000000, 0x3e31577362b98274
	.quad 0x0009145b08000000, 0x3e4c8ffe2c4530da
	.quad 0x00093737b0000000, 0x3e29b8bc9e8a0388
	.quad 0x00095a44c8000000, 0x3e4e4290774da41b
	.quad 0x00097d82a0000000, 0xbe00d8d83a30b6f8
	.quad 0x0009a0f170000000, 0x3e2940f737462137
	.quad 0x0009c49180000000, 0x3e451f8480e3e236
	.quad 0x0009e86318000000, 0x3e3e323231824ca8
	.quad 0x000a0c6678000000, 0x3e4aef2b2594d6d4
	.quad 0x000a309bf0000000, 0xbe4dae966539f470
	.quad 0x000a5503b0000000, 0x3e41f12ae45a1225
	.quad 0x000a799e10000000, 0x3e49859ac3796fd9
	.quad 0x000a9e6b58000000, 0xbe44301205e0a6de
	.quad 0x000ac36bc0000000, 0xbe0606431f9234cb
	.quad 0x000ae89f98000000, 0x3e35ad3ad5e8734d
	.quad 0x000b0e0728000000, 0x3e38db66590842ad
	.quad 0x000b33a2b8000000, 0x3e13c57ebdaff43a
	.quad 0x000b597290000000, 0xbe40d536338e3bf7
	.quad 0x000b7f76f0000000, 0x3e47daf237553d84
	.quad 0x000ba5b030000000, 0x3e2420c930819679
	.quad 0x000bcc1e90000000, 0x3e12f074891ee83d
	.quad 0x000bf2c258000000, 0x3e4eb8f0442046b8
	.quad 0x000c199be0000000, 0xbe43d56b1eeef9a7
	.quad 0x000c40ab60000000, 0xbd87c2c975903ef8
	.quad 0x000c67f130000000, 0xbe3a82eb4b5dec80
	.quad 0x000c8f6d98000000, 0xbe4fc8c257729a1e
	.quad 0x000cb720e0000000, 0xbe48837cb757e1a1
	.quad 0x000cdf0b58000000, 0xbe4511e031dd83b5
	.quad 0x000d072d48000000, 0x3e403c4bdc687918
	.quad 0x000d2f8708000000, 0x3deb13e315bc2473
	.quad 0x000d5818e0000000, 0xbe4822dbc6d12fd3
	.quad 0x000d80e318000000, 0xbe3367c68447b063
	.quad 0x000da9e600000000, 0x3e4ed9942b84600d
	.quad 0x000dd321f0000000, 0x3e480da3025b4aef
	.quad 0x000dfc9730000000, 0x3e4bdcdaf5cb4656
	.quad 0x000e264618000000, 0xbe4852f6baf6c4f0
	.quad 0x000e502ee8000000, 0xbe1d30027630bb40
	.quad 0x000e7a51f8000000, 0x3e4e3a641a5aa459
	.quad 0x000ea4afa0000000, 0x3e452486cc2c7b9d
	.quad 0x000ecf4830000000, 0xbe438cc07b927e77
	.quad 0x000efa1bf0000000, 0xbe39ea5d888e02de
	.quad 0x000f252b38000000, 0xbe2288ad162f2d20
	.quad 0x000f507658000000, 0x3e4b722a033a7c26
	.quad 0x000f7bfdb0000000, 0xbe431a0f63b7625a
	.quad 0x000fa7c180000000, 0x3e39e90d82e90a7e
	.quad 0x000fd3c228000000, 0x3e4c7b8f884badd2
	/* poly_coeff[4] */
	.align 32
	.quad 0x3f81111168877F38, 0x3f81111168877F38, 0x3f81111168877F38, 0x3f81111168877F38 /* coeff5 */
	.quad 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3, 0x3fa55555C2A9C0F3 /* coeff4 */
	.quad 0x3fc555555555541D, 0x3fc555555555541D, 0x3fc555555555541D, 0x3fc555555555541D /* coeff3 */
	.quad 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C, 0x3fdFFFFFFFFFFE5C /* coeff2 */
	/* Log2e */
	.align 32
	.quad 0x40671547652B82FE, 0x40671547652B82FE, 0x40671547652B82FE, 0x40671547652B82FE
	/* L2H */
	.align 32
	.quad 0x3f762e42fef80000, 0x3f762e42fef80000, 0x3f762e42fef80000, 0x3f762e42fef80000
	/* L2L */
	.align 32
	.quad 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4, 0x3d41cf79abc9e3b4
	/* ExpAddConst */
	.align 32
	.quad 0x42f80000001ff800, 0x42f80000001ff800, 0x42f80000001ff800, 0x42f80000001ff800
	/* IndexMask */
	.align 32
	.quad 0x00000000000007f0, 0x00000000000007f0, 0x00000000000007f0, 0x00000000000007f0
	/* ExpMask */
	.align 32
	.quad 0x00000000003ff800, 0x00000000003ff800, 0x00000000003ff800, 0x00000000003ff800
	/* MOne */
	.align 32
	.quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
	/* AbsMask */
	.align 32
	.quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* Threshold */
	.align 32
	.quad 0x40861DA04CBAFE43, 0x40861DA04CBAFE43, 0x40861DA04CBAFE43, 0x40861DA04CBAFE43
	/* L2 */
	.align 32
	.quad 0x3f762e42fefa39ef, 0x3f762e42fefa39ef, 0x3f762e42fefa39ef, 0x3f762e42fefa39ef
	.align 32
	.type __svml_dexpm1_data_internal, @object
	.size __svml_dexpm1_data_internal, .-__svml_dexpm1_data_internal
