/* Function sincosf vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_trig_data.h"
#include "svml_s_wrapper_impl.h"

/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to the [-Pi/4; +Pi/4] interval
      a) Grab the sign of the source argument and save it.
      b) Remove the sign with an AND operation.
      c) Get the octant Y by multiplying by 2/Pi.
      d) Add the "Right Shifter" value.
      e) Treat the obtained value as an integer S for destination sign
         setting:
         SS = ((S - (S & 1)) & 2) << 30; for the sin part
         SC = ((S + (S & 1)) & 2) << 30; for the cos part
      f) Change the destination sign if the source sign is negative,
         using an XOR operation.
      g) Subtract the "Right Shifter" (0x4B000000) value.
      h) Subtract Y*(Pi/2) from the X argument, where Pi/2 is split into
         4 parts:
         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4.
   2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4] interval)
      a) Calculate X^2 = X * X.
      b) Calculate the two polynomials for sin and cos:
         RS = X * (A0 + X^2 * (A1 + X^2 * (A2 + X^2 * A3)));
         RC = B0 + X^2 * (B1 + X^2 * (B2 + X^2 * (B3 + X^2 * B4))).
      c) Swap RS and RC if the first bit of the value obtained after the
         Right Shift is set to 1, using AND, ANDNOT and OR operations.
   3) Destination sign setting
      a) Set the shifted destination sign with an XOR operation:
         R1 = XOR( RS, SS );
         R2 = XOR( RC, SC ).  */
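
/* For reference, below is a minimal scalar C model of the scheme described
   above, following the comment step by step.  It is a sketch, not the
   kernel: sincosf_model and its constants are illustrative stand-ins (the
   coefficients are plain Taylor terms rather than the tuned minimax
   constants of __svml_s_trig_data), the multi-part Y*(Pi/2) subtraction is
   collapsed into a single double-precision step, and the large-argument
   fallback (the scalar sinf/cosf calls in the code below) is not modeled.

   #include <math.h>
   #include <stdint.h>
   #include <string.h>

   static void
   sincosf_model (float x, float *sp, float *cp)
   {
     uint32_t sign_x, s, us, uc;
     memcpy (&sign_x, &x, 4);
     sign_x &= 0x80000000u;                    // a) save the sign
     float ax = fabsf (x);                     // b) remove it
     float y = ax * 0x1.45f306p-1f + 0x1p23f;  // c)+d) 2/Pi, right shifter
     memcpy (&s, &y, 4);                       // e) octant in the low bits
     uint32_t ss = ((s - (s & 1)) & 2) << 30;  //    sign word for sin
     uint32_t sc = ((s + (s & 1)) & 2) << 30;  //    sign word for cos
     float n = y - 0x1p23f;                    // g) octant as a float
     float r = ax - n * 1.5707963267948966;    // h) reduce to [-Pi/4, Pi/4]
     float r2 = r * r;                         // 2a)
     float rs = r * (1.0f + r2 * (-1.0f/6     // 2b) sin polynomial
                + r2 * (1.0f/120 + r2 * (-1.0f/5040))));
     float rc = 1.0f + r2 * (-1.0f/2          //     cos polynomial
                + r2 * (1.0f/24 + r2 * (-1.0f/720 + r2 * (1.0f/40320))));
     if (s & 1)                                // 2c) odd octant: swap
       {
         float t = rs;
         rs = rc;
         rc = t;
       }
     memcpy (&us, &rs, 4);
     memcpy (&uc, &rc, 4);
     us ^= ss ^ sign_x;                        // 3a)+f) sin is odd in x
     uc ^= sc;                                 //        cos is even
     memcpy (sp, &us, 4);
     memcpy (cp, &uc, 4);
   }
*/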

	.section .text.evex512, "ax", @progbits
ENTRY (_ZGVeN16vl4l4_sincosf_knl)
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$1344, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	vmovaps	%zmm0, %zmm2
	movl	$-1, %edx
	vmovups	__sAbsMask(%rax), %zmm0
	vmovups	__sInvPI(%rax), %zmm3

/* Absolute argument computation */
	vpandd	%zmm0, %zmm2, %zmm1
	vmovups	__sPI1_FMA(%rax), %zmm5
	vmovups	__sSignMask(%rax), %zmm9
	vpandnd	%zmm2, %zmm0, %zmm0

/* h) Subtract Y*(Pi/2) from the X argument, where Pi/2 is split into
   3 parts: X = X - Y*PI1 - Y*PI2 - Y*PI3 */
	vmovaps	%zmm1, %zmm6
	vmovaps	%zmm1, %zmm8

/* c) Get the octant Y by multiplying by 2/Pi
   d) Add the "Right Shifter" value */
	vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3
	vmovups	__sPI3_FMA(%rax), %zmm7

/* g) Subtract the "Right Shifter" (0x4B000000) value */
	vsubps	__sRShifter(%rax), %zmm3, %zmm12

/* e) Treat the obtained value as an integer S for destination sign setting */
	vpslld	$31, %zmm3, %zmm13
	vmovups	__sA7_FMA(%rax), %zmm14
	vfnmadd231ps %zmm12, %zmm5, %zmm6

/* 2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4] interval)
   a) Calculate X^2 = X * X
   b) Calculate the two polynomials for sin and cos:
      RS = X * (A0 + X^2 * (A1 + X^2 * (A2 + X^2 * A3)));
      RC = B0 + X^2 * (B1 + X^2 * (B2 + X^2 * (B3 + X^2 * B4))) */
	vmovaps	%zmm14, %zmm15
	vmovups	__sA9_FMA(%rax), %zmm3
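
/* Special-argument detection: vcmpps predicate 22 (NLE_UQ) flags lanes
   whose |x| is not less-than-or-equal to __sRangeReductionVal, i.e. too
   large for the FMA-based reduction (or NaN).  The zero-masked broadcast
   of -1 under %k1 plus the vptestmd/kmovw pair turn %k1 into a 16-bit
   lane mask in %ecx, tested after the fast path.  */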
	vcmpps	$22, __sRangeReductionVal(%rax), %zmm1, %k1
	vpbroadcastd %edx, %zmm1{%k1}{z}
	vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6
	vptestmd %zmm1, %zmm1, %k0
	vpandd	%zmm6, %zmm9, %zmm11
	kmovw	%k0, %ecx
	vpxord	__sOneHalf(%rax), %zmm11, %zmm4

/* Result sign calculations: the ternary-logic immediate 150 (0x96) is a
   three-way XOR, which folds the sin sign word, __sSignMask and the sign
   of the reduced argument into the cos sign word.  */
	vpternlogd $150, %zmm13, %zmm9, %zmm11

/* Add the correction term 0.5 to the octant for the cos() part */
	vaddps	%zmm4, %zmm12, %zmm10
	vfnmadd213ps %zmm6, %zmm7, %zmm12
	vfnmadd231ps %zmm10, %zmm5, %zmm8
	vpxord	%zmm13, %zmm12, %zmm13
	vmulps	%zmm13, %zmm13, %zmm12
	vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8
	vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15
	vfnmadd213ps %zmm8, %zmm7, %zmm10
	vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15
	vpxord	%zmm11, %zmm10, %zmm5
	vmulps	%zmm5, %zmm5, %zmm4
	vfmadd213ps __sA3(%rax), %zmm12, %zmm15
	vfmadd213ps %zmm14, %zmm4, %zmm3
	vmulps	%zmm12, %zmm15, %zmm14
	vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3
	vfmadd213ps %zmm13, %zmm13, %zmm14
	vfmadd213ps __sA3(%rax), %zmm4, %zmm3
	vpxord	%zmm0, %zmm14, %zmm0
	vmulps	%zmm4, %zmm3, %zmm3
	vfmadd213ps %zmm5, %zmm5, %zmm3
	testl	%ecx, %ecx
	jne	.LBL_1_3

.LBL_1_2:
	cfi_remember_state
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm3, (%rsi)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
	vmovups	%zmm2, 1152(%rsp)
	vmovups	%zmm0, 1216(%rsp)
	vmovups	%zmm3, 1280(%rsp)
	je	.LBL_1_2

	xorb	%dl, %dl
	kmovw	%k4, 1048(%rsp)
	xorl	%eax, %eax
	kmovw	%k5, 1040(%rsp)
	kmovw	%k6, 1032(%rsp)
	kmovw	%k7, 1024(%rsp)
	vmovups	%zmm16, 960(%rsp)
	vmovups	%zmm17, 896(%rsp)
	vmovups	%zmm18, 832(%rsp)
	vmovups	%zmm19, 768(%rsp)
	vmovups	%zmm20, 704(%rsp)
	vmovups	%zmm21, 640(%rsp)
	vmovups	%zmm22, 576(%rsp)
	vmovups	%zmm23, 512(%rsp)
	vmovups	%zmm24, 448(%rsp)
	vmovups	%zmm25, 384(%rsp)
	vmovups	%zmm26, 320(%rsp)
	vmovups	%zmm27, 256(%rsp)
	vmovups	%zmm28, 192(%rsp)
	vmovups	%zmm29, 128(%rsp)
	vmovups	%zmm30, 64(%rsp)
	vmovups	%zmm31, (%rsp)
	movq	%rsi, 1056(%rsp)
	movq	%r12, 1096(%rsp)
	cfi_offset_rel_rsp (12, 1096)
	movb	%dl, %r12b
	movq	%r13, 1088(%rsp)
	cfi_offset_rel_rsp (13, 1088)
	movl	%eax, %r13d
	movq	%r14, 1080(%rsp)
	cfi_offset_rel_rsp (14, 1080)
	movl	%ecx, %r14d
	movq	%r15, 1072(%rsp)
	cfi_offset_rel_rsp (15, 1072)
	movq	%rbx, 1064(%rsp)
	movq	%rdi, %rbx
	cfi_remember_state

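/* Scalar fallback: walk the special-lane mask in %r14d two lanes per
   iteration (bit %r13d selects the even lane of a pair, bit %r13d+1 the
   odd one) and recompute the flagged elements with scalar sinf/cosf.  */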
.LBL_1_6:
	btl	%r13d, %r14d
	jc	.LBL_1_13

.LBL_1_7:
	lea	1(%r13), %esi
	btl	%esi, %r14d
	jc	.LBL_1_10

.LBL_1_8:
	addb	$1, %r12b
	addl	$2, %r13d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	movq	%rbx, %rdi
	kmovw	1048(%rsp), %k4
	movq	1056(%rsp), %rsi
	kmovw	1040(%rsp), %k5
	movq	1096(%rsp), %r12
	cfi_restore (%r12)
	kmovw	1032(%rsp), %k6
	movq	1088(%rsp), %r13
	cfi_restore (%r13)
	kmovw	1024(%rsp), %k7
	vmovups	960(%rsp), %zmm16
	vmovups	896(%rsp), %zmm17
	vmovups	832(%rsp), %zmm18
	vmovups	768(%rsp), %zmm19
	vmovups	704(%rsp), %zmm20
	vmovups	640(%rsp), %zmm21
	vmovups	576(%rsp), %zmm22
	vmovups	512(%rsp), %zmm23
	vmovups	448(%rsp), %zmm24
	vmovups	384(%rsp), %zmm25
	vmovups	320(%rsp), %zmm26
	vmovups	256(%rsp), %zmm27
	vmovups	192(%rsp), %zmm28
	vmovups	128(%rsp), %zmm29
	vmovups	64(%rsp), %zmm30
	vmovups	(%rsp), %zmm31
	movq	1080(%rsp), %r14
	cfi_restore (%r14)
	movq	1072(%rsp), %r15
	cfi_restore (%r15)
	movq	1064(%rsp), %rbx
	vmovups	1216(%rsp), %zmm0
	vmovups	1280(%rsp), %zmm3
	jmp	.LBL_1_2

.LBL_1_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	1156(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 1220(%rsp,%r15,8)
	vmovss	1156(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 1284(%rsp,%r15,8)
	jmp	.LBL_1_8

.LBL_1_13:
	movzbl	%r12b, %r15d
	vmovss	1152(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 1216(%rsp,%r15,8)
	vmovss	1152(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 1280(%rsp,%r15,8)
	jmp	.LBL_1_7
END (_ZGVeN16vl4l4_sincosf_knl)
libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl)

ENTRY (_ZGVeN16vl4l4_sincosf_skx)
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$1344, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	vmovaps	%zmm0, %zmm4
	vmovups	__sAbsMask(%rax), %zmm3
	vmovups	__sInvPI(%rax), %zmm5
	vmovups	__sRShifter(%rax), %zmm6
	vmovups	__sPI1_FMA(%rax), %zmm9
	vmovups	__sPI2_FMA(%rax), %zmm10
	vmovups	__sSignMask(%rax), %zmm14
	vmovups	__sOneHalf(%rax), %zmm7
	vmovups	__sPI3_FMA(%rax), %zmm12

/* Absolute argument computation */
	vandps	%zmm3, %zmm4, %zmm2

/* c) Get the octant Y by multiplying by 2/Pi
   d) Add the "Right Shifter" value */
	vfmadd213ps %zmm6, %zmm2, %zmm5
	vcmpps	$18, __sRangeReductionVal(%rax), %zmm2, %k1

/* e) Treat the obtained value as an integer S for destination sign setting */
	vpslld	$31, %zmm5, %zmm0

/* g) Subtract the "Right Shifter" (0x4B000000) value */
	vsubps	%zmm6, %zmm5, %zmm5
	vmovups	__sA3(%rax), %zmm6

/* h) Subtract Y*(Pi/2) from the X argument, where Pi/2 is split into
   3 parts: X = X - Y*PI1 - Y*PI2 - Y*PI3 */
	vmovaps	%zmm2, %zmm11
	vfnmadd231ps %zmm5, %zmm9, %zmm11
	vfnmadd231ps %zmm5, %zmm10, %zmm11
	vandps	%zmm11, %zmm14, %zmm1
	vxorps	%zmm1, %zmm7, %zmm8

/* Result sign calculations: as above, the ternary-logic immediate 150
   (0x96) is a three-way XOR forming the cos sign word.  */
	vpternlogd $150, %zmm0, %zmm14, %zmm1
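
/* A ternary-logic truth table of 0xff is the all-ones idiom: materialize
   ~0 in %zmm14 for the special-lane mask built below.  */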
	vpternlogd $0xff, %zmm14, %zmm14, %zmm14

/* Add the correction term 0.5 to the octant for the cos() part */
	vaddps	%zmm8, %zmm5, %zmm15
	vfnmadd213ps %zmm11, %zmm12, %zmm5
	vandnps	%zmm4, %zmm3, %zmm11
	vmovups	__sA7_FMA(%rax), %zmm3
	vmovaps	%zmm2, %zmm13
	vfnmadd231ps %zmm15, %zmm9, %zmm13
	vxorps	%zmm0, %zmm5, %zmm9
	vmovups	__sA5_FMA(%rax), %zmm0
	vfnmadd231ps %zmm15, %zmm10, %zmm13
	vmulps	%zmm9, %zmm9, %zmm8
	vfnmadd213ps %zmm13, %zmm12, %zmm15
	vmovups	__sA9_FMA(%rax), %zmm12
	vxorps	%zmm1, %zmm15, %zmm1
	vmulps	%zmm1, %zmm1, %zmm13

/* 2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4] interval)
   a) Calculate X^2 = X * X
   b) Calculate the two polynomials for sin and cos:
      RS = X * (A0 + X^2 * (A1 + X^2 * (A2 + X^2 * A3)));
      RC = B0 + X^2 * (B1 + X^2 * (B2 + X^2 * (B3 + X^2 * B4))) */
	vmovaps	%zmm12, %zmm7
	vfmadd213ps %zmm3, %zmm8, %zmm7
	vfmadd213ps %zmm3, %zmm13, %zmm12
	vfmadd213ps %zmm0, %zmm8, %zmm7
	vfmadd213ps %zmm0, %zmm13, %zmm12
	vfmadd213ps %zmm6, %zmm8, %zmm7
	vfmadd213ps %zmm6, %zmm13, %zmm12
	vmulps	%zmm8, %zmm7, %zmm10
	vmulps	%zmm13, %zmm12, %zmm3
	vfmadd213ps %zmm9, %zmm9, %zmm10
	vfmadd213ps %zmm1, %zmm1, %zmm3
	vxorps	%zmm11, %zmm10, %zmm0
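
/* %k1 holds the in-range lanes (vcmpps predicate 18, LE_OQ, against
   __sRangeReductionVal above); vpandnd of a register with itself zeroes
   exactly those lanes of the all-ones %zmm14, so the surviving lanes mark
   arguments that need the scalar fallback.  */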
	vpandnd	%zmm2, %zmm2, %zmm14{%k1}
	vptestmd %zmm14, %zmm14, %k0
	kmovw	%k0, %ecx
	testl	%ecx, %ecx
	jne	.LBL_2_3

.LBL_2_2:
	cfi_remember_state
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm3, (%rsi)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_2_3:
	cfi_restore_state
	vmovups	%zmm4, 1152(%rsp)
	vmovups	%zmm0, 1216(%rsp)
	vmovups	%zmm3, 1280(%rsp)
	je	.LBL_2_2

	xorb	%dl, %dl
	xorl	%eax, %eax
	kmovw	%k4, 1048(%rsp)
	kmovw	%k5, 1040(%rsp)
	kmovw	%k6, 1032(%rsp)
	kmovw	%k7, 1024(%rsp)
	vmovups	%zmm16, 960(%rsp)
	vmovups	%zmm17, 896(%rsp)
	vmovups	%zmm18, 832(%rsp)
	vmovups	%zmm19, 768(%rsp)
	vmovups	%zmm20, 704(%rsp)
	vmovups	%zmm21, 640(%rsp)
	vmovups	%zmm22, 576(%rsp)
	vmovups	%zmm23, 512(%rsp)
	vmovups	%zmm24, 448(%rsp)
	vmovups	%zmm25, 384(%rsp)
	vmovups	%zmm26, 320(%rsp)
	vmovups	%zmm27, 256(%rsp)
	vmovups	%zmm28, 192(%rsp)
	vmovups	%zmm29, 128(%rsp)
	vmovups	%zmm30, 64(%rsp)
	vmovups	%zmm31, (%rsp)
	movq	%rsi, 1056(%rsp)
	movq	%r12, 1096(%rsp)
	cfi_offset_rel_rsp (12, 1096)
	movb	%dl, %r12b
	movq	%r13, 1088(%rsp)
	cfi_offset_rel_rsp (13, 1088)
	movl	%eax, %r13d
	movq	%r14, 1080(%rsp)
	cfi_offset_rel_rsp (14, 1080)
	movl	%ecx, %r14d
	movq	%r15, 1072(%rsp)
	cfi_offset_rel_rsp (15, 1072)
	movq	%rbx, 1064(%rsp)
	movq	%rdi, %rbx
	cfi_remember_state

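/* Scalar fallback loop, same structure as in the KNL version above: two
   lanes of the %r14d mask are examined per iteration.  */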
.LBL_2_6:
	btl	%r13d, %r14d
	jc	.LBL_2_13

.LBL_2_7:
	lea	1(%r13), %esi
	btl	%esi, %r14d
	jc	.LBL_2_10

.LBL_2_8:
	incb	%r12b
	addl	$2, %r13d
	cmpb	$16, %r12b
	jb	.LBL_2_6

	kmovw	1048(%rsp), %k4
	movq	%rbx, %rdi
	kmovw	1040(%rsp), %k5
	kmovw	1032(%rsp), %k6
	kmovw	1024(%rsp), %k7
	vmovups	960(%rsp), %zmm16
	vmovups	896(%rsp), %zmm17
	vmovups	832(%rsp), %zmm18
	vmovups	768(%rsp), %zmm19
	vmovups	704(%rsp), %zmm20
	vmovups	640(%rsp), %zmm21
	vmovups	576(%rsp), %zmm22
	vmovups	512(%rsp), %zmm23
	vmovups	448(%rsp), %zmm24
	vmovups	384(%rsp), %zmm25
	vmovups	320(%rsp), %zmm26
	vmovups	256(%rsp), %zmm27
	vmovups	192(%rsp), %zmm28
	vmovups	128(%rsp), %zmm29
	vmovups	64(%rsp), %zmm30
	vmovups	(%rsp), %zmm31
	vmovups	1216(%rsp), %zmm0
	vmovups	1280(%rsp), %zmm3
	movq	1056(%rsp), %rsi
	movq	1096(%rsp), %r12
	cfi_restore (%r12)
	movq	1088(%rsp), %r13
	cfi_restore (%r13)
	movq	1080(%rsp), %r14
	cfi_restore (%r14)
	movq	1072(%rsp), %r15
	cfi_restore (%r15)
	movq	1064(%rsp), %rbx
	jmp	.LBL_2_2

.LBL_2_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vzeroupper
	vmovss	1156(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 1220(%rsp,%r15,8)
	vmovss	1156(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 1284(%rsp,%r15,8)
	jmp	.LBL_2_8

.LBL_2_13:
	movzbl	%r12b, %r15d
	vzeroupper
	vmovss	1152(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 1216(%rsp,%r15,8)
	vmovss	1152(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 1280(%rsp,%r15,8)
	jmp	.LBL_2_7
END (_ZGVeN16vl4l4_sincosf_skx)
libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)

/* Wrapper between vvv and vl4l4 vector variants.  */
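/* The vvv variant receives 16 floats in %zmm0 and 32 destination pointers,
   16 for the sin results and 16 for the cos results: in %zmm1-%zmm4 on
   LP64 and, as 32-bit pointers, in %zmm1-%zmm2 on x32.  The vl4l4 kernel
   instead writes two contiguous 16-float arrays through %rdi and %rsi, so
   the wrapper spills the pointer vectors, runs the kernel on stack
   buffers, then scatters each lane.  A C-level sketch of the marshalling
   (the names here are illustrative, not real symbols):

   void
   vvv_from_vl4l4 (const float x[16], float *sin_ptrs[16],
                   float *cos_ptrs[16])
   {
     float s[16], c[16];
     vl4l4_kernel (x, s, c);     // the \callee kernel, on stack buffers
     for (int i = 0; i < 16; i++)
       {
         *sin_ptrs[i] = s[i];    // scatter each lane to its destination
         *cos_ptrs[i] = c[i];
       }
   }
*/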
.macro WRAPPER_AVX512_vvv_vl4l4 callee
#ifndef __ILP32__
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$384, %rsp
	vmovups	%zmm1, 128(%rsp)
	lea	(%rsp), %rdi
	vmovups	%zmm2, 192(%rdi)
	vmovups	%zmm3, 256(%rdi)
	vmovups	%zmm4, 320(%rdi)
	lea	64(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	movq	128(%rsp), %rdx
	movq	136(%rsp), %rsi
	movq	144(%rsp), %r8
	movq	152(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	160(%rsp), %rax
	movq	168(%rsp), %rcx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	176(%rsp), %rdi
	movq	184(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	192(%rsp), %r11
	movq	200(%rsp), %rdx
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	208(%rsp), %rsi
	movq	216(%rsp), %r8
	movl	32(%rsp), %r10d
	movl	36(%rsp), %eax
	movl	40(%rsp), %ecx
	movl	44(%rsp), %edi
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	224(%rsp), %r10
	movq	232(%rsp), %rax
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movq	240(%rsp), %rcx
	movq	248(%rsp), %rdi
	movl	48(%rsp), %r9d
	movl	52(%rsp), %r11d
	movl	56(%rsp), %edx
	movl	60(%rsp), %esi
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movq	256(%rsp), %r9
	movq	264(%rsp), %r11
	movl	%edx, (%rcx)
	movl	%esi, (%rdi)
	movq	272(%rsp), %rdx
	movq	280(%rsp), %rsi
	movl	64(%rsp), %r8d
	movl	68(%rsp), %r10d
	movl	72(%rsp), %eax
	movl	76(%rsp), %ecx
	movl	%r8d, (%r9)
	movl	%r10d, (%r11)
	movq	288(%rsp), %r8
	movq	296(%rsp), %r10
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	304(%rsp), %rax
	movq	312(%rsp), %rcx
	movl	80(%rsp), %edi
	movl	84(%rsp), %r9d
	movl	88(%rsp), %r11d
	movl	92(%rsp), %edx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	320(%rsp), %rdi
	movq	328(%rsp), %r9
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	336(%rsp), %r11
	movq	344(%rsp), %rdx
	movl	96(%rsp), %esi
	movl	100(%rsp), %r8d
	movl	104(%rsp), %r10d
	movl	108(%rsp), %eax
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	352(%rsp), %rsi
	movq	360(%rsp), %r8
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	368(%rsp), %r10
	movq	376(%rsp), %rax
	movl	112(%rsp), %ecx
	movl	116(%rsp), %edi
	movl	120(%rsp), %r9d
	movl	124(%rsp), %r11d
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
#else
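/* x32 (ILP32): pointers are 32 bits wide, so the 16 sin and 16 cos
   destination pointers fit in %zmm1 and %zmm2 respectively, and the
   scatter below walks them one lane at a time.  */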
	leal	8(%rsp), %r10d
	.cfi_def_cfa 10, 0
	andl	$-64, %esp
	pushq	-8(%r10d)
	pushq	%rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	movl	%esp, %ebp
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x78,0x6
	leal	-112(%rbp), %esi
	leal	-176(%rbp), %edi
	subl	$296, %esp
	vmovdqa64 %zmm1, -240(%ebp)
	vmovdqa64 %zmm2, -304(%ebp)
	call	HIDDEN_JUMPTARGET(\callee)
	movl	-240(%ebp), %eax
	vmovss	-176(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-236(%ebp), %eax
	vmovss	-172(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-232(%ebp), %eax
	vmovss	-168(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-228(%ebp), %eax
	vmovss	-164(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-224(%ebp), %eax
	vmovss	-160(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-220(%ebp), %eax
	vmovss	-156(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-216(%ebp), %eax
	vmovss	-152(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-212(%ebp), %eax
	vmovss	-148(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-208(%ebp), %eax
	vmovss	-144(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-204(%ebp), %eax
	vmovss	-140(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-200(%ebp), %eax
	vmovss	-136(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-196(%ebp), %eax
	vmovss	-132(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-192(%ebp), %eax
	vmovss	-128(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-188(%ebp), %eax
	vmovss	-124(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-184(%ebp), %eax
	vmovss	-120(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-180(%ebp), %eax
	vmovss	-116(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-304(%ebp), %eax
	vmovss	-112(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-300(%ebp), %eax
	vmovss	-108(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-296(%ebp), %eax
	vmovss	-104(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-292(%ebp), %eax
	vmovss	-100(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-288(%ebp), %eax
	vmovss	-96(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-284(%ebp), %eax
	vmovss	-92(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-280(%ebp), %eax
	vmovss	-88(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-276(%ebp), %eax
	vmovss	-84(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-272(%ebp), %eax
	vmovss	-80(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-268(%ebp), %eax
	vmovss	-76(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-264(%ebp), %eax
	vmovss	-72(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-260(%ebp), %eax
	vmovss	-68(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-256(%ebp), %eax
	vmovss	-64(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-252(%ebp), %eax
	vmovss	-60(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-248(%ebp), %eax
	vmovss	-56(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-244(%ebp), %eax
	vmovss	-52(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	addl	$296, %esp
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%rbp
	leal	-8(%r10), %esp
	.cfi_def_cfa 7, 8
	ret
#endif
.endm

ENTRY (_ZGVeN16vvv_sincosf_knl)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl
END (_ZGVeN16vvv_sincosf_knl)

ENTRY (_ZGVeN16vvv_sincosf_skx)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)
