/* Function sincos vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"
#include "svml_d_wrapper_impl.h"

/*
   ALGORITHM DESCRIPTION:

     ( low accuracy ( < 4ulp ) or enhanced performance
      ( half of correct mantissa ) implementation )

     Argument representation:
     arg = N*Pi + R

     Result calculation:
     sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
     arg + Pi/2 = (N'*Pi + R')
     cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
     sin(R), sin(R') are approximated by corresponding polynomial.  */
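
/* A scalar C sketch of the scheme above (illustrative only: the Pi
   split and the coefficients below are stand-ins for the tuned
   __dPI*_FMA and __dC1..__dC7 table constants; plain Taylor terms are
   used for the polynomial):

     #include <math.h>

     #define PI1 0x1.921fb54442d18p+1   // high part of Pi
     #define PI2 0x1.1a62633145c07p-53  // Pi - PI1
     #define C1 (-1.0 / 6.0)
     #define C2 (1.0 / 120.0)
     #define C3 (-1.0 / 5040.0)
     #define C4 (1.0 / 362880.0)
     #define C5 (-1.0 / 39916800.0)
     #define C6 (1.0 / 6227020800.0)
     #define C7 (-1.0 / 1307674368000.0)

     // sin(t) ~ t + t^3 * P(t^2), with P evaluated by Horner's rule,
     // exactly the shape of the FMA chains below.
     static double
     sin_poly (double t)
     {
       double t2 = t * t;
       double p = ((((((C7 * t2 + C6) * t2 + C5) * t2 + C4) * t2 + C3)
                    * t2 + C2) * t2 + C1);
       return t + t * t2 * p;
     }

     static void
     sincos_lane (double x, double *sinp, double *cosp)
     {
       double ax = fabs (x);

       // arg = N*Pi + R, |R| <= Pi/2.
       double n = nearbyint (ax / M_PI);
       double r = (ax - n * PI1) - n * PI2;

       // arg + Pi/2 = N'*Pi + R' with N' = N + copysign (0.5, R).
       double nc = n + copysign (0.5, r);
       double rc = (ax - nc * PI1) - nc * PI2;

       long ni = (long) n;
       double s = sin_poly (r);
       double c = sin_poly (rc);

       // sin is odd, so fold the sign of x back in; the cosine sign
       // combines (-1)^N with the sign of R.
       *sinp = ((ni & 1) ? -s : s) * (signbit (x) ? -1.0 : 1.0);
       *cosp = ((ni & 1) ? -c : c) * (r >= 0.0 ? -1.0 : 1.0);
     }

   The asm below carries all of this sign logic in the IEEE sign bits
   instead of branching, and routes lanes whose |x| exceeds the
   __dRangeVal bound to a scalar fallback.  */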

        .section .text.evex512, "ax", @progbits
ENTRY (_ZGVeN8vl8l8_sincos_knl)
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovaps %zmm0, %zmm4
        movq $-1, %rdx
        vmovups __dSignMask(%rax), %zmm12
        vmovups __dInvPI(%rax), %zmm5

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vpandnq %zmm4, %zmm12, %zmm3
        vmovups __dPI1_FMA(%rax), %zmm7
        vmovups __dPI3_FMA(%rax), %zmm9

/* SinR = X' - SinN*Pi1 */
        vmovaps %zmm3, %zmm8

/* CosR = X' - CosN*Pi1 */
        vmovaps %zmm3, %zmm10

/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5
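
/* The right shifter __dRShifter is presumably the usual 1.5*2^52
   constant (an assumption; its actual value lives in the data table):
   adding it to X'*InvPi pushes all fraction bits out of the mantissa,
   so the low mantissa bits of Y hold round(X'/Pi) as an integer, e.g.
   2.7 + 1.5*2^52 keeps ...011 = 3 in the low bits.  Subtracting it
   back below recovers SinN as a double, and bit 0 of Y is N's parity,
   used for the (-1)^N result sign.  */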
        vmovups __dC6(%rax), %zmm13

/* SinN = Y - RS : right shifter sub */
        vsubpd __dRShifter(%rax), %zmm5, %zmm1
        vmovaps %zmm13, %zmm14

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq $63, %zmm5, %zmm2
        vcmppd $22, __dRangeVal(%rax), %zmm3, %k1
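
/* Predicate 22 is _CMP_NLE_UQ: k1 flags lanes where X' is greater
   than __dRangeVal (too large for this reduction) or is NaN; those
   lanes are redone by the scalar fallback below.  */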

/* Update CosRSign and CosSignRes signs */
        vmovaps %zmm12, %zmm5
        vfnmadd231pd %zmm1, %zmm7, %zmm8

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8

/* Sine result sign: SinRSign = SignMask & SinR */
        vpandq %zmm8, %zmm12, %zmm11

/* Attach SinRSign to 0.5: the +-0.5 offset for CosN */
        vporq __dOneHalf(%rax), %zmm11, %zmm6
        vpternlogq $150, %zmm2, %zmm11, %zmm5

/* Update sign SinSignRes */
        vpternlogq $120, %zmm4, %zmm12, %zmm2
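
/* vpternlogq encodes a three-input boolean function in its immediate:
   150 = 0x96 is A^B^C, giving zmm5 (CosSignRes) = SignMask ^ SinRSign
   ^ SinSignRes; 120 = 0x78 is A ^ (B & C), giving zmm2 (SinSignRes)
   ^= SignMask & X, i.e. the input's sign folded into the sine
   result's sign.  Only the sign bits of these values are ever used.  */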

/* Polynomial approximation */
        vmovups __dC7(%rax), %zmm11

/* CosN = SinN +(-)0.5 */
        vaddpd %zmm6, %zmm1, %zmm0
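
/* The cosine path reduces against N' = SinN + copysign(0.5, SinR), so
   CosR = X' - N'*Pi lands back in [-Pi/2, Pi/2]; the half-step's
   effect on the result sign is what the XOR of sign bits above
   accounts for.  */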

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %zmm8, %zmm9, %zmm1
        vfnmadd231pd %zmm0, %zmm7, %zmm10

/* SinR2 = SinR^2 */
        vmulpd %zmm1, %zmm1, %zmm15

/* Grab SignX
   CosR = CosR - CosN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10
        vfmadd231pd __dC7(%rax), %zmm15, %zmm14

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %zmm10, %zmm9, %zmm0
        vfmadd213pd __dC5(%rax), %zmm15, %zmm14

/* CosR2 = CosR^2 */
        vmulpd %zmm0, %zmm0, %zmm12
        vfmadd213pd __dC4(%rax), %zmm15, %zmm14
        vfmadd213pd %zmm13, %zmm12, %zmm11

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd __dC3(%rax), %zmm15, %zmm14
        vfmadd213pd __dC5(%rax), %zmm12, %zmm11

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd __dC2(%rax), %zmm15, %zmm14
        vfmadd213pd __dC4(%rax), %zmm12, %zmm11

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd __dC1(%rax), %zmm15, %zmm14

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd __dC3(%rax), %zmm12, %zmm11

/* SinPoly = SinR2*SinPoly */
        vmulpd %zmm15, %zmm14, %zmm13

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd __dC2(%rax), %zmm12, %zmm11

/* SinPoly = SinR + SinR*SinPoly */
        vfmadd213pd %zmm1, %zmm1, %zmm13
        vpbroadcastq %rdx, %zmm1{%k1}{z}

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd __dC1(%rax), %zmm12, %zmm11
        vptestmq %zmm1, %zmm1, %k0
        kmovw %k0, %ecx
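
/* Materialize the special-lane mask: k1 (from the range check) widens
   to all-ones lanes in zmm1, vptestmq compresses that back to one bit
   per lane, and %ecx receives the bitmask; a nonzero value means some
   lanes need the scalar fallback.  */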

/* CosPoly = CosR2*CosPoly */
        vmulpd %zmm12, %zmm11, %zmm14
        movzbl %cl, %ecx

/* CosPoly = CosR + CosR*CosPoly */
        vfmadd213pd %zmm0, %zmm0, %zmm14

/* Final reconstruction.
   Update Sin result's sign */
        vpxorq %zmm2, %zmm13, %zmm0

/* Update Cos result's sign */
        vpxorq %zmm5, %zmm14, %zmm2
        testl %ecx, %ecx
        jne .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovups %zmm0, (%rdi)
        vmovups %zmm2, (%rsi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups %zmm4, 1152(%rsp)
        vmovups %zmm0, 1216(%rsp)
        vmovups %zmm2, 1280(%rsp)
        je .LBL_1_2

        xorb %dl, %dl
        kmovw %k4, 1048(%rsp)
        xorl %eax, %eax
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %eax, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %ecx, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq %rbx, 1064(%rsp)
        movq %rdi, %rbx
        cfi_remember_state

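/* Per-lane fallback: %r14d holds the special-lane bitmask, %r12b the
   iteration count and %r13d the current lane index; each iteration
   tests two lanes and calls the scalar sin and cos for any lane that
   is flagged.  In C terms (a sketch; note the asm loops 16 times,
   testing mask bits beyond the 8 that can ever be set):

     for (int k = 0; k < 8; k += 2)
       {
         if (mask & (1 << k))        // .LBL_1_13: even lane, input at
           lane_sincos (k);          //   1152(%rsp), out 1216/1280
         if (mask & (1 << (k + 1)))  // .LBL_1_10: odd lane, same
           lane_sincos (k + 1);      //   offsets + 8
       }

   where lane_sincos (i) is a hypothetical helper that loads input
   lane i, calls the scalar sin and then cos, and stores both
   results.  */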
.LBL_1_6:
        btl %r13d, %r14d
        jc .LBL_1_13

.LBL_1_7:
        lea 1(%r13), %esi
        btl %esi, %r14d
        jc .LBL_1_10

.LBL_1_8:
        addb $1, %r12b
        addl $2, %r13d
        cmpb $16, %r12b
        jb .LBL_1_6

        movq %rbx, %rdi
        kmovw 1048(%rsp), %k4
        movq 1056(%rsp), %rsi
        kmovw 1040(%rsp), %k5
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        kmovw 1032(%rsp), %k6
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        movq 1064(%rsp), %rbx
        vmovups 1216(%rsp), %zmm0
        vmovups 1280(%rsp), %zmm2
        jmp .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1224(%rsp,%r15)
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1288(%rsp,%r15)
        jmp .LBL_1_8

.LBL_1_13:
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1216(%rsp,%r15)
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1280(%rsp,%r15)
        jmp .LBL_1_7

END (_ZGVeN8vl8l8_sincos_knl)
libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl)

ENTRY (_ZGVeN8vl8l8_sincos_skx)
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovaps %zmm0, %zmm8
        vmovups __dSignMask(%rax), %zmm4
        vmovups __dInvPI(%rax), %zmm9
        vmovups __dRShifter(%rax), %zmm10
        vmovups __dPI1_FMA(%rax), %zmm13
        vmovups __dPI2_FMA(%rax), %zmm14
        vmovups __dOneHalf(%rax), %zmm11
        vmovups __dPI3_FMA(%rax), %zmm2

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vandnpd %zmm8, %zmm4, %zmm7

/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd %zmm10, %zmm7, %zmm9
        vcmppd $18, __dRangeVal(%rax), %zmm7, %k1

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq $63, %zmm9, %zmm6

/* SinN = Y - RS : right shifter sub */
        vsubpd %zmm10, %zmm9, %zmm5
        vmovups __dC5(%rax), %zmm9
        vmovups __dC4(%rax), %zmm10

/* SinR = X' - SinN*Pi1 */
        vmovaps %zmm7, %zmm15
        vfnmadd231pd %zmm5, %zmm13, %zmm15

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd %zmm5, %zmm14, %zmm15

/* Sine result sign: SinRSign = SignMask & SinR */
        vandpd %zmm15, %zmm4, %zmm1

/* Attach SinRSign to 0.5: the +-0.5 offset for CosN */
        vorpd %zmm1, %zmm11, %zmm12
        vmovups __dC3(%rax), %zmm11

/* CosN = SinN +(-)0.5 */
        vaddpd %zmm12, %zmm5, %zmm3

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %zmm15, %zmm2, %zmm5
        vmovups __dC2(%rax), %zmm12

/* SinR2 = SinR^2 */
        vmulpd %zmm5, %zmm5, %zmm15

/* CosR = X' - CosN*Pi1 */
        vmovaps %zmm7, %zmm0
        vfnmadd231pd %zmm3, %zmm13, %zmm0
        vmovups __dC1(%rax), %zmm13

/* Grab SignX
   CosR = CosR - CosN*Pi2 */
        vfnmadd231pd %zmm3, %zmm14, %zmm0

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %zmm0, %zmm2, %zmm3

/* Polynomial approximation */
        vmovups __dC7(%rax), %zmm0

/* Update CosRSign and CosSignRes signs */
        vmovaps %zmm4, %zmm2
        vpternlogq $150, %zmm6, %zmm1, %zmm2

/* Update sign SinSignRes */
        vpternlogq $120, %zmm8, %zmm4, %zmm6

/* CosR2 = CosR^2 */
        vmulpd %zmm3, %zmm3, %zmm1
        vmovups __dC6(%rax), %zmm4
        vmovaps %zmm0, %zmm14
        vfmadd213pd %zmm4, %zmm1, %zmm0
        vfmadd213pd %zmm4, %zmm15, %zmm14
        vfmadd213pd %zmm9, %zmm1, %zmm0
        vfmadd213pd %zmm9, %zmm15, %zmm14
        vfmadd213pd %zmm10, %zmm1, %zmm0
        vfmadd213pd %zmm10, %zmm15, %zmm14
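
/* The cosine (zmm0) and sine (zmm14) Horner chains are interleaved,
   stepping C7 -> C6 -> C5 -> C4 in lock step, so the two FMA
   dependency chains can execute in parallel.  */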

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd %zmm11, %zmm1, %zmm0

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd %zmm11, %zmm15, %zmm14

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd %zmm12, %zmm1, %zmm0

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd %zmm12, %zmm15, %zmm14

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd %zmm13, %zmm1, %zmm0

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd %zmm13, %zmm15, %zmm14

/* CosPoly = CosR2*CosPoly */
        vmulpd %zmm1, %zmm0, %zmm1

/* SinPoly = SinR2*SinPoly */
        vmulpd %zmm15, %zmm14, %zmm4

/* CosPoly = CosR + CosR*CosPoly */
        vfmadd213pd %zmm3, %zmm3, %zmm1

/* SinPoly = SinR + SinR*SinPoly */
        vfmadd213pd %zmm5, %zmm5, %zmm4
        vpternlogd $0xff, %zmm3, %zmm3, %zmm3

/* Update Cos result's sign */
        vxorpd %zmm2, %zmm1, %zmm1

/* Final reconstruction.
   Update Sin result's sign */
        vxorpd %zmm6, %zmm4, %zmm0
        vpandnq %zmm7, %zmm7, %zmm3{%k1}
        vcmppd $3, %zmm3, %zmm3, %k0
        kmovw %k0, %ecx
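
/* SKX form of the special-lane check: k1 (predicate 18, _CMP_LE_OQ)
   marks the good lanes, vpternlogd 0xff set zmm3 to all-ones (a NaN
   bit pattern), vpandnq then zeroed zmm3 in the good lanes, and the
   unordered compare (predicate 3, _CMP_UNORD_Q) converts the
   surviving NaN lanes into a bitmask of lanes that need the scalar
   fallback.  */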
        testl %ecx, %ecx
        jne .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovups %zmm0, (%rdi)
        vmovups %zmm1, (%rsi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups %zmm8, 1152(%rsp)
        vmovups %zmm0, 1216(%rsp)
        vmovups %zmm1, 1280(%rsp)
        je .LBL_2_2

        xorb %dl, %dl
        xorl %eax, %eax
        kmovw %k4, 1048(%rsp)
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %eax, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %ecx, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq %rbx, 1064(%rsp)
        movq %rdi, %rbx
        cfi_remember_state

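/* Same per-lane fallback loop as in the KNL variant above; the only
   difference is the vzeroupper before each pair of scalar calls, to
   avoid AVX-SSE transition penalties in the callees.  */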
.LBL_2_6:
        btl %r13d, %r14d
        jc .LBL_2_13

.LBL_2_7:
        lea 1(%r13), %esi
        btl %esi, %r14d
        jc .LBL_2_10

.LBL_2_8:
        incb %r12b
        addl $2, %r13d
        cmpb $16, %r12b
        jb .LBL_2_6

        kmovw 1048(%rsp), %k4
        movq %rbx, %rdi
        kmovw 1040(%rsp), %k5
        kmovw 1032(%rsp), %k6
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        vmovups 1216(%rsp), %zmm0
        vmovups 1280(%rsp), %zmm1
        movq 1056(%rsp), %rsi
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        movq 1064(%rsp), %rbx
        jmp .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1160(%rsp,%r15), %xmm0
        vzeroupper
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1224(%rsp,%r15)
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1288(%rsp,%r15)
        jmp .LBL_2_8

.LBL_2_13:
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1152(%rsp,%r15), %xmm0
        vzeroupper
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1216(%rsp,%r15)
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1280(%rsp,%r15)
        jmp .LBL_2_7

END (_ZGVeN8vl8l8_sincos_skx)
libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)

/* Wrapper between vvv and vl8l8 vector variants.  */
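
/* What the non-x32 body below computes, as a C sketch (names are
   illustrative; the input vector stays in %zmm0, and the two pointer
   vectors arrive in %zmm1/%zmm2 and are spilled to 128(%rsp) and
   192(%rsp)):

     typedef void (*vl8l8_fn) (double *sinbuf, double *cosbuf);

     static void
     scatter_results (vl8l8_fn callee, double *sinptrs[8],
                      double *cosptrs[8])
     {
       double sinbuf[8], cosbuf[8];   // (%rsp) and 64(%rsp)
       callee (sinbuf, cosbuf);       // vl8l8 kernel, input in %zmm0
       for (int i = 0; i < 8; i++)
         {
           *sinptrs[i] = sinbuf[i];   // pointers from the saved %zmm1
           *cosptrs[i] = cosbuf[i];   // pointers from the saved %zmm2
         }
     }

   The long run of movq instructions below is just this scatter loop,
   unrolled and scheduled by hand.  */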
.macro WRAPPER_AVX512_vvv_vl8l8 callee
#ifndef __ILP32__
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $256, %rsp
        vmovups %zmm1, 128(%rsp)
        lea (%rsp), %rdi
        vmovups %zmm2, 192(%rdi)
        lea 64(%rsp), %rsi
        call HIDDEN_JUMPTARGET(\callee)
        movq 128(%rsp), %rdx
        movq 136(%rsp), %rsi
        movq 144(%rsp), %r8
        movq 152(%rsp), %r10
        movq (%rsp), %rax
        movq 8(%rsp), %rcx
        movq 16(%rsp), %rdi
        movq 24(%rsp), %r9
        movq %rax, (%rdx)
        movq %rcx, (%rsi)
        movq 160(%rsp), %rax
        movq 168(%rsp), %rcx
        movq %rdi, (%r8)
        movq %r9, (%r10)
        movq 176(%rsp), %rdi
        movq 184(%rsp), %r9
        movq 32(%rsp), %r11
        movq 40(%rsp), %rdx
        movq 48(%rsp), %rsi
        movq 56(%rsp), %r8
        movq %r11, (%rax)
        movq %rdx, (%rcx)
        movq 192(%rsp), %r11
        movq 200(%rsp), %rdx
        movq %rsi, (%rdi)
        movq %r8, (%r9)
        movq 208(%rsp), %rsi
        movq 216(%rsp), %r8
        movq 64(%rsp), %r10
        movq 72(%rsp), %rax
        movq 80(%rsp), %rcx
        movq 88(%rsp), %rdi
        movq %r10, (%r11)
        movq %rax, (%rdx)
        movq 224(%rsp), %r10
        movq 232(%rsp), %rax
        movq %rcx, (%rsi)
        movq %rdi, (%r8)
        movq 240(%rsp), %rcx
        movq 248(%rsp), %rdi
        movq 96(%rsp), %r9
        movq 104(%rsp), %r11
        movq 112(%rsp), %rdx
        movq 120(%rsp), %rsi
        movq %r9, (%r10)
        movq %r11, (%rax)
        movq %rdx, (%rcx)
        movq %rsi, (%rdi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
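/* On x32 (ILP32) pointers are 32-bit, so each 64-bit lane of
   %ymm1/%ymm2 packs two of them; the sequence below extracts a pair
   with vmovq, stores one double through the low pointer, then shifts
   the high pointer down with shrq $32.  */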
        leal 8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl $-64, %esp
        pushq -8(%r10d)
        pushq %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl %esp, %ebp
        pushq %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        leal -112(%rbp), %esi
        leal -176(%rbp), %edi
        subl $232, %esp
        vmovdqa %ymm1, -208(%ebp)
        vmovdqa %ymm2, -240(%ebp)
        call HIDDEN_JUMPTARGET(\callee)
        vmovdqa -208(%ebp), %xmm0
        vmovq %xmm0, %rax
        vmovsd -176(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -168(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -200(%ebp), %rax
        vmovsd -160(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -152(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -192(%ebp), %rax
        vmovsd -144(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -136(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -184(%ebp), %rax
        vmovsd -128(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -120(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        vmovdqa -240(%ebp), %xmm0
        vmovq %xmm0, %rax
        vmovsd -112(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -104(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -232(%ebp), %rax
        vmovsd -96(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -88(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -224(%ebp), %rax
        vmovsd -80(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -72(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -216(%ebp), %rax
        vmovsd -64(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -56(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        addl $232, %esp
        popq %r10
        .cfi_def_cfa 10, 0
        popq %rbp
        leal -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
.endm

ENTRY (_ZGVeN8vvv_sincos_knl)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl
END (_ZGVeN8vvv_sincos_knl)

ENTRY (_ZGVeN8vvv_sincos_skx)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)
