/* Function sincos vectorized with AVX2.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"

        .section .text.avx2, "ax", @progbits
ENTRY (_ZGVdN4vl8l8_sincos_avx2)
/*
   ALGORITHM DESCRIPTION:

   ( low accuracy ( < 4ulp ) or enhanced performance
     ( half of correct mantissa ) implementation )

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   arg + Pi/2 = (N'*Pi + R')
   cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
   sin(R), sin(R') are approximated by corresponding polynomial.  */
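/* For orientation, a scalar C sketch of the sin path follows.  It is
   illustrative only: INVPI, RSHIFTER, PI1..PI3 and C1..C7 are assumed
   stand-ins for the __dInvPI, __dRShifter, __dPI?_FMA and __dC? table
   entries used below; the cos path repeats the same steps with
   N' = N +/- 0.5 (in units of Pi).

     double ax = fabs (x);                    // X' = |X|
     double y  = fma (ax, INVPI, RSHIFTER);   // SinY: shifter add
     double n  = y - RSHIFTER;                // SinN: nearest int to ax/Pi
     double r  = fma (-n, PI1, ax);           // Cody-Waite reduction,
     r = fma (-n, PI2, r);                    // Pi split across three
     r = fma (-n, PI3, r);                    // FMA steps
     double r2 = r * r;
     double p  = fma (r2, C7, C6);            // Horner on r^2
     p = fma (r2, p, C5);
     p = fma (r2, p, C4);
     p = fma (r2, p, C3);
     p = fma (r2, p, C2);
     p = fma (r2, p, C1);
     double s = fma (r2 * p, r, r);           // sin(r) ~ r + r*r2*p

   The result sign, (-1)^N xor the sign of the original argument, is
   applied afterwards with a vxorpd.  */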

        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovups   %ymm14, 288(%rsp)
        vmovups   %ymm8, 352(%rsp)
        vmovupd   __dSignMask(%rax), %ymm6
        vmovupd   __dInvPI(%rax), %ymm2
        vmovupd   __dPI1_FMA(%rax), %ymm5
        vmovups   %ymm9, 224(%rsp)

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vandnpd   %ymm0, %ymm6, %ymm1

/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %ymm1, %ymm2

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq    $63, %ymm2, %ymm4

/* SinN = Y - RS : right shifter sub */
        vsubpd    __dRShifter(%rax), %ymm2, %ymm2
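/* Note: __dRShifter is the classic "right shifter" constant
   (1.5*2^52, an assumption based on the usual SVML trig tables).
   Adding it pushes the fraction of X'*InvPi out of the mantissa, so
   the rounded integer N is recovered by subtracting it again, and the
   low mantissa bit of the sum, equal to N mod 2, is what vpsllq moved
   into the sign position above to encode (-1)^N.  */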

/* SinR = X' - SinN*Pi1 */
        vmovdqa   %ymm1, %ymm14
        vfnmadd231pd %ymm2, %ymm5, %ymm14

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %ymm2, %ymm14

/* Sine result sign: SinRSign = SignMask & SinR */
        vandpd    %ymm14, %ymm6, %ymm7

/* OR SinRSign into 0.5: +/-0.5 carrying the sign of SinR */
        vorpd     __dOneHalf(%rax), %ymm7, %ymm3

/* CosN = SinN +(-)0.5 */
        vaddpd    %ymm3, %ymm2, %ymm3

/* CosR = X' - CosN*Pi1 */
        vmovdqa   %ymm1, %ymm8
        vfnmadd231pd %ymm3, %ymm5, %ymm8
        vmovupd   __dPI3_FMA(%rax), %ymm5
        vcmpnle_uqpd __dRangeVal(%rax), %ymm1, %ymm1

/* CosR = CosR - CosN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %ymm3, %ymm8

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %ymm14, %ymm5, %ymm2

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %ymm8, %ymm5, %ymm3
        vmovupd   __dC6(%rax), %ymm8

/* SinR2 = SinR^2 */
        vmulpd    %ymm2, %ymm2, %ymm14

/* CosR2 = CosR^2 */
        vmulpd    %ymm3, %ymm3, %ymm5

/* Grab SignX */
        vandpd    %ymm0, %ymm6, %ymm9

/* Update CosRSign and CosSignRes signs */
        vxorpd    %ymm6, %ymm7, %ymm6
        vxorpd    %ymm6, %ymm4, %ymm7

/* Update sign SinSignRes */
        vxorpd    %ymm9, %ymm4, %ymm6
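/* At this point ymm6 = SinSignRes ^ SignX (the final sin sign: the
   reduction used |X|, and sin is odd, so the argument's sign is
   folded back in) and ymm7 = SinSignRes ^ SinRSign ^ SignMask (the
   final cos sign, adjusted for the +/-0.5 half-period shift).  */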

/* Polynomial approximation */
        vmovupd   __dC7(%rax), %ymm4
        vmovdqa   %ymm8, %ymm9
        vfmadd231pd __dC7(%rax), %ymm14, %ymm9
        vfmadd213pd %ymm8, %ymm5, %ymm4
        vfmadd213pd __dC5(%rax), %ymm14, %ymm9
        vfmadd213pd __dC5(%rax), %ymm5, %ymm4
        vfmadd213pd __dC4(%rax), %ymm14, %ymm9
        vfmadd213pd __dC4(%rax), %ymm5, %ymm4

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd __dC3(%rax), %ymm14, %ymm9

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd __dC3(%rax), %ymm5, %ymm4

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd __dC2(%rax), %ymm14, %ymm9

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd __dC2(%rax), %ymm5, %ymm4

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd __dC1(%rax), %ymm14, %ymm9

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd __dC1(%rax), %ymm5, %ymm4

/* SinPoly = SinR2*SinPoly */
        vmulpd    %ymm14, %ymm9, %ymm8

/* CosPoly = CosR2*CosPoly */
        vmulpd    %ymm5, %ymm4, %ymm4

/* SinPoly = SinR + SinR*SinPoly */
        vfmadd213pd %ymm2, %ymm2, %ymm8

/* CosPoly = CosR + CosR*CosPoly */
        vfmadd213pd %ymm3, %ymm3, %ymm4
        vmovmskpd %ymm1, %ecx

/* Final reconstruction:
   update Sin result's sign */
        vxorpd    %ymm6, %ymm8, %ymm3

/* Update Cos result's sign */
        vxorpd    %ymm7, %ymm4, %ymm2
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovups   352(%rsp), %ymm8
        vmovups   224(%rsp), %ymm9
        vmovups   288(%rsp), %ymm14
        vmovupd   %ymm2, (%rsi)
        vmovdqa   %ymm3, (%rdi)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

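/* Special-case path: ecx holds the vmovmskpd lane mask of the
   |X| > RangeVal compare.  The inputs and the vector results are
   spilled to the stack, and every flagged lane is recomputed below
   with the scalar sin and cos, which handle large and non-finite
   arguments.  */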
.LBL_1_3:
        cfi_restore_state
        vmovupd   %ymm0, 256(%rsp)
        vmovupd   %ymm3, 320(%rsp)
        vmovupd   %ymm2, 384(%rsp)
        je        .LBL_1_2

        xorb      %dl, %dl
        xorl      %eax, %eax
        vmovups   %ymm10, 128(%rsp)
        vmovups   %ymm11, 96(%rsp)
        vmovups   %ymm12, 64(%rsp)
        vmovups   %ymm13, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 160(%rsp)
        movq      %r12, 200(%rsp)
        cfi_offset_rel_rsp (12, 200)
        movb      %dl, %r12b
        movq      %r13, 192(%rsp)
        cfi_offset_rel_rsp (13, 192)
        movl      %eax, %r13d
        movq      %r14, 184(%rsp)
        cfi_offset_rel_rsp (14, 184)
        movl      %ecx, %r14d
        movq      %r15, 176(%rsp)
        cfi_offset_rel_rsp (15, 176)
        movq      %rbx, 168(%rsp)
        movq      %rdi, %rbx
        cfi_remember_state

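/* Scalar fixup loop: r14d is the lane mask, r13d the current bit
   index and r12b the iteration count.  Each pass tests two mask bits
   and redoes the corresponding even/odd element with scalar sin and
   cos before advancing by two bits.  */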
.LBL_1_6:
        btl       %r13d, %r14d
        jc        .LBL_1_13

.LBL_1_7:
        lea       1(%r13), %esi
        btl       %esi, %r14d
        jc        .LBL_1_10

.LBL_1_8:
        incb      %r12b
        addl      $2, %r13d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        vmovups   128(%rsp), %ymm10
        movq      %rbx, %rdi
        vmovups   96(%rsp), %ymm11
        vmovups   64(%rsp), %ymm12
        vmovups   32(%rsp), %ymm13
        vmovups   (%rsp), %ymm15
        vmovupd   320(%rsp), %ymm3
        vmovupd   384(%rsp), %ymm2
        movq      160(%rsp), %rsi
        movq      200(%rsp), %r12
        cfi_restore (%r12)
        movq      192(%rsp), %r13
        cfi_restore (%r13)
        movq      184(%rsp), %r14
        cfi_restore (%r14)
        movq      176(%rsp), %r15
        cfi_restore (%r15)
        movq      168(%rsp), %rbx
        jmp       .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    264(%rsp,%r15), %xmm0
        vzeroupper

        call      JUMPTARGET(sin)

        vmovsd    %xmm0, 328(%rsp,%r15)
        vmovsd    264(%rsp,%r15), %xmm0

        call      JUMPTARGET(cos)

        vmovsd    %xmm0, 392(%rsp,%r15)
        jmp       .LBL_1_8

.LBL_1_13:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    256(%rsp,%r15), %xmm0
        vzeroupper

        call      JUMPTARGET(sin)

        vmovsd    %xmm0, 320(%rsp,%r15)
        vmovsd    256(%rsp,%r15), %xmm0

        call      JUMPTARGET(cos)

        vmovsd    %xmm0, 384(%rsp,%r15)
        jmp       .LBL_1_7

END (_ZGVdN4vl8l8_sincos_avx2)
libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2)
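/* Illustrative only: this entry point is normally reached through the
   vector-math (libmvec) ABI rather than called by hand.  A loop such
   as the sketch below, compiled with something like
   -O2 -fopenmp-simd -ffast-math against glibc's libmvec, may be
   vectorized into calls of the sincos variants in this file (the
   exact lowering is an assumption, not a guarantee):

     #define _GNU_SOURCE
     #include <math.h>

     void
     many_sincos (const double *x, double *s, double *c, int n)
     {
       #pragma omp simd
       for (int i = 0; i < n; i++)
         sincos (x[i], &s[i], &c[i]);
     }
 */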

/* vvv version implemented with wrapper to vl8l8 variant.  */
ENTRY (_ZGVdN4vvv_sincos_avx2)
#ifndef __ILP32__
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-32, %rsp
        subq      $128, %rsp
        vmovdqu   %ymm1, 64(%rsp)
        lea       (%rsp), %rdi
        vmovdqu   %ymm2, 96(%rdi)
        lea       32(%rsp), %rsi
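/* %rdi and %rsi point at two 32-byte stack buffers that receive the
   packed sin and cos results; the two ymm registers holding the four
   sin and four cos destination pointers were saved at 64(%rsp) and
   96(%rsp) above and are reloaded below to scatter the eight doubles
   to their targets.  */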
        call      HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2)
        movq      64(%rsp), %rdx
        movq      96(%rsp), %rsi
        movq      72(%rsp), %r8
        movq      104(%rsp), %r10
        movq      (%rsp), %rax
        movq      32(%rsp), %rcx
        movq      8(%rsp), %rdi
        movq      40(%rsp), %r9
        movq      %rax, (%rdx)
        movq      %rcx, (%rsi)
        movq      80(%rsp), %rax
        movq      112(%rsp), %rcx
        movq      %rdi, (%r8)
        movq      %r9, (%r10)
        movq      88(%rsp), %rdi
        movq      120(%rsp), %r9
        movq      16(%rsp), %r11
        movq      48(%rsp), %rdx
        movq      24(%rsp), %rsi
        movq      56(%rsp), %r8
        movq      %r11, (%rax)
        movq      %rdx, (%rcx)
        movq      %rsi, (%rdi)
        movq      %r8, (%r9)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
        leal      8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl      $-32, %esp
        pushq     -8(%r10d)
        pushq     %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl      %esp, %ebp
        pushq     %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        leal      -48(%rbp), %esi
        leal      -80(%rbp), %edi
        subl      $104, %esp
        vmovaps   %xmm1, -96(%ebp)
        vmovaps   %xmm2, -112(%ebp)
        call      HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2)
        movl      -96(%ebp), %eax
        vmovsd    -80(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -92(%ebp), %eax
        vmovsd    -72(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -88(%ebp), %eax
        vmovsd    -64(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -84(%ebp), %eax
        vmovsd    -56(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -112(%ebp), %eax
        vmovsd    -48(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -108(%ebp), %eax
        vmovsd    -40(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -104(%ebp), %eax
        vmovsd    -32(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        movl      -100(%ebp), %eax
        vmovsd    -24(%ebp), %xmm0
        vmovsd    %xmm0, (%eax)
        addl      $104, %esp
        popq      %r10
        .cfi_def_cfa 10, 0
        popq      %rbp
        leal      -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
END (_ZGVdN4vvv_sincos_avx2)
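
/* For reference, the two variants correspond roughly to these C
   prototypes (a sketch only; the real declarations belong to the
   libmvec vector ABI, not to any public header).  vl8l8 stores the
   four sin and four cos results through two array pointers, while
   vvv carries a vector of four destination pointers for each:

     void _ZGVdN4vl8l8_sincos_avx2 (__m256d x, double *sin4, double *cos4);
     void _ZGVdN4vvv_sincos_avx2 (__m256d x, __m256i sinptrs, __m256i cosptrs);
 */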