1 | /* Function sincos vectorized with SSE4. |
2 | Copyright (C) 2014-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include "svml_d_trig_data.h" |
21 | |
	.section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN2vl8l8_sincos_sse4)
/*
   Vector sincos, 2 doubles per call (SSE4 variant).

   SysV AMD64 vector ABI:
     xmm0 = two double arguments
     rdi  = pointer receiving the two sine results   (16 bytes)
     rsi  = pointer receiving the two cosine results (16 bytes)

   ALGORITHM DESCRIPTION:

   ( low accuracy ( < 4ulp ) or enhanced performance
   ( half of correct mantissa ) implementation )

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   arg + Pi/2 = (N'*Pi + R')
   cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
   sin(R), sin(R') are approximated by corresponding polynomial.  */

	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 64-byte-aligned 320-byte frame: xmm/gp spill slots plus
	   per-lane argument/result slots used by the scalar fallback.  */
	andq	$-64, %rsp
	subq	$320, %rsp
	/* rax = base of the libmvec trig constants table.  */
	movq	__svml_d_trig_data@GOTPCREL(%rip), %rax
	/* Spill xmm regs this kernel reuses (restored in .LBL_1_2).  */
	movups	%xmm11, 160(%rsp)
	movups	%xmm12, 144(%rsp)
	movups	__dSignMask(%rax), %xmm11

	/* ARGUMENT RANGE REDUCTION:
	   Absolute argument: X' = |X| */
	movaps	%xmm11, %xmm4

	/* Grab sign bit from argument */
	movaps	%xmm11, %xmm7
	movups	__dInvPI(%rax), %xmm5
	andnps	%xmm0, %xmm4		/* xmm4 = X' = |X|.  */

	/* SinY = X'*InvPi + RS : right shifter add */
	mulpd	%xmm4, %xmm5
	addpd	__dRShifter(%rax), %xmm5

	/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
	movaps	%xmm5, %xmm12
	andps	%xmm0, %xmm7		/* xmm7 = sign bits of the argument.  */

	/* SinN = Y - RS : right shifter sub */
	subpd	__dRShifter(%rax), %xmm5
	movups	%xmm10, 176(%rsp)
	psllq	$63, %xmm12
	movups	__dPI1(%rax), %xmm10

	/* Pi is split as Pi1+Pi2+Pi3+Pi4 for accurate reduction.
	   SinN*Pi1 (subtracted from X' below).  */
	movaps	%xmm10, %xmm1
	mulpd	%xmm5, %xmm1
	movups	__dPI2(%rax), %xmm6

	/* SinN*Pi2 for SinR = SinR - SinN*Pi2 (the original comment
	   said Pi1, but xmm6 holds __dPI2).  */
	movaps	%xmm6, %xmm2
	mulpd	%xmm5, %xmm2
	movups	%xmm13, 112(%rsp)
	movaps	%xmm4, %xmm13
	subpd	%xmm1, %xmm13		/* SinR = X' - SinN*Pi1.  */
	subpd	%xmm2, %xmm13		/* SinR = SinR - SinN*Pi2.  */

	/* Sine result sign: SinRSign = SignMask & SinR */
	movaps	%xmm11, %xmm2

	/* Seed CosR with X' (was commented "SinX"); the CosN*Pi
	   products are subtracted from it below.  */
	movaps	%xmm4, %xmm1
	movups	__dOneHalf(%rax), %xmm3
	andps	%xmm13, %xmm2		/* xmm2 = SinRSign.  */

	/* Give 0.5 the sign of SinRSign */
	orps	%xmm2, %xmm3

	/* Update CosRSign and CosSignRes signs */
	xorps	%xmm11, %xmm2

	/* CosN = SinN +(-)0.5 */
	addpd	%xmm5, %xmm3
	/* Flag lanes with |X| > RangeVal: they take the scalar path.  */
	cmpnlepd __dRangeVal(%rax), %xmm4
	mulpd	%xmm3, %xmm10		/* xmm10 = CosN*Pi1.  */

	/* CosN*Pi2 (subtracted below) */
	mulpd	%xmm3, %xmm6
	subpd	%xmm10, %xmm1		/* CosR = X' - CosN*Pi1.  */
	movmskpd %xmm4, %ecx		/* ecx = 2-bit special-lane mask.  */
	movups	__dPI3(%rax), %xmm10
	xorps	%xmm12, %xmm2
	subpd	%xmm6, %xmm1		/* CosR = CosR - CosN*Pi2.  */

	/* SinR = SinR - SinN*Pi3 */
	movaps	%xmm10, %xmm6

	/* Final reconstruction.
	   Combine Sin result's sign */
	xorps	%xmm7, %xmm12
	mulpd	%xmm5, %xmm6

	/* CosR = CosR - CosN*Pi3 */
	mulpd	%xmm3, %xmm10
	subpd	%xmm6, %xmm13
	subpd	%xmm10, %xmm1
	movups	__dPI4(%rax), %xmm6

	/* SinR = SinR - SinN*Pi4 */
	mulpd	%xmm6, %xmm5

	/* CosR = CosR - CosN*Pi4 */
	mulpd	%xmm6, %xmm3
	subpd	%xmm5, %xmm13
	subpd	%xmm3, %xmm1

	/* SinR2 = SinR^2 */
	movaps	%xmm13, %xmm6

	/* CosR2 = CosR^2 */
	movaps	%xmm1, %xmm10
	mulpd	%xmm13, %xmm6
	mulpd	%xmm1, %xmm10

	/* Polynomial approximation: degree-7 Horner evaluation in R2,
	   run in parallel for the sin (xmm3/xmm6) and cos (xmm5/xmm10)
	   channels.  */
	movups	__dC7(%rax), %xmm5
	movaps	%xmm5, %xmm3
	mulpd	%xmm6, %xmm3
	mulpd	%xmm10, %xmm5
	addpd	__dC6(%rax), %xmm3
	addpd	__dC6(%rax), %xmm5
	mulpd	%xmm6, %xmm3
	mulpd	%xmm10, %xmm5
	addpd	__dC5(%rax), %xmm3
	addpd	__dC5(%rax), %xmm5
	mulpd	%xmm6, %xmm3
	mulpd	%xmm10, %xmm5
	addpd	__dC4(%rax), %xmm3
	addpd	__dC4(%rax), %xmm5

	/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
	mulpd	%xmm6, %xmm3

	/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
	mulpd	%xmm10, %xmm5
	addpd	__dC3(%rax), %xmm3
	addpd	__dC3(%rax), %xmm5

	/* SinPoly = C2 + SinR2*SinPoly */
	mulpd	%xmm6, %xmm3

	/* CosPoly = C2 + CosR2*CosPoly */
	mulpd	%xmm10, %xmm5
	addpd	__dC2(%rax), %xmm3
	addpd	__dC2(%rax), %xmm5

	/* SinPoly = C1 + SinR2*SinPoly */
	mulpd	%xmm6, %xmm3

	/* CosPoly = C1 + CosR2*CosPoly */
	mulpd	%xmm10, %xmm5
	addpd	__dC1(%rax), %xmm3
	addpd	__dC1(%rax), %xmm5

	/* SinPoly = SinR2*SinPoly */
	mulpd	%xmm3, %xmm6

	/* CosPoly = CosR2*CosPoly */
	mulpd	%xmm5, %xmm10

	/* SinPoly = SinR*SinPoly */
	mulpd	%xmm13, %xmm6

	/* CosPoly = CosR*CosPoly */
	mulpd	%xmm1, %xmm10
	addpd	%xmm6, %xmm13		/* sin = SinR + SinR*SinPoly.  */
	addpd	%xmm10, %xmm1		/* cos = CosR + CosR*CosPoly.  */

	/* Update Sin result's sign */
	xorps	%xmm12, %xmm13

	/* Update Cos result's sign */
	xorps	%xmm2, %xmm1
	testl	%ecx, %ecx		/* Any lane flagged out-of-range?  */
	jne	.LBL_1_3

.LBL_1_2:
	/* Fast-path exit: restore spills, store both result vectors.  */
	cfi_remember_state
	movups	176(%rsp), %xmm10
	movaps	%xmm13, (%rdi)		/* Store sine results.  */
	movups	160(%rsp), %xmm11
	movups	144(%rsp), %xmm12
	movups	112(%rsp), %xmm13
	movups	%xmm1, (%rsi)		/* Store cosine results.  */
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	/* Slow path: spill the arguments and the fast-path results,
	   then redo the flagged lanes with scalar sin/cos.  */
	cfi_restore_state
	movups	%xmm0, 128(%rsp)	/* Saved args at 128(%rsp).  */
	movups	%xmm13, 192(%rsp)	/* Saved sin results at 192(%rsp).  */
	movups	%xmm1, 256(%rsp)	/* Saved cos results at 256(%rsp).  */
	/* NOTE(review): this branch is reached only via the jne above and
	   movups does not alter flags, so ZF is clear and this je is
	   never taken (dead instruction).  */
	je	.LBL_1_2

	xorb	%dl, %dl		/* Loop counter (goes to %r12b).  */
	xorl	%eax, %eax		/* Mask bit index (goes to %r13d).  */
	/* Spill remaining volatile xmm and callee-saved gp registers
	   around the libc sin/cos call-outs.  */
	movups	%xmm8, 48(%rsp)
	movups	%xmm9, 32(%rsp)
	movups	%xmm14, 16(%rsp)
	movups	%xmm15, (%rsp)
	movq	%rsi, 64(%rsp)		/* Preserve cos output pointer.  */
	movq	%r12, 104(%rsp)
	cfi_offset_rel_rsp (12, 104)
	movb	%dl, %r12b
	movq	%r13, 96(%rsp)
	cfi_offset_rel_rsp (13, 96)
	movl	%eax, %r13d
	movq	%r14, 88(%rsp)
	cfi_offset_rel_rsp (14, 88)
	movl	%ecx, %r14d		/* r14d = special-lane mask.  */
	movq	%r15, 80(%rsp)
	cfi_offset_rel_rsp (15, 80)
	movq	%rbx, 72(%rsp)
	movq	%rdi, %rbx		/* Preserve sin output pointer.  */
	cfi_remember_state

.LBL_1_6:
	/* Scan the mask two bits per iteration.  The loop bound of 16
	   mirrors the wider-vector variants; movmskpd only sets bits
	   0-1 here, so at most the first iteration does any work.  */
	btl	%r13d, %r14d
	jc	.LBL_1_13

.LBL_1_7:
	lea	1(%r13), %esi
	btl	%esi, %r14d
	jc	.LBL_1_10

.LBL_1_8:
	incb	%r12b
	addl	$2, %r13d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	/* Done: restore all spilled registers and the (patched)
	   result vectors, then rejoin the fast-path exit.  */
	movups	48(%rsp), %xmm8
	movq	%rbx, %rdi
	movups	32(%rsp), %xmm9
	movups	16(%rsp), %xmm14
	movups	(%rsp), %xmm15
	movq	64(%rsp), %rsi
	movq	104(%rsp), %r12
	cfi_restore (%r12)
	movq	96(%rsp), %r13
	cfi_restore (%r13)
	movq	88(%rsp), %r14
	cfi_restore (%r14)
	movq	80(%rsp), %r15
	cfi_restore (%r15)
	movq	72(%rsp), %rbx
	movups	192(%rsp), %xmm13
	movups	256(%rsp), %xmm1
	jmp	.LBL_1_2

.LBL_1_10:
	/* Odd lane of the pair: scalar sin/cos of the saved argument
	   (offset 136 = 128 + 8), results written over the saved
	   sin/cos slots (200 = 192 + 8, 264 = 256 + 8).  */
	cfi_restore_state
	movzbl	%r12b, %r15d
	shlq	$4, %r15		/* r15 = 16-byte pair offset.  */
	movsd	136(%rsp,%r15), %xmm0

	call	JUMPTARGET(sin)

	movsd	%xmm0, 200(%rsp,%r15)
	movsd	136(%rsp,%r15), %xmm0

	call	JUMPTARGET(cos)

	movsd	%xmm0, 264(%rsp,%r15)
	jmp	.LBL_1_8

.LBL_1_13:
	/* Even lane of the pair: same as .LBL_1_10 at base offsets
	   128/192/256.  */
	movzbl	%r12b, %r15d
	shlq	$4, %r15
	movsd	128(%rsp,%r15), %xmm0

	call	JUMPTARGET(sin)

	movsd	%xmm0, 192(%rsp,%r15)
	movsd	128(%rsp,%r15), %xmm0

	call	JUMPTARGET(cos)

	movsd	%xmm0, 256(%rsp,%r15)
	jmp	.LBL_1_7
END (_ZGVbN2vl8l8_sincos_sse4)
libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4)
316 | |
/* vvv version implemented with wrapper to vl8l8 variant.
   Arguments: xmm0 = two doubles; xmm1 = two sine result pointers;
   xmm2 = two cosine result pointers (32-bit pointers packed in the
   low halves on x32).  Calls the vl8l8 kernel with stack buffers,
   then scatters each lane to its caller-supplied pointer.  */
ENTRY (_ZGVbN2vvv_sincos_sse4)
#ifndef __ILP32__
	/* LP64: 64-bit pointers, one per qword of xmm1/xmm2.  */
	subq	$72, %rsp
	.cfi_def_cfa_offset 80
	movdqu	%xmm1, 32(%rsp)		/* Save sin pointers at 32/40.  */
	lea	(%rsp), %rdi		/* rdi = sin result buffer.  */
	movdqu	%xmm2, 48(%rdi)		/* Save cos pointers at 48/56.  */
	lea	16(%rsp), %rsi		/* rsi = cos result buffer.  */
	call	HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
	/* Reload the saved pointers...  */
	movq	32(%rsp), %rdx		/* sin ptr, lane 0.  */
	movq	48(%rsp), %rsi		/* cos ptr, lane 0.  */
	movq	40(%rsp), %r8		/* sin ptr, lane 1.  */
	movq	56(%rsp), %r10		/* cos ptr, lane 1.  */
	/* ...and the buffered results.  */
	movq	(%rsp), %rax		/* sin result, lane 0.  */
	movq	16(%rsp), %rcx		/* cos result, lane 0.  */
	movq	8(%rsp), %rdi		/* sin result, lane 1.  */
	movq	24(%rsp), %r9		/* cos result, lane 1.  */
	/* Scatter each result through its pointer.  */
	movq	%rax, (%rdx)
	movq	%rcx, (%rsi)
	movq	%rdi, (%r8)
	movq	%r9, (%r10)
	addq	$72, %rsp
	.cfi_def_cfa_offset 8
	ret
#else
	/* x32: 32-bit pointers; extract them from the saved xmm regs
	   with movq/pextrd after the call.  */
	subl	$72, %esp
	.cfi_def_cfa_offset 80
	leal	48(%rsp), %esi		/* esi = cos result buffer.  */
	movaps	%xmm1, 16(%esp)		/* Save sin pointers.  */
	leal	32(%rsp), %edi		/* edi = sin result buffer.  */
	movaps	%xmm2, (%esp)		/* Save cos pointers.  */
	call	HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
	movdqa	16(%esp), %xmm1		/* Reload sin pointers.  */
	movsd	32(%esp), %xmm0		/* sin result, lane 0.  */
	movq	%xmm1, %rax		/* eax = sin ptr, lane 0.  */
	movdqa	(%esp), %xmm2		/* Reload cos pointers.  */
	movsd	%xmm0, (%eax)
	movsd	40(%esp), %xmm0		/* sin result, lane 1.  */
	pextrd	$1, %xmm1, %eax		/* eax = sin ptr, lane 1.  */
	movsd	%xmm0, (%eax)
	movsd	48(%esp), %xmm0		/* cos result, lane 0.  */
	movq	%xmm2, %rax		/* eax = cos ptr, lane 0.  */
	movsd	%xmm0, (%eax)
	movsd	56(%esp), %xmm0		/* cos result, lane 1.  */
	pextrd	$1, %xmm2, %eax		/* eax = cos ptr, lane 1.  */
	movsd	%xmm0, (%eax)
	addl	$72, %esp
	.cfi_def_cfa_offset 8
	ret
#endif
END (_ZGVbN2vvv_sincos_sse4)
369 | |