1/* Function sincosf vectorized with SSE4.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21
22 .section .text.sse4, "ax", @progbits
23ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
24/*
25 ALGORITHM DESCRIPTION:
26
27 1) Range reduction to [-Pi/4; +Pi/4] interval
28 a) Grab sign from source argument and save it.
29 b) Remove sign using AND operation
30 c) Getting octant Y by 2/Pi multiplication
31 d) Add "Right Shifter" value
32 e) Treat obtained value as integer S for destination sign setting.
33 SS = ((S-S&1)&2)<<30; For sin part
34 SC = ((S+S&1)&2)<<30; For cos part
35 f) Change destination sign if source sign is negative
36 using XOR operation.
37 g) Subtract "Right Shifter" (0x4B000000) value
38 h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
39 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
40 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
41 a) Calculate X^2 = X * X
42 b) Calculate 2 polynomials for sin and cos:
43 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
44 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
45 c) Swap RS & RC if first bit of obtained value after
46 Right Shifting is set to 1. Using And, Andnot & Or operations.
47 3) Destination sign setting
48 a) Set shifted destination sign using XOR operation:
49 R1 = XOR( RS, SS );
50 R2 = XOR( RC, SC ). */
51
52 pushq %rbp
53 cfi_adjust_cfa_offset (8)
54 cfi_rel_offset (%rbp, 0)
55 movq %rsp, %rbp
56 cfi_def_cfa_register (%rbp)
57 andq $-64, %rsp
58 subq $320, %rsp
59 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
60 movups %xmm12, 176(%rsp)
61 movups %xmm9, 160(%rsp)
62 movups __sAbsMask(%rax), %xmm12
63
64/* Absolute argument computation */
65 movaps %xmm12, %xmm5
66 andnps %xmm0, %xmm12
67 movups __sInvPI(%rax), %xmm7
68 andps %xmm0, %xmm5
69
70/* c) Getting octant Y by 2/Pi multiplication
71 d) Add "Right Shifter" value. */
72 mulps %xmm5, %xmm7
73 movups %xmm10, 144(%rsp)
74 movups __sPI1(%rax), %xmm10
75
76/* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 3 parts:
77 X = X - Y*PI1 - Y*PI2 - Y*PI3. */
78 movaps %xmm10, %xmm1
79 addps __sRShifter(%rax), %xmm7
80
81/* e) Treat obtained value as integer S for destination sign setting */
82 movaps %xmm7, %xmm9
83
84/* g) Subtract "Right Shifter" (0x4B000000) value */
85 subps __sRShifter(%rax), %xmm7
86 mulps %xmm7, %xmm1
87 pslld $31, %xmm9
88 movups __sPI2(%rax), %xmm6
89 movups %xmm13, 112(%rsp)
90 movaps %xmm5, %xmm13
91 movaps %xmm6, %xmm2
92 subps %xmm1, %xmm13
93 mulps %xmm7, %xmm2
94 movups __sSignMask(%rax), %xmm3
95 movaps %xmm5, %xmm1
96 movups __sOneHalf(%rax), %xmm4
97 subps %xmm2, %xmm13
98 cmpnleps __sRangeReductionVal(%rax), %xmm5
99 movaps %xmm3, %xmm2
100 andps %xmm13, %xmm2
101 xorps %xmm2, %xmm4
102
103/* Result sign calculations */
104 xorps %xmm2, %xmm3
105 xorps %xmm9, %xmm3
106
107/* Add correction term 0.5 for cos() part */
108 addps %xmm7, %xmm4
109 movmskps %xmm5, %ecx
110 mulps %xmm4, %xmm10
111 mulps %xmm4, %xmm6
112 subps %xmm10, %xmm1
113 movups __sPI3(%rax), %xmm10
114 subps %xmm6, %xmm1
115 movaps %xmm10, %xmm6
116 mulps %xmm7, %xmm6
117 mulps %xmm4, %xmm10
118 subps %xmm6, %xmm13
119 subps %xmm10, %xmm1
120 movups __sPI4(%rax), %xmm6
121 mulps %xmm6, %xmm7
122 mulps %xmm6, %xmm4
123 subps %xmm7, %xmm13
124 subps %xmm4, %xmm1
125 xorps %xmm9, %xmm13
126 xorps %xmm3, %xmm1
127 movaps %xmm13, %xmm4
128 movaps %xmm1, %xmm2
129 mulps %xmm13, %xmm4
130 mulps %xmm1, %xmm2
131 movups __sA9(%rax), %xmm7
132
133/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
134 a) Calculate X^2 = X * X
135 b) Calculate 2 polynomials for sin and cos:
136 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
137 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
138 movaps %xmm7, %xmm3
139 mulps %xmm4, %xmm3
140 mulps %xmm2, %xmm7
141 addps __sA7(%rax), %xmm3
142 addps __sA7(%rax), %xmm7
143 mulps %xmm4, %xmm3
144 mulps %xmm2, %xmm7
145 addps __sA5(%rax), %xmm3
146 addps __sA5(%rax), %xmm7
147 mulps %xmm4, %xmm3
148 mulps %xmm2, %xmm7
149 addps __sA3(%rax), %xmm3
150 addps __sA3(%rax), %xmm7
151 mulps %xmm3, %xmm4
152 mulps %xmm7, %xmm2
153 mulps %xmm13, %xmm4
154 mulps %xmm1, %xmm2
155 addps %xmm4, %xmm13
156 addps %xmm2, %xmm1
157 xorps %xmm12, %xmm13
158 testl %ecx, %ecx
159 jne .LBL_1_3
160
161.LBL_1_2:
162 cfi_remember_state
163 movups 160(%rsp), %xmm9
164 movaps %xmm13, (%rdi)
165 movups 144(%rsp), %xmm10
166 movups 176(%rsp), %xmm12
167 movups 112(%rsp), %xmm13
168 movups %xmm1, (%rsi)
169 movq %rbp, %rsp
170 cfi_def_cfa_register (%rsp)
171 popq %rbp
172 cfi_adjust_cfa_offset (-8)
173 cfi_restore (%rbp)
174 ret
175
176.LBL_1_3:
177 cfi_restore_state
178 movups %xmm0, 128(%rsp)
179 movups %xmm13, 192(%rsp)
180 movups %xmm1, 256(%rsp)
181 je .LBL_1_2
182
183 xorb %dl, %dl
184 xorl %eax, %eax
185 movups %xmm8, 48(%rsp)
186 movups %xmm11, 32(%rsp)
187 movups %xmm14, 16(%rsp)
188 movups %xmm15, (%rsp)
189 movq %rsi, 64(%rsp)
190 movq %r12, 104(%rsp)
191 cfi_offset_rel_rsp (12, 104)
192 movb %dl, %r12b
193 movq %r13, 96(%rsp)
194 cfi_offset_rel_rsp (13, 96)
195 movl %eax, %r13d
196 movq %r14, 88(%rsp)
197 cfi_offset_rel_rsp (14, 88)
198 movl %ecx, %r14d
199 movq %r15, 80(%rsp)
200 cfi_offset_rel_rsp (15, 80)
201 movq %rbx, 72(%rsp)
202 movq %rdi, %rbx
203 cfi_remember_state
204
205.LBL_1_6:
206 btl %r13d, %r14d
207 jc .LBL_1_13
208
209.LBL_1_7:
210 lea 1(%r13), %esi
211 btl %esi, %r14d
212 jc .LBL_1_10
213
214.LBL_1_8:
215 incb %r12b
216 addl $2, %r13d
217 cmpb $16, %r12b
218 jb .LBL_1_6
219
220 movups 48(%rsp), %xmm8
221 movq %rbx, %rdi
222 movups 32(%rsp), %xmm11
223 movups 16(%rsp), %xmm14
224 movups (%rsp), %xmm15
225 movq 64(%rsp), %rsi
226 movq 104(%rsp), %r12
227 cfi_restore (%r12)
228 movq 96(%rsp), %r13
229 cfi_restore (%r13)
230 movq 88(%rsp), %r14
231 cfi_restore (%r14)
232 movq 80(%rsp), %r15
233 cfi_restore (%r15)
234 movq 72(%rsp), %rbx
235 movups 192(%rsp), %xmm13
236 movups 256(%rsp), %xmm1
237 jmp .LBL_1_2
238
239.LBL_1_10:
240 cfi_restore_state
241 movzbl %r12b, %r15d
242 movss 132(%rsp,%r15,8), %xmm0
243
244 call JUMPTARGET(sinf)
245
246 movss %xmm0, 196(%rsp,%r15,8)
247 movss 132(%rsp,%r15,8), %xmm0
248
249 call JUMPTARGET(cosf)
250
251 movss %xmm0, 260(%rsp,%r15,8)
252 jmp .LBL_1_8
253
254.LBL_1_13:
255 movzbl %r12b, %r15d
256 movss 128(%rsp,%r15,8), %xmm0
257
258 call JUMPTARGET(sinf)
259
260 movss %xmm0, 192(%rsp,%r15,8)
261 movss 128(%rsp,%r15,8), %xmm0
262
263 call JUMPTARGET(cosf)
264
265 movss %xmm0, 256(%rsp,%r15,8)
266 jmp .LBL_1_7
267
268END (_ZGVbN4vl4l4_sincosf_sse4)
269libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
270
271/* vvv version implemented with wrapper to vl4l4 variant. */
272ENTRY (_ZGVbN4vvv_sincosf_sse4)
273#ifndef __ILP32__
274 subq $104, %rsp
275 .cfi_def_cfa_offset 112
276 movdqu %xmm1, 32(%rsp)
277 lea (%rsp), %rdi
278 movdqu %xmm2, 48(%rdi)
279 lea 16(%rsp), %rsi
280 movdqu %xmm3, 48(%rsi)
281 movdqu %xmm4, 64(%rsi)
282 call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
283 movq 32(%rsp), %rdx
284 movq 40(%rsp), %rsi
285 movq 48(%rsp), %r8
286 movq 56(%rsp), %r10
287 movl (%rsp), %eax
288 movl 4(%rsp), %ecx
289 movl 8(%rsp), %edi
290 movl 12(%rsp), %r9d
291 movl %eax, (%rdx)
292 movl %ecx, (%rsi)
293 movq 64(%rsp), %rax
294 movq 72(%rsp), %rcx
295 movl %edi, (%r8)
296 movl %r9d, (%r10)
297 movq 80(%rsp), %rdi
298 movq 88(%rsp), %r9
299 movl 16(%rsp), %r11d
300 movl 20(%rsp), %edx
301 movl 24(%rsp), %esi
302 movl 28(%rsp), %r8d
303 movl %r11d, (%rax)
304 movl %edx, (%rcx)
305 movl %esi, (%rdi)
306 movl %r8d, (%r9)
307 addq $104, %rsp
308 .cfi_def_cfa_offset 8
309 ret
310#else
311 subl $72, %esp
312 .cfi_def_cfa_offset 80
313 leal 48(%rsp), %esi
314 movaps %xmm1, 16(%esp)
315 leal 32(%rsp), %edi
316 movaps %xmm2, (%esp)
317 call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
318 movl 16(%esp), %eax
319 movss 32(%esp), %xmm0
320 movss %xmm0, (%eax)
321 movl 20(%esp), %eax
322 movss 36(%esp), %xmm0
323 movss %xmm0, (%eax)
324 movl 24(%esp), %eax
325 movss 40(%esp), %xmm0
326 movss %xmm0, (%eax)
327 movl 28(%esp), %eax
328 movss 44(%esp), %xmm0
329 movss %xmm0, (%eax)
330 movl (%esp), %eax
331 movss 48(%esp), %xmm0
332 movss %xmm0, (%eax)
333 movl 4(%esp), %eax
334 movss 52(%esp), %xmm0
335 movss %xmm0, (%eax)
336 movl 8(%esp), %eax
337 movss 56(%esp), %xmm0
338 movss %xmm0, (%eax)
339 movl 12(%esp), %eax
340 movss 60(%esp), %xmm0
341 movss %xmm0, (%eax)
342 addl $72, %esp
343 .cfi_def_cfa_offset 8
344 ret
345#endif
346END (_ZGVbN4vvv_sincosf_sse4)
347

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S