/* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_wrapper_impl.h"

	.text
ENTRY (_ZGVeN16vl4l4_sincosf)
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vl4l4_sincosf)

/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector
   function declared with #pragma omp declare simd notinbranch).  */
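/* Roughly, the vvv variant corresponds to a scalar prototype like

     #pragma omp declare simd notinbranch
     void sincosf (float x, float *sinp, float *cosp);

   vectorized with a length of 16: the 16 inputs arrive in %zmm0 and the
   two pointer arguments arrive as vectors of 16 pointers each (on LP64,
   %zmm1-%zmm4).  The wrapper below computes into a scratch buffer via
   two calls to the AVX2 version and then stores each result through its
   lane's pointer.  The prototype above is illustrative only; parameter
   names are not part of the ABI.  */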
.macro WRAPPER_IMPL_AVX512_fFF_vvv callee
#ifndef __ILP32__
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$448, %rsp
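	/* Frame layout (448 bytes, 64-byte aligned):
	     0(%rsp)   .. 127(%rsp): scratch for the sin/cos results of
	                             both 8-lane halves,
	     128(%rsp) .. 383(%rsp): the four pointer vectors %zmm1-%zmm4,
	     384(%rsp) .. 447(%rsp): a copy of the 16 input floats.  */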
	vmovups	%zmm0, 384(%rsp)
	lea	(%rsp), %rdi
	vmovups	%zmm1, 128(%rdi)
	vmovups	%zmm2, 192(%rdi)
	vmovups	%zmm3, 256(%rdi)
	vmovups	%zmm4, 320(%rdi)
	lea	64(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
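	/* The first call handled lanes 0..7 (still in %ymm0): sines to
	   0(%rsp), cosines to 64(%rsp).  Reload lanes 8..15 from the
	   saved input and call the AVX2 version again.  */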
	vmovdqu	416(%rsp), %ymm0
	lea	32(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
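	/* Store the 32 results (16 sines at 0(%rsp), 16 cosines at
	   64(%rsp)) through the 32 pointers saved from %zmm1-%zmm4.  */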
	movq	128(%rsp), %rdx
	movq	136(%rsp), %rsi
	movq	144(%rsp), %r8
	movq	152(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	160(%rsp), %rax
	movq	168(%rsp), %rcx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	176(%rsp), %rdi
	movq	184(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	192(%rsp), %r11
	movq	200(%rsp), %rdx
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	208(%rsp), %rsi
	movq	216(%rsp), %r8
	movl	32(%rsp), %r10d
	movl	36(%rsp), %eax
	movl	40(%rsp), %ecx
	movl	44(%rsp), %edi
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	224(%rsp), %r10
	movq	232(%rsp), %rax
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movq	240(%rsp), %rcx
	movq	248(%rsp), %rdi
	movl	48(%rsp), %r9d
	movl	52(%rsp), %r11d
	movl	56(%rsp), %edx
	movl	60(%rsp), %esi
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movq	256(%rsp), %r9
	movq	264(%rsp), %r11
	movl	%edx, (%rcx)
	movl	%esi, (%rdi)
	movq	272(%rsp), %rdx
	movq	280(%rsp), %rsi
	movl	64(%rsp), %r8d
	movl	68(%rsp), %r10d
	movl	72(%rsp), %eax
	movl	76(%rsp), %ecx
	movl	%r8d, (%r9)
	movl	%r10d, (%r11)
	movq	288(%rsp), %r8
	movq	296(%rsp), %r10
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	304(%rsp), %rax
	movq	312(%rsp), %rcx
	movl	80(%rsp), %edi
	movl	84(%rsp), %r9d
	movl	88(%rsp), %r11d
	movl	92(%rsp), %edx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	320(%rsp), %rdi
	movq	328(%rsp), %r9
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	336(%rsp), %r11
	movq	344(%rsp), %rdx
	movl	96(%rsp), %esi
	movl	100(%rsp), %r8d
	movl	104(%rsp), %r10d
	movl	108(%rsp), %eax
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	352(%rsp), %rsi
	movq	360(%rsp), %r8
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	368(%rsp), %r10
	movq	376(%rsp), %rax
	movl	112(%rsp), %ecx
	movl	116(%rsp), %edi
	movl	120(%rsp), %r9d
	movl	124(%rsp), %r11d
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
#else
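	/* x32 (ILP32): pointers are 32-bit, so the 16 sine pointers fit
	   in %zmm1 and the 16 cosine pointers in %zmm2.  */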
	leal	8(%rsp), %r10d
	.cfi_def_cfa 10, 0
	andl	$-64, %esp
	pushq	-8(%r10d)
	pushq	%rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	movl	%esp, %ebp
	pushq	%r12
	leal	-112(%rbp), %esi
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x70,0x6
	.cfi_escape 0x10,0xc,0x2,0x76,0x78
	leal	-176(%rbp), %edi
	movq	%rsi, %r12
	pushq	%rbx
	.cfi_escape 0x10,0x3,0x2,0x76,0x68
	movq	%rdi, %rbx
	subl	$344, %esp
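	/* %rbx -> -176(%ebp): sine results; %r12 -> -112(%ebp): cosine
	   results; the pointer vectors and the input are spilled below.  */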
	vmovdqa64 %zmm1, -240(%ebp)
	vmovdqa64 %zmm2, -304(%ebp)
	vmovaps	%zmm0, -368(%ebp)
	call	HIDDEN_JUMPTARGET(\callee)
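	/* Lanes 8..15: reload the upper half of the input and advance
	   both output pointers by 32 bytes for the second AVX2 call.  */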
	leal	32(%r12), %esi
	vmovups	-336(%ebp), %ymm0
	leal	32(%rbx), %edi
	call	HIDDEN_JUMPTARGET(\callee)
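	/* Store each of the 32 results through its 32-bit pointer from
	   %zmm1 (sines) and %zmm2 (cosines).  */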
	movl	-240(%ebp), %eax
	vmovss	-176(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-236(%ebp), %eax
	vmovss	-172(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-232(%ebp), %eax
	vmovss	-168(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-228(%ebp), %eax
	vmovss	-164(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-224(%ebp), %eax
	vmovss	-160(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-220(%ebp), %eax
	vmovss	-156(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-216(%ebp), %eax
	vmovss	-152(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-212(%ebp), %eax
	vmovss	-148(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-208(%ebp), %eax
	vmovss	-144(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-204(%ebp), %eax
	vmovss	-140(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-200(%ebp), %eax
	vmovss	-136(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-196(%ebp), %eax
	vmovss	-132(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-192(%ebp), %eax
	vmovss	-128(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-188(%ebp), %eax
	vmovss	-124(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-184(%ebp), %eax
	vmovss	-120(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-180(%ebp), %eax
	vmovss	-116(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-304(%ebp), %eax
	vmovss	-112(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-300(%ebp), %eax
	vmovss	-108(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-296(%ebp), %eax
	vmovss	-104(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-292(%ebp), %eax
	vmovss	-100(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-288(%ebp), %eax
	vmovss	-96(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-284(%ebp), %eax
	vmovss	-92(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-280(%ebp), %eax
	vmovss	-88(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-276(%ebp), %eax
	vmovss	-84(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-272(%ebp), %eax
	vmovss	-80(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-268(%ebp), %eax
	vmovss	-76(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-264(%ebp), %eax
	vmovss	-72(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-260(%ebp), %eax
	vmovss	-68(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-256(%ebp), %eax
	vmovss	-64(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-252(%ebp), %eax
	vmovss	-60(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-248(%ebp), %eax
	vmovss	-56(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-244(%ebp), %eax
	vmovss	-52(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	addl	$344, %esp
	popq	%rbx
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%r12
	popq	%rbp
	leal	-8(%r10), %esp
	.cfi_def_cfa 7, 8
	ret
#endif
.endm

ENTRY (_ZGVeN16vvv_sincosf)
WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vvv_sincosf)
