1/* Wrapper implementations of vector math functions.
2 Copyright (C) 2014-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
/* SSE2 ISA version as wrapper to scalar.

   Calls the scalar routine \callee once for each of the four floats
   packed in %xmm0 and returns the four results packed in %xmm0.

   Stack frame (40 bytes):
      0(%rsp) .. 15(%rsp)   copy of the input vector
     16(%rsp) .. 31(%rsp)   one 4-byte slot per scalar result

   The movaps spill requires %rsp to be 16-byte aligned after the
   subq.  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	movaps	%xmm0, (%rsp)		/* Spill all four inputs.  */

	/* Element 0 is already the low float of %xmm0.  */
	call	JUMPTARGET(\callee)
	movss	%xmm0, 16(%rsp)

	/* Element 1.  */
	movss	4(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 20(%rsp)

	/* Element 2.  */
	movss	8(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 24(%rsp)

	/* Element 3; its result (r3) also stays live in %xmm0.  */
	movss	12(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 28(%rsp)

	/* Reload r0..r2 and interleave everything into one vector.  */
	movss	24(%rsp), %xmm1		/* r2 */
	movss	20(%rsp), %xmm2		/* r1 */
	movss	16(%rsp), %xmm3		/* r0 */
	unpcklps	%xmm0, %xmm2	/* xmm2 = { r1, r3, .. }  */
	unpcklps	%xmm1, %xmm3	/* xmm3 = { r0, r2, .. }  */
	unpcklps	%xmm2, %xmm3	/* xmm3 = { r0, r1, r2, r3 }  */
	movaps	%xmm3, %xmm0

	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	ret
.endm
46
/* 2 argument SSE2 ISA version as wrapper to scalar.

   Calls the two-argument scalar routine \callee once per element,
   taking the first argument from %xmm0 and the second from %xmm1, and
   returns the four results packed in %xmm0.

   Stack frame (56 bytes):
      0(%rsp) .. 15(%rsp)   copy of the first input vector
     16(%rsp) .. 31(%rsp)   copy of the second input vector
     32(%rsp) .. 47(%rsp)   one 4-byte slot per scalar result

   The movaps spills require %rsp to be 16-byte aligned after the
   subq.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	cfi_adjust_cfa_offset(56)
	movaps	%xmm1, 16(%rsp)		/* Spill second argument vector.  */
	movaps	%xmm0, (%rsp)		/* Spill first argument vector.  */

	/* Element 0 of both arguments is already in place.  */
	call	JUMPTARGET(\callee)
	movss	%xmm0, 32(%rsp)

	/* Element 1.  */
	movss	20(%rsp), %xmm1
	movss	4(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 36(%rsp)

	/* Element 2.  */
	movss	24(%rsp), %xmm1
	movss	8(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 40(%rsp)

	/* Element 3; its result (r3) also stays live in %xmm0.  */
	movss	28(%rsp), %xmm1
	movss	12(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	movss	%xmm0, 44(%rsp)

	/* Reload r0..r2 and interleave everything into one vector.  */
	movss	40(%rsp), %xmm1		/* r2 */
	movss	36(%rsp), %xmm2		/* r1 */
	movss	32(%rsp), %xmm3		/* r0 */
	unpcklps	%xmm0, %xmm2	/* xmm2 = { r1, r3, .. }  */
	unpcklps	%xmm1, %xmm3	/* xmm3 = { r0, r2, .. }  */
	unpcklps	%xmm2, %xmm3	/* xmm3 = { r0, r1, r2, r3 }  */
	movaps	%xmm3, %xmm0

	addq	$56, %rsp
	cfi_adjust_cfa_offset(-56)
	ret
.endm
78
/* 3 argument SSE2 ISA version as wrapper to scalar.

   In:  %xmm0 = four input floats
	%rdi  = first output array (four floats are written)
	%rsi  = second output array (four floats are written)

   The scalar routine \callee is called once per element with %rdi and
   %rsi pointing at two 4-byte stack slots; after each call the slots
   are copied to the matching element of the two output arrays.

   Register roles across the calls:
	%rbp = first output array, %rbx = second output array

   Stack frame (40 bytes below the two saved registers):
      0(%rsp) .. 15(%rsp)   copy of the input vector
     24(%rsp)               slot passed in %rsi
     28(%rsp)               slot passed in %rdi  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	movq	%rdi, %rbp
	movq	%rsi, %rbx
	movaps	%xmm0, (%rsp)		/* Spill all four inputs.  */

	/* Element 0 is already the low float of %xmm0.  */
	leaq	28(%rsp), %rdi
	leaq	24(%rsp), %rsi
	call	JUMPTARGET(\callee)

	/* Copy out the element 0 results.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, (%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)
	/* Element 1: shufps $85 replicates input element 1.  */
	movaps	(%rsp), %xmm1
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0
	leaq	28(%rsp), %rdi
	leaq	24(%rsp), %rsi
	call	JUMPTARGET(\callee)

	/* Copy out the element 1 results.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, 4(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)
	/* Element 2: unpckhps moves input element 2 into the low float.  */
	movaps	(%rsp), %xmm1
	movaps	%xmm1, %xmm0
	unpckhps	%xmm1, %xmm0
	leaq	28(%rsp), %rdi
	leaq	24(%rsp), %rsi
	call	JUMPTARGET(\callee)

	/* Copy out the element 2 results.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, 8(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)
	/* Element 3: shufps $255 replicates input element 3.  */
	movaps	(%rsp), %xmm1
	shufps	$255, %xmm1, %xmm1
	movaps	%xmm1, %xmm0
	leaq	28(%rsp), %rdi
	leaq	24(%rsp), %rsi
	call	JUMPTARGET(\callee)

	/* Copy out the element 3 results.  */
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)

	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
139
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Splits the 256-bit input in %ymm0 into two 128-bit halves, calls
   \callee on each half, and returns the two 128-bit results joined in
   %ymm0.

   Stack frame (32 bytes, 32-byte aligned, addressed from %rsp; %rbp
   keeps the original stack pointer):
      0(%rsp) .. 15(%rsp)   high half of the input
     16(%rsp) .. 31(%rsp)   result for the low half  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$32, %rsp

	/* Save the high half, then clear the upper YMM state before
	   calling the 128-bit routine on the low half.  */
	vextractf128	$1, %ymm0, (%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 16(%rsp)

	/* High half.  */
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)

	/* ymm0 = { low-half result, high-half result }.  */
	vmovaps	16(%rsp), %xmm1
	vinsertf128	$1, %xmm0, %ymm1, %ymm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
165
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   Splits the 256-bit inputs in %ymm0 and %ymm1 into 128-bit halves,
   calls \callee on the low halves and then on the high halves, and
   returns the two 128-bit results joined in %ymm0.

   Stack frame (64 bytes, 32-byte aligned, addressed from %rsp; %rbp
   keeps the original stack pointer):
      0(%rsp) .. 15(%rsp)   high half of the second argument
     16(%rsp) .. 31(%rsp)   high half of the first argument
     32(%rsp) .. 47(%rsp)   result for the low halves  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$64, %rsp

	/* Save both high halves, then clear the upper YMM state before
	   calling the 128-bit routine on the low halves.  */
	vextractf128	$1, %ymm1, (%rsp)
	vextractf128	$1, %ymm0, 16(%rsp)
	vzeroupper
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)

	/* High halves.  */
	vmovaps	(%rsp), %xmm1
	vmovaps	16(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)

	/* ymm0 = { low-half result, high-half result }.  */
	vmovaps	32(%rsp), %xmm1
	vinsertf128	$1, %xmm0, %ymm1, %ymm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
193
/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   In:  %ymm0 = eight input floats
	%rdi  = first output array (eight floats are written)
	%rsi  = second output array (eight floats are written)

   \callee is called on the low 128 bits with the caller's own
   pointers, then on the high 128 bits with pointers to stack scratch
   space whose contents are afterwards copied to byte offset 16 of each
   output array.

   Register roles across the calls:
	%r13 = first output array, %r14 = second output array

   %r13 and %r14 are pushed before the stack is realigned so that they
   sit at fixed offsets from %rbp.  The CFA is %rbp-based from that
   point on, so the pushes must not adjust the CFA offset, and the save
   slots are described relative to %rbp.

   Stack frame (64 bytes, 32-byte aligned, addressed from %rsp):
      0(%rsp) .. 31(%rsp)   input spill, later the two 16-byte scratch
			    outputs of the second call
     32(%rsp) .. 47(%rsp)   high half of the input  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	pushq	%r13
	cfi_rel_offset (%r13, -8)	/* Saved at -8(%rbp).  */
	pushq	%r14
	cfi_rel_offset (%r14, -16)	/* Saved at -16(%rbp).  */
	andq	$-32, %rsp
	subq	$64, %rsp
	movq	%rdi, %r13
	movq	%rsi, %r14

	/* Spill the input and keep a private copy of its high half, since
	   the low 32 bytes are reused as scratch by the second call.  */
	vmovaps	%ymm0, (%rsp)
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm1, 32(%rsp)
	vzeroupper

	/* Low half: %rdi and %rsi still hold the caller's pointers.  */
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)

	/* High half: results go to the stack scratch slots.  */
	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi
	lea	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)

	/* Copy the scratch results to the upper half of each output
	   array.  Nothing visible here guarantees that the caller's
	   arrays are 16-byte aligned, so use unaligned stores.  */
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	vmovups	%xmm0, 16(%r13)
	vmovups	%xmm1, 16(%r14)

	/* Step back to the saved registers just below %rbp.  */
	leaq	-16(%rbp), %rsp
	popq	%r14
	cfi_restore (%r14)
	popq	%r13
	cfi_restore (%r13)
	/* Here %rsp == %rbp again.  */
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
239
/* AVX512 ISA version as wrapper to AVX2 ISA version.

   Splits the 512-bit input in %zmm0 into two 256-bit halves, calls
   \callee on each half, and returns the two 256-bit results joined in
   %zmm0.

   Stack frame (128 bytes, 64-byte aligned, addressed from %rsp; %rbp
   keeps the original stack pointer):
      0(%rsp) ..  63(%rsp)   copy of the input
     64(%rsp) ..  95(%rsp)   result for the low half
     96(%rsp) .. 127(%rsp)   result for the high half  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$128, %rsp
	vmovups	%zmm0, (%rsp)		/* Spill the whole input.  */

	/* Low half.  */
	vmovups	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 64(%rsp)

	/* High half.  */
	vmovups	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 96(%rsp)

	/* Both results are adjacent in memory; load them as one vector.  */
	vmovups	64(%rsp), %zmm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
264
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   Splits the 512-bit inputs in %zmm0 and %zmm1 into 256-bit halves,
   calls \callee on the low halves and then on the high halves, and
   returns the two 256-bit results joined in %zmm0.

   Stack frame (192 bytes, 64-byte aligned, addressed from %rsp; %rbp
   keeps the original stack pointer):
       0(%rsp) ..  63(%rsp)   copy of the first argument
      64(%rsp) .. 127(%rsp)   copy of the second argument
     128(%rsp) .. 159(%rsp)   result for the low halves
     160(%rsp) .. 191(%rsp)   result for the high halves  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	%zmm1, 64(%rsp)		/* Spill second argument.  */
	vmovups	%zmm0, (%rsp)		/* Spill first argument.  */

	/* Low halves.  */
	vmovups	64(%rsp), %ymm1
	vmovups	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 128(%rsp)

	/* High halves.  */
	vmovups	96(%rsp), %ymm1
	vmovups	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovups	%ymm0, 160(%rsp)

	/* Both results are adjacent in memory; load them as one vector.  */
	vmovups	128(%rsp), %zmm0

	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
292
/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   In:  %zmm0 = sixteen input floats
	%rdi  = first output array (sixteen floats are written)
	%rsi  = second output array (sixteen floats are written)

   \callee is called on the low 256 bits with the caller's own
   pointers, then on the high 256 bits with pointers to stack scratch
   space whose contents are afterwards copied to byte offset 32 of each
   output array.

   Register roles across the calls:
	%r12 = first output array, %r13 = second output array

   %r12 and %r13 are pushed before the stack is realigned so that they
   sit at fixed offsets from %rbp and can be described to the unwinder
   relative to the %rbp-based CFA.

   Stack frame (128 bytes, 64-byte aligned, addressed from %rsp):
      0(%rsp) ..  63(%rsp)   copy of the input
     64(%rsp) ..  95(%rsp)   scratch passed in %rdi to the second call
     96(%rsp) .. 127(%rsp)   scratch passed in %rsi to the second call  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	pushq	%r12
	cfi_rel_offset (%r12, -8)	/* Saved at -8(%rbp).  */
	pushq	%r13
	cfi_rel_offset (%r13, -16)	/* Saved at -16(%rbp).  */
	andq	$-64, %rsp
	subq	$128, %rsp
	movq	%rdi, %r12
	movq	%rsi, %r13
	vmovaps	%zmm0, (%rsp)		/* Spill the whole input.  */

	/* Low half: %rdi and %rsi still hold the caller's pointers.  */
	vmovaps	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)

	/* High half: results go to the stack scratch slots.  */
	vmovaps	32(%rsp), %ymm0
	lea	64(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)

	/* Copy the scratch results to the upper half of each output
	   array.  Nothing visible here guarantees that the caller's
	   arrays are 32-byte aligned, so use unaligned stores.  */
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	vmovups	%ymm0, 32(%r12)
	vmovups	%ymm1, 32(%r13)

	/* Step back to the saved registers just below %rbp.  */
	leaq	-16(%rbp), %rsp
	popq	%r13
	cfi_restore (%r13)
	popq	%r12
	cfi_restore (%r12)
	/* Here %rsp == %rbp again.  */
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
327

/* Source: glibc/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h  */