1 | /* Function sincosf vectorized with AVX-512. KNL and SKX versions. |
2 | Copyright (C) 2014-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include "svml_s_trig_data.h" |
21 | #include "svml_s_wrapper_impl.h" |
22 | |
23 | /* |
24 | ALGORITHM DESCRIPTION: |
25 | |
26 | 1) Range reduction to [-Pi/4; +Pi/4] interval |
27 | a) Grab sign from source argument and save it. |
28 | b) Remove sign using AND operation |
29 | c) Getting octant Y by 2/Pi multiplication |
30 | d) Add "Right Shifter" value |
31 | e) Treat obtained value as integer S for destination sign setting. |
32 | SS = ((S-S&1)&2)<<30; For sin part |
33 | SC = ((S+S&1)&2)<<30; For cos part |
34 | f) Change destination sign if source sign is negative |
35 | using XOR operation. |
36 | g) Subtract "Right Shifter" (0x4B000000) value |
37 | h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts: |
38 | X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; |
39 | 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval) |
40 | a) Calculate X^2 = X * X |
41 | b) Calculate 2 polynomials for sin and cos: |
42 | RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3)))); |
43 | RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))); |
44 | c) Swap RS & RC if first bit of obtained value after |
45 | Right Shifting is set to 1. Using And, Andnot & Or operations. |
46 | 3) Destination sign setting |
47 | a) Set shifted destination sign using XOR operation: |
48 | R1 = XOR( RS, SS ); |
49 | R2 = XOR( RC, SC ). */ |
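
/* For reference, a minimal scalar C model of the algorithm described
   above.  This is only an illustrative sketch: the coefficients are
   textbook stand-ins for the real __svml_s_trig_data entries, the
   split of Pi/2 is one possible Cody-Waite split, and the code below
   differs in detail (it evaluates the sine polynomial at two shifted
   reduced arguments instead of swapping two polynomials, and it falls
   back to scalar sinf/cosf for |x| above __sRangeReductionVal, which
   the sketch omits).

     #include <math.h>

     static void
     sincosf_model (float x, float *sinp, float *cosp)
     {
       // 1) Range reduction: octant n = round (x * 2/Pi), then
       // r = x - n*Pi/2, with Pi/2 split so each FMA step loses
       // almost no precision.
       float n = (float) lrintf (x * 0x1.45f306p-1f);   // 2/Pi
       float r = fmaf (-n, 0x1.921fb6p+0f, x);          // Pi/2 high
       r = fmaf (-n, -0x1.777a5cp-25f, r);              // Pi/2 mid
       r = fmaf (-n, -0x1.ee59dap-50f, r);              // Pi/2 low
       int s = (int) n;

       // 2) Polynomials on [-Pi/4, +Pi/4]; Taylor terms stand in
       // for the table's minimax coefficients.
       float r2 = r * r;
       float rs = r * (1.0f + r2 * (-1.0f / 6 + r2 * (1.0f / 120)));
       float rc = 1.0f + r2 * (-1.0f / 2 + r2 * (1.0f / 24));

       // 2c) Odd octants swap the two polynomials; 3) the SS/SC
       // quadrant words from the description flip the signs.
       float vs = (s & 1) ? rc : rs;
       float vc = (s & 1) ? rs : rc;
       if ((s - (s & 1)) & 2)
         vs = -vs;
       if ((s + (s & 1)) & 2)
         vc = -vc;
       *sinp = vs;
       *cosp = vc;
     }  */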
50 | |
	.section .text.evex512, "ax", @progbits
52 | ENTRY (_ZGVeN16vl4l4_sincosf_knl) |
53 | pushq %rbp |
54 | cfi_adjust_cfa_offset (8) |
55 | cfi_rel_offset (%rbp, 0) |
56 | movq %rsp, %rbp |
57 | cfi_def_cfa_register (%rbp) |
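	/* Align the stack to 64 bytes and reserve room for the register
	   spill area plus three 64-byte vector slots used on the
	   special-case path: the input at 1152(%rsp), sin results at
	   1216(%rsp) and cos results at 1280(%rsp).  */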
58 | andq $-64, %rsp |
59 | subq $1344, %rsp |
60 | movq __svml_s_trig_data@GOTPCREL(%rip), %rax |
61 | vmovaps %zmm0, %zmm2 |
62 | movl $-1, %edx |
63 | vmovups __sAbsMask(%rax), %zmm0 |
64 | vmovups __sInvPI(%rax), %zmm3 |
65 | |
	/* Absolute argument computation: zmm1 = |x|; zmm0 ends up holding
	   only the sign bit of x for the final sign fixup of the sin
	   result.  */
67 | vpandd %zmm0, %zmm2, %zmm1 |
68 | vmovups __sPI1_FMA(%rax), %zmm5 |
69 | vmovups __sSignMask(%rax), %zmm9 |
70 | vpandnd %zmm2, %zmm0, %zmm0 |
71 | |
	/* h) Subtract Y*(PI/2) from the X argument, where PI/2 is split
	   into 3 parts: X = X - Y*PI1 - Y*PI2 - Y*PI3.  */
74 | vmovaps %zmm1, %zmm6 |
75 | vmovaps %zmm1, %zmm8 |
76 | |
	/* c) Get the octant Y by multiplying by 2/Pi.
	   d) Add the "Right Shifter" (2^23): the sum rounds so that the
	      integer part of Y lands in the low mantissa bits (e.g.
	      Y = 6.37 becomes 2^23 + 6).  */
79 | vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3 |
80 | vmovups __sPI3_FMA(%rax), %zmm7 |
81 | |
82 | /* g) Subtract "Right Shifter" (0x4B000000) value */ |
83 | vsubps __sRShifter(%rax), %zmm3, %zmm12 |
84 | |
	/* e) Treat the obtained value as an integer S for destination sign
	   setting: the low octant bit, shifted into the float sign
	   position, becomes the sin-part sign word.  */
86 | vpslld $31, %zmm3, %zmm13 |
87 | vmovups __sA7_FMA(%rax), %zmm14 |
88 | vfnmadd231ps %zmm12, %zmm5, %zmm6 |
89 | |
	/* 2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4]
	   interval).  Both results are produced by the same sine
	   polynomial, evaluated at the two reduced arguments (the cos
	   argument carries the extra 0.5 correction):
	   R = X + X * X^2 * (A3 + X^2 * (A5 + X^2 * (A7 + X^2 * A9))).  */
95 | vmovaps %zmm14, %zmm15 |
96 | vmovups __sA9_FMA(%rax), %zmm3 |
	/* Flag lanes that fail the |x| <= __sRangeReductionVal check
	   (predicate 22 is NLE_UQ, so NaNs are flagged as well) with -1
	   in zmm1; %ecx receives the resulting fallback mask below.  */
	vcmpps    $22, __sRangeReductionVal(%rax), %zmm1, %k1
	vpbroadcastd %edx, %zmm1{%k1}{z}
99 | vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6 |
100 | vptestmd %zmm1, %zmm1, %k0 |
101 | vpandd %zmm6, %zmm9, %zmm11 |
102 | kmovw %k0, %ecx |
103 | vpxord __sOneHalf(%rax), %zmm11, %zmm4 |
104 | |
	/* Result sign calculations: imm8 150 (0x96) turns vpternlogd into
	   a three-way XOR, combining the octant sign word, the sign mask
	   and the sign of the reduced sin argument into the cos-part
	   sign word.  */
	vpternlogd $150, %zmm13, %zmm9, %zmm11
107 | |
	/* Add correction term 0.5 for the cos() part: reducing with
	   Y +- 0.5 shifts the reduced argument by Pi/2, so the same sine
	   polynomial evaluates (up to sign) to the cosine.  */
109 | vaddps %zmm4, %zmm12, %zmm10 |
110 | vfnmadd213ps %zmm6, %zmm7, %zmm12 |
111 | vfnmadd231ps %zmm10, %zmm5, %zmm8 |
112 | vpxord %zmm13, %zmm12, %zmm13 |
113 | vmulps %zmm13, %zmm13, %zmm12 |
114 | vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8 |
115 | vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15 |
116 | vfnmadd213ps %zmm8, %zmm7, %zmm10 |
117 | vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15 |
118 | vpxord %zmm11, %zmm10, %zmm5 |
119 | vmulps %zmm5, %zmm5, %zmm4 |
120 | vfmadd213ps __sA3(%rax), %zmm12, %zmm15 |
121 | vfmadd213ps %zmm14, %zmm4, %zmm3 |
122 | vmulps %zmm12, %zmm15, %zmm14 |
123 | vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3 |
124 | vfmadd213ps %zmm13, %zmm13, %zmm14 |
125 | vfmadd213ps __sA3(%rax), %zmm4, %zmm3 |
126 | vpxord %zmm0, %zmm14, %zmm0 |
127 | vmulps %zmm4, %zmm3, %zmm3 |
128 | vfmadd213ps %zmm5, %zmm5, %zmm3 |
129 | testl %ecx, %ecx |
130 | jne .LBL_1_3 |
131 | |
132 | .LBL_1_2: |
133 | cfi_remember_state |
134 | vmovups %zmm0, (%rdi) |
135 | vmovups %zmm3, (%rsi) |
136 | movq %rbp, %rsp |
137 | cfi_def_cfa_register (%rsp) |
138 | popq %rbp |
139 | cfi_adjust_cfa_offset (-8) |
140 | cfi_restore (%rbp) |
141 | ret |
142 | |
143 | .LBL_1_3: |
144 | cfi_restore_state |
145 | vmovups %zmm2, 1152(%rsp) |
146 | vmovups %zmm0, 1216(%rsp) |
147 | vmovups %zmm3, 1280(%rsp) |
148 | je .LBL_1_2 |
149 | |
150 | xorb %dl, %dl |
151 | kmovw %k4, 1048(%rsp) |
152 | xorl %eax, %eax |
153 | kmovw %k5, 1040(%rsp) |
154 | kmovw %k6, 1032(%rsp) |
155 | kmovw %k7, 1024(%rsp) |
156 | vmovups %zmm16, 960(%rsp) |
157 | vmovups %zmm17, 896(%rsp) |
158 | vmovups %zmm18, 832(%rsp) |
159 | vmovups %zmm19, 768(%rsp) |
160 | vmovups %zmm20, 704(%rsp) |
161 | vmovups %zmm21, 640(%rsp) |
162 | vmovups %zmm22, 576(%rsp) |
163 | vmovups %zmm23, 512(%rsp) |
164 | vmovups %zmm24, 448(%rsp) |
165 | vmovups %zmm25, 384(%rsp) |
166 | vmovups %zmm26, 320(%rsp) |
167 | vmovups %zmm27, 256(%rsp) |
168 | vmovups %zmm28, 192(%rsp) |
169 | vmovups %zmm29, 128(%rsp) |
170 | vmovups %zmm30, 64(%rsp) |
171 | vmovups %zmm31, (%rsp) |
172 | movq %rsi, 1056(%rsp) |
173 | movq %r12, 1096(%rsp) |
174 | cfi_offset_rel_rsp (12, 1096) |
175 | movb %dl, %r12b |
176 | movq %r13, 1088(%rsp) |
177 | cfi_offset_rel_rsp (13, 1088) |
178 | movl %eax, %r13d |
179 | movq %r14, 1080(%rsp) |
180 | cfi_offset_rel_rsp (14, 1080) |
181 | movl %ecx, %r14d |
182 | movq %r15, 1072(%rsp) |
183 | cfi_offset_rel_rsp (15, 1072) |
184 | movq %rbx, 1064(%rsp) |
185 | movq %rdi, %rbx |
186 | cfi_remember_state |
187 | |
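	/* Scalar fallback: %r14d holds the mask of special-case lanes.
	   Each pass tests lanes %r13d and %r13d+1; for a set bit the
	   lane is recomputed with scalar sinf and cosf and patched into
	   the result buffers at 1216(%rsp) and 1280(%rsp).  */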
188 | .LBL_1_6: |
189 | btl %r13d, %r14d |
190 | jc .LBL_1_13 |
191 | |
192 | .LBL_1_7: |
193 | lea 1(%r13), %esi |
194 | btl %esi, %r14d |
195 | jc .LBL_1_10 |
196 | |
197 | .LBL_1_8: |
198 | addb $1, %r12b |
199 | addl $2, %r13d |
200 | cmpb $16, %r12b |
201 | jb .LBL_1_6 |
202 | |
203 | movq %rbx, %rdi |
204 | kmovw 1048(%rsp), %k4 |
205 | movq 1056(%rsp), %rsi |
206 | kmovw 1040(%rsp), %k5 |
207 | movq 1096(%rsp), %r12 |
208 | cfi_restore (%r12) |
209 | kmovw 1032(%rsp), %k6 |
210 | movq 1088(%rsp), %r13 |
211 | cfi_restore (%r13) |
212 | kmovw 1024(%rsp), %k7 |
213 | vmovups 960(%rsp), %zmm16 |
214 | vmovups 896(%rsp), %zmm17 |
215 | vmovups 832(%rsp), %zmm18 |
216 | vmovups 768(%rsp), %zmm19 |
217 | vmovups 704(%rsp), %zmm20 |
218 | vmovups 640(%rsp), %zmm21 |
219 | vmovups 576(%rsp), %zmm22 |
220 | vmovups 512(%rsp), %zmm23 |
221 | vmovups 448(%rsp), %zmm24 |
222 | vmovups 384(%rsp), %zmm25 |
223 | vmovups 320(%rsp), %zmm26 |
224 | vmovups 256(%rsp), %zmm27 |
225 | vmovups 192(%rsp), %zmm28 |
226 | vmovups 128(%rsp), %zmm29 |
227 | vmovups 64(%rsp), %zmm30 |
228 | vmovups (%rsp), %zmm31 |
229 | movq 1080(%rsp), %r14 |
230 | cfi_restore (%r14) |
231 | movq 1072(%rsp), %r15 |
232 | cfi_restore (%r15) |
233 | movq 1064(%rsp), %rbx |
234 | vmovups 1216(%rsp), %zmm0 |
235 | vmovups 1280(%rsp), %zmm3 |
236 | jmp .LBL_1_2 |
237 | |
238 | .LBL_1_10: |
239 | cfi_restore_state |
240 | movzbl %r12b, %r15d |
241 | vmovss 1156(%rsp,%r15,8), %xmm0 |
242 | |
243 | call JUMPTARGET(sinf) |
244 | |
245 | vmovss %xmm0, 1220(%rsp,%r15,8) |
246 | vmovss 1156(%rsp,%r15,8), %xmm0 |
247 | |
248 | call JUMPTARGET(cosf) |
249 | |
250 | vmovss %xmm0, 1284(%rsp,%r15,8) |
251 | jmp .LBL_1_8 |
252 | |
253 | .LBL_1_13: |
254 | movzbl %r12b, %r15d |
255 | vmovss 1152(%rsp,%r15,8), %xmm0 |
256 | |
257 | call JUMPTARGET(sinf) |
258 | |
259 | vmovss %xmm0, 1216(%rsp,%r15,8) |
260 | vmovss 1152(%rsp,%r15,8), %xmm0 |
261 | |
262 | call JUMPTARGET(cosf) |
263 | |
264 | vmovss %xmm0, 1280(%rsp,%r15,8) |
265 | jmp .LBL_1_7 |
266 | END (_ZGVeN16vl4l4_sincosf_knl) |
267 | libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl) |
268 | |
269 | ENTRY (_ZGVeN16vl4l4_sincosf_skx) |
270 | pushq %rbp |
271 | cfi_adjust_cfa_offset (8) |
272 | cfi_rel_offset (%rbp, 0) |
273 | movq %rsp, %rbp |
274 | cfi_def_cfa_register (%rbp) |
275 | andq $-64, %rsp |
276 | subq $1344, %rsp |
277 | movq __svml_s_trig_data@GOTPCREL(%rip), %rax |
278 | vmovaps %zmm0, %zmm4 |
279 | vmovups __sAbsMask(%rax), %zmm3 |
280 | vmovups __sInvPI(%rax), %zmm5 |
281 | vmovups __sRShifter(%rax), %zmm6 |
282 | vmovups __sPI1_FMA(%rax), %zmm9 |
283 | vmovups __sPI2_FMA(%rax), %zmm10 |
284 | vmovups __sSignMask(%rax), %zmm14 |
285 | vmovups __sOneHalf(%rax), %zmm7 |
286 | vmovups __sPI3_FMA(%rax), %zmm12 |
287 | |
288 | /* Absolute argument computation */ |
289 | vandps %zmm3, %zmm4, %zmm2 |
290 | |
	/* c) Get the octant Y by multiplying by 2/Pi.
	   d) Add the "Right Shifter" value.  */
293 | vfmadd213ps %zmm6, %zmm2, %zmm5 |
	/* k1 marks the in-range lanes, |x| <= __sRangeReductionVal
	   (predicate 18 is LE_OQ, so NaN lanes are left out of k1 and
	   take the scalar fallback).  */
	vcmpps    $18, __sRangeReductionVal(%rax), %zmm2, %k1
295 | |
	/* e) Treat the obtained value as an integer S for destination sign
	   setting.  */
297 | vpslld $31, %zmm5, %zmm0 |
298 | |
	/* g) Subtract the "Right Shifter" (0x4B000000) value.  */
300 | vsubps %zmm6, %zmm5, %zmm5 |
301 | vmovups __sA3(%rax), %zmm6 |
302 | |
	/* h) Subtract Y*(PI/2) from the X argument, where PI/2 is split
	   into 3 parts: X = X - Y*PI1 - Y*PI2 - Y*PI3.  */
305 | vmovaps %zmm2, %zmm11 |
306 | vfnmadd231ps %zmm5, %zmm9, %zmm11 |
307 | vfnmadd231ps %zmm5, %zmm10, %zmm11 |
308 | vandps %zmm11, %zmm14, %zmm1 |
309 | vxorps %zmm1, %zmm7, %zmm8 |
310 | |
	/* Result sign calculations: imm8 150 (0x96) makes vpternlogd a
	   three-way XOR of its operands.  */
	vpternlogd $150, %zmm0, %zmm14, %zmm1

	/* imm8 0xff writes all-ones into zmm14, the special-case marker
	   used below.  */
	vpternlogd $0xff, %zmm14, %zmm14, %zmm14
314 | |
315 | /* Add correction term 0.5 for cos() part */ |
316 | vaddps %zmm8, %zmm5, %zmm15 |
317 | vfnmadd213ps %zmm11, %zmm12, %zmm5 |
318 | vandnps %zmm4, %zmm3, %zmm11 |
319 | vmovups __sA7_FMA(%rax), %zmm3 |
320 | vmovaps %zmm2, %zmm13 |
321 | vfnmadd231ps %zmm15, %zmm9, %zmm13 |
322 | vxorps %zmm0, %zmm5, %zmm9 |
323 | vmovups __sA5_FMA(%rax), %zmm0 |
324 | vfnmadd231ps %zmm15, %zmm10, %zmm13 |
325 | vmulps %zmm9, %zmm9, %zmm8 |
326 | vfnmadd213ps %zmm13, %zmm12, %zmm15 |
327 | vmovups __sA9_FMA(%rax), %zmm12 |
328 | vxorps %zmm1, %zmm15, %zmm1 |
329 | vmulps %zmm1, %zmm1, %zmm13 |
330 | |
	/* 2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4]
	   interval).  As in the KNL variant, both results use the same
	   sine polynomial, evaluated at the two reduced arguments:
	   R = X + X * X^2 * (A3 + X^2 * (A5 + X^2 * (A7 + X^2 * A9))).  */
336 | vmovaps %zmm12, %zmm7 |
337 | vfmadd213ps %zmm3, %zmm8, %zmm7 |
338 | vfmadd213ps %zmm3, %zmm13, %zmm12 |
339 | vfmadd213ps %zmm0, %zmm8, %zmm7 |
340 | vfmadd213ps %zmm0, %zmm13, %zmm12 |
341 | vfmadd213ps %zmm6, %zmm8, %zmm7 |
342 | vfmadd213ps %zmm6, %zmm13, %zmm12 |
343 | vmulps %zmm8, %zmm7, %zmm10 |
344 | vmulps %zmm13, %zmm12, %zmm3 |
345 | vfmadd213ps %zmm9, %zmm9, %zmm10 |
346 | vfmadd213ps %zmm1, %zmm1, %zmm3 |
347 | vxorps %zmm11, %zmm10, %zmm0 |
	/* Clear the in-range lanes (k1) of the all-ones zmm14: andn of a
	   value with itself is 0, so -1 survives only in the lanes that
	   need the scalar fallback.  */
	vpandnd   %zmm2, %zmm2, %zmm14{%k1}
349 | vptestmd %zmm14, %zmm14, %k0 |
350 | kmovw %k0, %ecx |
351 | testl %ecx, %ecx |
352 | jne .LBL_2_3 |
353 | |
354 | .LBL_2_2: |
355 | cfi_remember_state |
356 | vmovups %zmm0, (%rdi) |
357 | vmovups %zmm3, (%rsi) |
358 | movq %rbp, %rsp |
359 | cfi_def_cfa_register (%rsp) |
360 | popq %rbp |
361 | cfi_adjust_cfa_offset (-8) |
362 | cfi_restore (%rbp) |
363 | ret |
364 | |
365 | .LBL_2_3: |
366 | cfi_restore_state |
367 | vmovups %zmm4, 1152(%rsp) |
368 | vmovups %zmm0, 1216(%rsp) |
369 | vmovups %zmm3, 1280(%rsp) |
370 | je .LBL_2_2 |
371 | |
372 | xorb %dl, %dl |
373 | xorl %eax, %eax |
374 | kmovw %k4, 1048(%rsp) |
375 | kmovw %k5, 1040(%rsp) |
376 | kmovw %k6, 1032(%rsp) |
377 | kmovw %k7, 1024(%rsp) |
378 | vmovups %zmm16, 960(%rsp) |
379 | vmovups %zmm17, 896(%rsp) |
380 | vmovups %zmm18, 832(%rsp) |
381 | vmovups %zmm19, 768(%rsp) |
382 | vmovups %zmm20, 704(%rsp) |
383 | vmovups %zmm21, 640(%rsp) |
384 | vmovups %zmm22, 576(%rsp) |
385 | vmovups %zmm23, 512(%rsp) |
386 | vmovups %zmm24, 448(%rsp) |
387 | vmovups %zmm25, 384(%rsp) |
388 | vmovups %zmm26, 320(%rsp) |
389 | vmovups %zmm27, 256(%rsp) |
390 | vmovups %zmm28, 192(%rsp) |
391 | vmovups %zmm29, 128(%rsp) |
392 | vmovups %zmm30, 64(%rsp) |
393 | vmovups %zmm31, (%rsp) |
394 | movq %rsi, 1056(%rsp) |
395 | movq %r12, 1096(%rsp) |
396 | cfi_offset_rel_rsp (12, 1096) |
397 | movb %dl, %r12b |
398 | movq %r13, 1088(%rsp) |
399 | cfi_offset_rel_rsp (13, 1088) |
400 | movl %eax, %r13d |
401 | movq %r14, 1080(%rsp) |
402 | cfi_offset_rel_rsp (14, 1080) |
403 | movl %ecx, %r14d |
404 | movq %r15, 1072(%rsp) |
405 | cfi_offset_rel_rsp (15, 1072) |
406 | movq %rbx, 1064(%rsp) |
407 | movq %rdi, %rbx |
408 | cfi_remember_state |
409 | |
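	/* Scalar fallback loop, same structure as .LBL_1_6 in the KNL
	   variant above.  */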
410 | .LBL_2_6: |
411 | btl %r13d, %r14d |
412 | jc .LBL_2_13 |
413 | |
414 | .LBL_2_7: |
415 | lea 1(%r13), %esi |
416 | btl %esi, %r14d |
417 | jc .LBL_2_10 |
418 | |
419 | .LBL_2_8: |
420 | incb %r12b |
421 | addl $2, %r13d |
422 | cmpb $16, %r12b |
423 | jb .LBL_2_6 |
424 | |
425 | kmovw 1048(%rsp), %k4 |
426 | movq %rbx, %rdi |
427 | kmovw 1040(%rsp), %k5 |
428 | kmovw 1032(%rsp), %k6 |
429 | kmovw 1024(%rsp), %k7 |
430 | vmovups 960(%rsp), %zmm16 |
431 | vmovups 896(%rsp), %zmm17 |
432 | vmovups 832(%rsp), %zmm18 |
433 | vmovups 768(%rsp), %zmm19 |
434 | vmovups 704(%rsp), %zmm20 |
435 | vmovups 640(%rsp), %zmm21 |
436 | vmovups 576(%rsp), %zmm22 |
437 | vmovups 512(%rsp), %zmm23 |
438 | vmovups 448(%rsp), %zmm24 |
439 | vmovups 384(%rsp), %zmm25 |
440 | vmovups 320(%rsp), %zmm26 |
441 | vmovups 256(%rsp), %zmm27 |
442 | vmovups 192(%rsp), %zmm28 |
443 | vmovups 128(%rsp), %zmm29 |
444 | vmovups 64(%rsp), %zmm30 |
445 | vmovups (%rsp), %zmm31 |
446 | vmovups 1216(%rsp), %zmm0 |
447 | vmovups 1280(%rsp), %zmm3 |
448 | movq 1056(%rsp), %rsi |
449 | movq 1096(%rsp), %r12 |
450 | cfi_restore (%r12) |
451 | movq 1088(%rsp), %r13 |
452 | cfi_restore (%r13) |
453 | movq 1080(%rsp), %r14 |
454 | cfi_restore (%r14) |
455 | movq 1072(%rsp), %r15 |
456 | cfi_restore (%r15) |
457 | movq 1064(%rsp), %rbx |
458 | jmp .LBL_2_2 |
459 | |
460 | .LBL_2_10: |
461 | cfi_restore_state |
462 | movzbl %r12b, %r15d |
	vzeroupper
	vmovss    1156(%rsp,%r15,8), %xmm0
466 | |
467 | call JUMPTARGET(sinf) |
468 | |
469 | vmovss %xmm0, 1220(%rsp,%r15,8) |
470 | vmovss 1156(%rsp,%r15,8), %xmm0 |
471 | |
472 | call JUMPTARGET(cosf) |
473 | |
474 | vmovss %xmm0, 1284(%rsp,%r15,8) |
475 | jmp .LBL_2_8 |
476 | |
477 | .LBL_2_13: |
478 | movzbl %r12b, %r15d |
	vzeroupper
	vmovss    1152(%rsp,%r15,8), %xmm0
482 | |
483 | call JUMPTARGET(sinf) |
484 | |
485 | vmovss %xmm0, 1216(%rsp,%r15,8) |
486 | vmovss 1152(%rsp,%r15,8), %xmm0 |
487 | |
488 | call JUMPTARGET(cosf) |
489 | |
490 | vmovss %xmm0, 1280(%rsp,%r15,8) |
491 | jmp .LBL_2_7 |
492 | END (_ZGVeN16vl4l4_sincosf_skx) |
493 | libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx) |
494 | |
495 | /* Wrapper between vvv and vl4l4 vector variants. */ |
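/* In C terms the marshalling performed here looks roughly like the
   sketch below.  Illustrative only: the 16 inputs actually arrive in
   %zmm0 and the two vectors of 16 destination pointers in %zmm1-%zmm4,
   and the function and parameter names are made up.

     static void
     wrapper_model (const float x[16], float *sinptr[16],
                    float *cosptr[16],
                    void (*vl4l4) (const float *, float *, float *))
     {
       float sinbuf[16], cosbuf[16];
       // vl4l4 variant: results land in two dense 64-byte buffers.
       vl4l4 (x, sinbuf, cosbuf);
       // vvv variant: scatter each lane to its own destination.
       for (int i = 0; i < 16; i++)
         {
           *sinptr[i] = sinbuf[i];
           *cosptr[i] = cosbuf[i];
         }
     }  */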
496 | .macro WRAPPER_AVX512_vvv_vl4l4 callee |
497 | #ifndef __ILP32__ |
498 | pushq %rbp |
499 | cfi_adjust_cfa_offset (8) |
500 | cfi_rel_offset (%rbp, 0) |
501 | movq %rsp, %rbp |
502 | cfi_def_cfa_register (%rbp) |
503 | andq $-64, %rsp |
504 | subq $384, %rsp |
505 | vmovups %zmm1, 128(%rsp) |
506 | lea (%rsp), %rdi |
507 | vmovups %zmm2, 192(%rdi) |
508 | vmovups %zmm3, 256(%rdi) |
509 | vmovups %zmm4, 320(%rdi) |
510 | lea 64(%rsp), %rsi |
511 | call HIDDEN_JUMPTARGET(\callee) |
512 | movq 128(%rsp), %rdx |
513 | movq 136(%rsp), %rsi |
514 | movq 144(%rsp), %r8 |
515 | movq 152(%rsp), %r10 |
516 | movl (%rsp), %eax |
517 | movl 4(%rsp), %ecx |
518 | movl 8(%rsp), %edi |
519 | movl 12(%rsp), %r9d |
520 | movl %eax, (%rdx) |
521 | movl %ecx, (%rsi) |
522 | movq 160(%rsp), %rax |
523 | movq 168(%rsp), %rcx |
524 | movl %edi, (%r8) |
525 | movl %r9d, (%r10) |
526 | movq 176(%rsp), %rdi |
527 | movq 184(%rsp), %r9 |
528 | movl 16(%rsp), %r11d |
529 | movl 20(%rsp), %edx |
530 | movl 24(%rsp), %esi |
531 | movl 28(%rsp), %r8d |
532 | movl %r11d, (%rax) |
533 | movl %edx, (%rcx) |
534 | movq 192(%rsp), %r11 |
535 | movq 200(%rsp), %rdx |
536 | movl %esi, (%rdi) |
537 | movl %r8d, (%r9) |
538 | movq 208(%rsp), %rsi |
539 | movq 216(%rsp), %r8 |
540 | movl 32(%rsp), %r10d |
541 | movl 36(%rsp), %eax |
542 | movl 40(%rsp), %ecx |
543 | movl 44(%rsp), %edi |
544 | movl %r10d, (%r11) |
545 | movl %eax, (%rdx) |
546 | movq 224(%rsp), %r10 |
547 | movq 232(%rsp), %rax |
548 | movl %ecx, (%rsi) |
549 | movl %edi, (%r8) |
550 | movq 240(%rsp), %rcx |
551 | movq 248(%rsp), %rdi |
552 | movl 48(%rsp), %r9d |
553 | movl 52(%rsp), %r11d |
554 | movl 56(%rsp), %edx |
555 | movl 60(%rsp), %esi |
556 | movl %r9d, (%r10) |
557 | movl %r11d, (%rax) |
558 | movq 256(%rsp), %r9 |
559 | movq 264(%rsp), %r11 |
560 | movl %edx, (%rcx) |
561 | movl %esi, (%rdi) |
562 | movq 272(%rsp), %rdx |
563 | movq 280(%rsp), %rsi |
564 | movl 64(%rsp), %r8d |
565 | movl 68(%rsp), %r10d |
566 | movl 72(%rsp), %eax |
567 | movl 76(%rsp), %ecx |
568 | movl %r8d, (%r9) |
569 | movl %r10d, (%r11) |
570 | movq 288(%rsp), %r8 |
571 | movq 296(%rsp), %r10 |
572 | movl %eax, (%rdx) |
573 | movl %ecx, (%rsi) |
574 | movq 304(%rsp), %rax |
575 | movq 312(%rsp), %rcx |
576 | movl 80(%rsp), %edi |
577 | movl 84(%rsp), %r9d |
578 | movl 88(%rsp), %r11d |
579 | movl 92(%rsp), %edx |
580 | movl %edi, (%r8) |
581 | movl %r9d, (%r10) |
582 | movq 320(%rsp), %rdi |
583 | movq 328(%rsp), %r9 |
584 | movl %r11d, (%rax) |
585 | movl %edx, (%rcx) |
586 | movq 336(%rsp), %r11 |
587 | movq 344(%rsp), %rdx |
588 | movl 96(%rsp), %esi |
589 | movl 100(%rsp), %r8d |
590 | movl 104(%rsp), %r10d |
591 | movl 108(%rsp), %eax |
592 | movl %esi, (%rdi) |
593 | movl %r8d, (%r9) |
594 | movq 352(%rsp), %rsi |
595 | movq 360(%rsp), %r8 |
596 | movl %r10d, (%r11) |
597 | movl %eax, (%rdx) |
598 | movq 368(%rsp), %r10 |
599 | movq 376(%rsp), %rax |
600 | movl 112(%rsp), %ecx |
601 | movl 116(%rsp), %edi |
602 | movl 120(%rsp), %r9d |
603 | movl 124(%rsp), %r11d |
604 | movl %ecx, (%rsi) |
605 | movl %edi, (%r8) |
606 | movl %r9d, (%r10) |
607 | movl %r11d, (%rax) |
608 | movq %rbp, %rsp |
609 | cfi_def_cfa_register (%rsp) |
610 | popq %rbp |
611 | cfi_adjust_cfa_offset (-8) |
612 | cfi_restore (%rbp) |
613 | ret |
614 | #else |
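	/* x32 (ILP32): pointers are 4 bytes wide, so the 16 sin and 16
	   cos destination pointers fit in %zmm1 and %zmm2 and are
	   reloaded below with 32-bit moves.  */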
615 | leal 8(%rsp), %r10d |
616 | .cfi_def_cfa 10, 0 |
617 | andl $-64, %esp |
618 | pushq -8(%r10d) |
619 | pushq %rbp |
620 | .cfi_escape 0x10,0x6,0x2,0x76,0 |
621 | movl %esp, %ebp |
622 | pushq %r10 |
623 | .cfi_escape 0xf,0x3,0x76,0x78,0x6 |
624 | leal -112(%rbp), %esi |
625 | leal -176(%rbp), %edi |
626 | subl $296, %esp |
627 | vmovdqa64 %zmm1, -240(%ebp) |
628 | vmovdqa64 %zmm2, -304(%ebp) |
629 | call HIDDEN_JUMPTARGET(\callee) |
630 | movl -240(%ebp), %eax |
631 | vmovss -176(%ebp), %xmm0 |
632 | vmovss %xmm0, (%eax) |
633 | movl -236(%ebp), %eax |
634 | vmovss -172(%ebp), %xmm0 |
635 | vmovss %xmm0, (%eax) |
636 | movl -232(%ebp), %eax |
637 | vmovss -168(%ebp), %xmm0 |
638 | vmovss %xmm0, (%eax) |
639 | movl -228(%ebp), %eax |
640 | vmovss -164(%ebp), %xmm0 |
641 | vmovss %xmm0, (%eax) |
642 | movl -224(%ebp), %eax |
643 | vmovss -160(%ebp), %xmm0 |
644 | vmovss %xmm0, (%eax) |
645 | movl -220(%ebp), %eax |
646 | vmovss -156(%ebp), %xmm0 |
647 | vmovss %xmm0, (%eax) |
648 | movl -216(%ebp), %eax |
649 | vmovss -152(%ebp), %xmm0 |
650 | vmovss %xmm0, (%eax) |
651 | movl -212(%ebp), %eax |
652 | vmovss -148(%ebp), %xmm0 |
653 | vmovss %xmm0, (%eax) |
654 | movl -208(%ebp), %eax |
655 | vmovss -144(%ebp), %xmm0 |
656 | vmovss %xmm0, (%eax) |
657 | movl -204(%ebp), %eax |
658 | vmovss -140(%ebp), %xmm0 |
659 | vmovss %xmm0, (%eax) |
660 | movl -200(%ebp), %eax |
661 | vmovss -136(%ebp), %xmm0 |
662 | vmovss %xmm0, (%eax) |
663 | movl -196(%ebp), %eax |
664 | vmovss -132(%ebp), %xmm0 |
665 | vmovss %xmm0, (%eax) |
666 | movl -192(%ebp), %eax |
667 | vmovss -128(%ebp), %xmm0 |
668 | vmovss %xmm0, (%eax) |
669 | movl -188(%ebp), %eax |
670 | vmovss -124(%ebp), %xmm0 |
671 | vmovss %xmm0, (%eax) |
672 | movl -184(%ebp), %eax |
673 | vmovss -120(%ebp), %xmm0 |
674 | vmovss %xmm0, (%eax) |
675 | movl -180(%ebp), %eax |
676 | vmovss -116(%ebp), %xmm0 |
677 | vmovss %xmm0, (%eax) |
678 | movl -304(%ebp), %eax |
679 | vmovss -112(%ebp), %xmm0 |
680 | vmovss %xmm0, (%eax) |
681 | movl -300(%ebp), %eax |
682 | vmovss -108(%ebp), %xmm0 |
683 | vmovss %xmm0, (%eax) |
684 | movl -296(%ebp), %eax |
685 | vmovss -104(%ebp), %xmm0 |
686 | vmovss %xmm0, (%eax) |
687 | movl -292(%ebp), %eax |
688 | vmovss -100(%ebp), %xmm0 |
689 | vmovss %xmm0, (%eax) |
690 | movl -288(%ebp), %eax |
691 | vmovss -96(%ebp), %xmm0 |
692 | vmovss %xmm0, (%eax) |
693 | movl -284(%ebp), %eax |
694 | vmovss -92(%ebp), %xmm0 |
695 | vmovss %xmm0, (%eax) |
696 | movl -280(%ebp), %eax |
697 | vmovss -88(%ebp), %xmm0 |
698 | vmovss %xmm0, (%eax) |
699 | movl -276(%ebp), %eax |
700 | vmovss -84(%ebp), %xmm0 |
701 | vmovss %xmm0, (%eax) |
702 | movl -272(%ebp), %eax |
703 | vmovss -80(%ebp), %xmm0 |
704 | vmovss %xmm0, (%eax) |
705 | movl -268(%ebp), %eax |
706 | vmovss -76(%ebp), %xmm0 |
707 | vmovss %xmm0, (%eax) |
708 | movl -264(%ebp), %eax |
709 | vmovss -72(%ebp), %xmm0 |
710 | vmovss %xmm0, (%eax) |
711 | movl -260(%ebp), %eax |
712 | vmovss -68(%ebp), %xmm0 |
713 | vmovss %xmm0, (%eax) |
714 | movl -256(%ebp), %eax |
715 | vmovss -64(%ebp), %xmm0 |
716 | vmovss %xmm0, (%eax) |
717 | movl -252(%ebp), %eax |
718 | vmovss -60(%ebp), %xmm0 |
719 | vmovss %xmm0, (%eax) |
720 | movl -248(%ebp), %eax |
721 | vmovss -56(%ebp), %xmm0 |
722 | vmovss %xmm0, (%eax) |
723 | movl -244(%ebp), %eax |
724 | vmovss -52(%ebp), %xmm0 |
725 | vmovss %xmm0, (%eax) |
726 | addl $296, %esp |
727 | popq %r10 |
728 | .cfi_def_cfa 10, 0 |
729 | popq %rbp |
730 | leal -8(%r10), %esp |
731 | .cfi_def_cfa 7, 8 |
732 | ret |
733 | #endif |
734 | .endm |
735 | |
736 | ENTRY (_ZGVeN16vvv_sincosf_knl) |
737 | WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl |
738 | END (_ZGVeN16vvv_sincosf_knl) |
739 | |
740 | ENTRY (_ZGVeN16vvv_sincosf_skx) |
741 | WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx |
742 | END (_ZGVeN16vvv_sincosf_skx) |
743 | |