1/* Function cosf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21#include "svml_s_wrapper_impl.h"
22
23 .section .text.evex512, "ax", @progbits
24ENTRY (_ZGVeN16v_cosf_knl)
25/*
26 ALGORITHM DESCRIPTION:
27
28 1) Range reduction to [-Pi/2; +Pi/2] interval
29 a) We remove sign using AND operation
30 b) Add Pi/2 value to argument X for Cos to Sin transformation
31 c) Getting octant Y by 1/Pi multiplication
32 d) Add "Right Shifter" value
33 e) Treat obtained value as integer for destination sign setting.
34 Shift first bit of this value to the last (sign) position
35 f) Subtract "Right Shifter" value
36 g) Subtract 0.5 from result for octant correction
37 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
38 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
39 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
40 a) Calculate X^2 = X * X
41 b) Calculate polynomial:
42 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
43 3) Destination sign setting
44 a) Set shifted destination sign using XOR operation:
45 R = XOR( R, S );
46 */
47 pushq %rbp
48 cfi_adjust_cfa_offset (8)
49 cfi_rel_offset (%rbp, 0)
50 movq %rsp, %rbp
51 cfi_def_cfa_register (%rbp)
52 andq $-64, %rsp
53 subq $1280, %rsp
54 movq __svml_s_trig_data@GOTPCREL(%rip), %rdx
55
56/*
57 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
58 X = X - Y*PI1 - Y*PI2 - Y*PI3
59 */
60 vmovaps %zmm0, %zmm6
61 movl $-1, %eax
62
63/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
64 vaddps __sHalfPI(%rdx), %zmm0, %zmm2
65 vmovups __sRShifter(%rdx), %zmm3
66
67/*
68 1) Range reduction to [-Pi/2; +Pi/2] interval
69 c) Getting octant Y by 1/Pi multiplication
70 d) Add "Right Shifter" (0x4B000000) value
71 */
72 vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
73 vmovups __sPI1_FMA(%rdx), %zmm5
74
75/* f) Subtract "Right Shifter" (0x4B000000) value */
76 vsubps %zmm3, %zmm2, %zmm4
77 vmovups __sA9_FMA(%rdx), %zmm9
78
79/* Check for large and special arguments */
80 vpandd __sAbsMask(%rdx), %zmm0, %zmm1
81
82/*
83 e) Treat obtained value as integer for destination sign setting.
84 Shift first bit of this value to the last (sign) position (S << 31)
85 */
86 vpslld $31, %zmm2, %zmm8
87 vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1
88 vpbroadcastd %eax, %zmm12{%k1}{z}
89
90/* g) Subtract 0.5 from result for octant correction */
91 vsubps __sOneHalf(%rdx), %zmm4, %zmm7
92 vptestmd %zmm12, %zmm12, %k0
93 vfnmadd231ps %zmm7, %zmm5, %zmm6
94 kmovw %k0, %ecx
95 vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
96 vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
97
98/* a) Calculate X^2 = X * X */
99 vmulps %zmm7, %zmm7, %zmm10
100
101/*
102 3) Destination sign setting
103 a) Set shifted destination sign using XOR operation:
104 R = XOR( R, S );
105 */
106 vpxord %zmm8, %zmm7, %zmm11
107
108/*
109 b) Calculate polynomial:
110 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
111 */
112 vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
113 vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
114 vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
115 vmulps %zmm10, %zmm9, %zmm1
116 vfmadd213ps %zmm11, %zmm11, %zmm1
117 testl %ecx, %ecx
118 jne .LBL_1_3
119
120.LBL_1_2:
121 cfi_remember_state
122 vmovaps %zmm1, %zmm0
123 movq %rbp, %rsp
124 cfi_def_cfa_register (%rsp)
125 popq %rbp
126 cfi_adjust_cfa_offset (-8)
127 cfi_restore (%rbp)
128 ret
129
130.LBL_1_3:
131 cfi_restore_state
132 vmovups %zmm0, 1152(%rsp)
133 vmovups %zmm1, 1216(%rsp)
134 je .LBL_1_2
135
136 xorb %dl, %dl
137 kmovw %k4, 1048(%rsp)
138 xorl %eax, %eax
139 kmovw %k5, 1040(%rsp)
140 kmovw %k6, 1032(%rsp)
141 kmovw %k7, 1024(%rsp)
142 vmovups %zmm16, 960(%rsp)
143 vmovups %zmm17, 896(%rsp)
144 vmovups %zmm18, 832(%rsp)
145 vmovups %zmm19, 768(%rsp)
146 vmovups %zmm20, 704(%rsp)
147 vmovups %zmm21, 640(%rsp)
148 vmovups %zmm22, 576(%rsp)
149 vmovups %zmm23, 512(%rsp)
150 vmovups %zmm24, 448(%rsp)
151 vmovups %zmm25, 384(%rsp)
152 vmovups %zmm26, 320(%rsp)
153 vmovups %zmm27, 256(%rsp)
154 vmovups %zmm28, 192(%rsp)
155 vmovups %zmm29, 128(%rsp)
156 vmovups %zmm30, 64(%rsp)
157 vmovups %zmm31, (%rsp)
158 movq %rsi, 1064(%rsp)
159 movq %rdi, 1056(%rsp)
160 movq %r12, 1096(%rsp)
161 cfi_offset_rel_rsp (12, 1096)
162 movb %dl, %r12b
163 movq %r13, 1088(%rsp)
164 cfi_offset_rel_rsp (13, 1088)
165 movl %ecx, %r13d
166 movq %r14, 1080(%rsp)
167 cfi_offset_rel_rsp (14, 1080)
168 movl %eax, %r14d
169 movq %r15, 1072(%rsp)
170 cfi_offset_rel_rsp (15, 1072)
171 cfi_remember_state
172
173.LBL_1_6:
174 btl %r14d, %r13d
175 jc .LBL_1_12
176
177.LBL_1_7:
178 lea 1(%r14), %esi
179 btl %esi, %r13d
180 jc .LBL_1_10
181
182.LBL_1_8:
183 addb $1, %r12b
184 addl $2, %r14d
185 cmpb $16, %r12b
186 jb .LBL_1_6
187
188 kmovw 1048(%rsp), %k4
189 movq 1064(%rsp), %rsi
190 kmovw 1040(%rsp), %k5
191 movq 1056(%rsp), %rdi
192 kmovw 1032(%rsp), %k6
193 movq 1096(%rsp), %r12
194 cfi_restore (%r12)
195 movq 1088(%rsp), %r13
196 cfi_restore (%r13)
197 kmovw 1024(%rsp), %k7
198 vmovups 960(%rsp), %zmm16
199 vmovups 896(%rsp), %zmm17
200 vmovups 832(%rsp), %zmm18
201 vmovups 768(%rsp), %zmm19
202 vmovups 704(%rsp), %zmm20
203 vmovups 640(%rsp), %zmm21
204 vmovups 576(%rsp), %zmm22
205 vmovups 512(%rsp), %zmm23
206 vmovups 448(%rsp), %zmm24
207 vmovups 384(%rsp), %zmm25
208 vmovups 320(%rsp), %zmm26
209 vmovups 256(%rsp), %zmm27
210 vmovups 192(%rsp), %zmm28
211 vmovups 128(%rsp), %zmm29
212 vmovups 64(%rsp), %zmm30
213 vmovups (%rsp), %zmm31
214 movq 1080(%rsp), %r14
215 cfi_restore (%r14)
216 movq 1072(%rsp), %r15
217 cfi_restore (%r15)
218 vmovups 1216(%rsp), %zmm1
219 jmp .LBL_1_2
220
221.LBL_1_10:
222 cfi_restore_state
223 movzbl %r12b, %r15d
224 vmovss 1156(%rsp,%r15,8), %xmm0
225 call JUMPTARGET(cosf)
226 vmovss %xmm0, 1220(%rsp,%r15,8)
227 jmp .LBL_1_8
228
229.LBL_1_12:
230 movzbl %r12b, %r15d
231 vmovss 1152(%rsp,%r15,8), %xmm0
232 call JUMPTARGET(cosf)
233 vmovss %xmm0, 1216(%rsp,%r15,8)
234 jmp .LBL_1_7
235END (_ZGVeN16v_cosf_knl)
236
237ENTRY (_ZGVeN16v_cosf_skx)
238/*
239 ALGORITHM DESCRIPTION:
240
241 1) Range reduction to [-Pi/2; +Pi/2] interval
242 a) We remove sign using AND operation
243 b) Add Pi/2 value to argument X for Cos to Sin transformation
244 c) Getting octant Y by 1/Pi multiplication
245 d) Add "Right Shifter" value
246 e) Treat obtained value as integer for destination sign setting.
247 Shift first bit of this value to the last (sign) position
248 f) Subtract "Right Shifter" value
249 g) Subtract 0.5 from result for octant correction
250 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
251 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
252 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
253 a) Calculate X^2 = X * X
254 b) Calculate polynomial:
255 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
256 3) Destination sign setting
257 a) Set shifted destination sign using XOR operation:
258 R = XOR( R, S );
259 */
260 pushq %rbp
261 cfi_adjust_cfa_offset (8)
262 cfi_rel_offset (%rbp, 0)
263 movq %rsp, %rbp
264 cfi_def_cfa_register (%rbp)
265 andq $-64, %rsp
266 subq $1280, %rsp
267 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
268
269/*
270 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
271 X = X - Y*PI1 - Y*PI2 - Y*PI3
272 */
273 vmovaps %zmm0, %zmm6
274 vpternlogd $0xff, %zmm12, %zmm12, %zmm12
275 vmovups __sRShifter(%rax), %zmm3
276 vmovups __sPI1_FMA(%rax), %zmm5
277 vmovups __sA9_FMA(%rax), %zmm9
278
279/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
280 vaddps __sHalfPI(%rax), %zmm0, %zmm2
281
282/* Check for large and special arguments */
283 vandps __sAbsMask(%rax), %zmm0, %zmm1
284
285/*
286 1) Range reduction to [-Pi/2; +Pi/2] interval
287 c) Getting octant Y by 1/Pi multiplication
288 d) Add "Right Shifter" (0x4B000000) value
289 */
290 vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
291 vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1
292
293/*
294 e) Treat obtained value as integer for destination sign setting.
295 Shift first bit of this value to the last (sign) position (S << 31)
296 */
297 vpslld $31, %zmm2, %zmm8
298
299/* f) Subtract "Right Shifter" (0x4B000000) value */
300 vsubps %zmm3, %zmm2, %zmm4
301
302/* g) Subtract 0.5 from result for octant correction */
303 vsubps __sOneHalf(%rax), %zmm4, %zmm7
304 vfnmadd231ps %zmm7, %zmm5, %zmm6
305 vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
306 vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
307
308/* a) Calculate X^2 = X * X */
309 vmulps %zmm7, %zmm7, %zmm10
310
311/*
312 3) Destination sign setting
313 a) Set shifted destination sign using XOR operation:
314 R = XOR( R, S );
315 */
316 vxorps %zmm8, %zmm7, %zmm11
317
318/*
319 b) Calculate polynomial:
320 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
321 */
322 vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
323 vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
324 vfmadd213ps __sA3(%rax), %zmm10, %zmm9
325 vpandnd %zmm1, %zmm1, %zmm12{%k1}
326 vmulps %zmm10, %zmm9, %zmm1
327 vptestmd %zmm12, %zmm12, %k0
328 vfmadd213ps %zmm11, %zmm11, %zmm1
329 kmovw %k0, %ecx
330 testl %ecx, %ecx
331 jne .LBL_2_3
332.LBL_2_2:
333 cfi_remember_state
334 vmovaps %zmm1, %zmm0
335 movq %rbp, %rsp
336 cfi_def_cfa_register (%rsp)
337 popq %rbp
338 cfi_adjust_cfa_offset (-8)
339 cfi_restore (%rbp)
340 ret
341
342.LBL_2_3:
343 cfi_restore_state
344 vmovups %zmm0, 1152(%rsp)
345 vmovups %zmm1, 1216(%rsp)
346 je .LBL_2_2
347
348 xorb %dl, %dl
349 xorl %eax, %eax
350 kmovw %k4, 1048(%rsp)
351 kmovw %k5, 1040(%rsp)
352 kmovw %k6, 1032(%rsp)
353 kmovw %k7, 1024(%rsp)
354 vmovups %zmm16, 960(%rsp)
355 vmovups %zmm17, 896(%rsp)
356 vmovups %zmm18, 832(%rsp)
357 vmovups %zmm19, 768(%rsp)
358 vmovups %zmm20, 704(%rsp)
359 vmovups %zmm21, 640(%rsp)
360 vmovups %zmm22, 576(%rsp)
361 vmovups %zmm23, 512(%rsp)
362 vmovups %zmm24, 448(%rsp)
363 vmovups %zmm25, 384(%rsp)
364 vmovups %zmm26, 320(%rsp)
365 vmovups %zmm27, 256(%rsp)
366 vmovups %zmm28, 192(%rsp)
367 vmovups %zmm29, 128(%rsp)
368 vmovups %zmm30, 64(%rsp)
369 vmovups %zmm31, (%rsp)
370 movq %rsi, 1064(%rsp)
371 movq %rdi, 1056(%rsp)
372 movq %r12, 1096(%rsp)
373 cfi_offset_rel_rsp (12, 1096)
374 movb %dl, %r12b
375 movq %r13, 1088(%rsp)
376 cfi_offset_rel_rsp (13, 1088)
377 movl %ecx, %r13d
378 movq %r14, 1080(%rsp)
379 cfi_offset_rel_rsp (14, 1080)
380 movl %eax, %r14d
381 movq %r15, 1072(%rsp)
382 cfi_offset_rel_rsp (15, 1072)
383 cfi_remember_state
384
385.LBL_2_6:
386 btl %r14d, %r13d
387 jc .LBL_2_12
388.LBL_2_7:
389 lea 1(%r14), %esi
390 btl %esi, %r13d
391 jc .LBL_2_10
392.LBL_2_8:
393 incb %r12b
394 addl $2, %r14d
395 cmpb $16, %r12b
396 jb .LBL_2_6
397 kmovw 1048(%rsp), %k4
398 kmovw 1040(%rsp), %k5
399 kmovw 1032(%rsp), %k6
400 kmovw 1024(%rsp), %k7
401 vmovups 960(%rsp), %zmm16
402 vmovups 896(%rsp), %zmm17
403 vmovups 832(%rsp), %zmm18
404 vmovups 768(%rsp), %zmm19
405 vmovups 704(%rsp), %zmm20
406 vmovups 640(%rsp), %zmm21
407 vmovups 576(%rsp), %zmm22
408 vmovups 512(%rsp), %zmm23
409 vmovups 448(%rsp), %zmm24
410 vmovups 384(%rsp), %zmm25
411 vmovups 320(%rsp), %zmm26
412 vmovups 256(%rsp), %zmm27
413 vmovups 192(%rsp), %zmm28
414 vmovups 128(%rsp), %zmm29
415 vmovups 64(%rsp), %zmm30
416 vmovups (%rsp), %zmm31
417 vmovups 1216(%rsp), %zmm1
418 movq 1064(%rsp), %rsi
419 movq 1056(%rsp), %rdi
420 movq 1096(%rsp), %r12
421 cfi_restore (%r12)
422 movq 1088(%rsp), %r13
423 cfi_restore (%r13)
424 movq 1080(%rsp), %r14
425 cfi_restore (%r14)
426 movq 1072(%rsp), %r15
427 cfi_restore (%r15)
428 jmp .LBL_2_2
429
430.LBL_2_10:
431 cfi_restore_state
432 movzbl %r12b, %r15d
433 vmovss 1156(%rsp,%r15,8), %xmm0
434 vzeroupper
435 vmovss 1156(%rsp,%r15,8), %xmm0
436 call JUMPTARGET(cosf)
437 vmovss %xmm0, 1220(%rsp,%r15,8)
438 jmp .LBL_2_8
439.LBL_2_12:
440 movzbl %r12b, %r15d
441 vmovss 1152(%rsp,%r15,8), %xmm0
442 vzeroupper
443 vmovss 1152(%rsp,%r15,8), %xmm0
444 call JUMPTARGET(cosf)
445 vmovss %xmm0, 1216(%rsp,%r15,8)
446 jmp .LBL_2_7
447END (_ZGVeN16v_cosf_skx)
448

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S