/* Function sinf vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21#include "svml_s_wrapper_impl.h"
22
23 .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_sinf_knl)
/*
   sinf for 16 packed single-precision values (AVX-512, KNL variant).

   In:   %zmm0 - 16 float arguments.
   Out:  %zmm0 - sinf of each argument.

   Fast path: every lane is evaluated inline with the algorithm below.
   Slow path: lanes whose |x| is greater than __sRangeReductionVal, or
   unordered with it, are recomputed one at a time through scalar sinf
   and patched into the result vector.

   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" value
      h) Subtract Y*PI from X argument, where PI divided to 3 parts
         (the FMA variants of the constants are used here):
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );

   Stack frame (offsets from the 64-byte aligned %rsp, 1280 bytes):
         0 ..  960   %zmm31 .. %zmm16, 64 bytes each (%zmm31 lowest)
      1024 .. 1048   %k7, %k6, %k5, %k4 in 8-byte slots
      1056, 1064     %rdi, %rsi
      1072 .. 1096   %r15, %r14, %r13, %r12
      1152           copy of the input vector
      1216           result vector
   Nothing in the frame is written unless the slow path is taken.

   Slow-path register roles:
      %r12b  pair counter (two lanes are examined per iteration)
      %r13d  16-bit mask of lanes that need scalar sinf
      %r14d  bit index of the even lane of the current pair
      %r15   zero-extended pair counter, used as the stack index
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax

/* Check for large and special values: %edx = all-ones pattern that is
   broadcast below into the lanes flagged for the slow path.  */
        movl      $-1, %edx
        vmovups   __sAbsMask(%rax), %zmm4
        vmovups   __sInvPI(%rax), %zmm1

/* b) Remove sign using AND operation: %zmm12 = |x|.  */
        vpandd    %zmm4, %zmm0, %zmm12
        vmovups   __sPI1_FMA(%rax), %zmm2
        vmovups   __sA9(%rax), %zmm7

/*
   a) Grab sign from source argument and save it:
   %zmm11 = x AND NOT __sAbsMask.  It is XORed into the result in
   step 3.
 */
        vpandnd   %zmm0, %zmm4, %zmm11

/*
   h) Subtract Y*PI from X argument, where PI divided to 3 parts:
   X = X - Y*PI1 - Y*PI2 - Y*PI3;
   %zmm3 starts as |x| and accumulates the first two subtractions.
 */
        vmovaps   %zmm12, %zmm3

/*
   c) Getting octant Y by 1/Pi multiplication
   d) Add "Right Shifter" value
   %zmm1 = |x| * InvPI + RShifter.
 */
        vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1

/* Predicate 22 flags lanes where |x| is not less-or-equal to the range
   reduction limit, including unordered (NaN) lanes.  Flagged lanes of
   %zmm13 become all-ones, the others are zeroed.  */
        vcmpps    $22, __sRangeReductionVal(%rax), %zmm12, %k1
        vpbroadcastd %edx, %zmm13{%k1}{z}

/* g) Subtract "Right Shifter" value: %zmm5 = Y.  */
        vsubps    __sRShifter(%rax), %zmm1, %zmm5

/*
   e) Treat obtained value as integer for destination sign setting.
   Shift first bit of this value to the last (sign) position
 */
        vpslld    $31, %zmm1, %zmm6

/* %ecx = 16-bit mask of lanes that need the scalar fallback.  */
        vptestmd  %zmm13, %zmm13, %k0
        vfnmadd231ps %zmm5, %zmm2, %zmm3
        kmovw     %k0, %ecx
        vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3
        vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5

/*
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
   a) Calculate X^2 = X * X
   b) Calculate polynomial:
   R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   %zmm5 = reduced X, %zmm8 = X^2, %zmm9 = X with the sign from step e
   applied, %zmm7 = running Horner value starting from A9.
 */
        vmulps    %zmm5, %zmm5, %zmm8
        vpxord    %zmm6, %zmm5, %zmm9
        vfmadd213ps __sA7(%rax), %zmm8, %zmm7
        vfmadd213ps __sA5(%rax), %zmm8, %zmm7
        vfmadd213ps __sA3(%rax), %zmm8, %zmm7
        vmulps    %zmm8, %zmm7, %zmm10
        vfmadd213ps %zmm9, %zmm9, %zmm10

/*
   3) Destination sign setting
   a) Set shifted destination sign using XOR operation:
   R = XOR( R, S );
 */
        vpxord    %zmm11, %zmm10, %zmm1

/* Any flagged lane sends us to the scalar fallback.  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
/* Common exit: result is in %zmm1.  */
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
/* Slow path.  Spill the input and the vector result so individual
   lanes can be read and patched as plain floats.  This label is only
   reached through the jne above, so no further zero check is needed
   here.  */
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm1, 1216(%rsp)

/* Save the registers listed in the frame layout before calling scalar
   sinf.  NOTE(review): %zmm16-%zmm31, %k4-%k7, %rsi and %rdi are
   presumably preserved for the benefit of the vector caller; confirm
   against the vector function ABI.  */
        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b            /* pair counter = 0 */
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d           /* lane mask */
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d           /* bit index = 0 */
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

.LBL_1_6:
/* Even lane of the pair: bit %r14d of the mask.  */
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
/* Odd lane of the pair: bit %r14d + 1 of the mask.  */
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
/* Next pair.  16 lanes at two lanes per iteration is 8 iterations;
   the mask was produced by kmovw, so it has no bits above bit 15 and
   further iterations could never find a lane to process.  */
        addb      $1, %r12b
        addl      $2, %r14d
        cmpb      $8, %r12b
        jb        .LBL_1_6

/* All flagged lanes are patched: restore the saved registers, reload
   the patched result and leave through the common exit.  */
        kmovw     1048(%rsp), %k4
        movq      1064(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1056(%rsp), %rdi
        kmovw     1032(%rsp), %k6
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups   1216(%rsp), %zmm1
        jmp       .LBL_1_2

.LBL_1_10:
/* Scalar sinf for the odd lane of pair %r12b: input float at
   1156 + 8 * pair, result stored at 1220 + 8 * pair.  */
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    1156(%rsp,%r15,8), %xmm0
        call      JUMPTARGET(sinf)
        vmovss    %xmm0, 1220(%rsp,%r15,8)
        jmp       .LBL_1_8

.LBL_1_12:
/* Scalar sinf for the even lane of pair %r12b: input float at
   1152 + 8 * pair, result stored at 1216 + 8 * pair.  */
        movzbl    %r12b, %r15d
        vmovss    1152(%rsp,%r15,8), %xmm0
        call      JUMPTARGET(sinf)
        vmovss    %xmm0, 1216(%rsp,%r15,8)
        jmp       .LBL_1_7
END(_ZGVeN16v_sinf_knl)
240
ENTRY (_ZGVeN16v_sinf_skx)
/*
   sinf for 16 packed single-precision values (AVX-512, SKX variant).

   In:   %zmm0 - 16 float arguments.
   Out:  %zmm0 - sinf of each argument.

   Fast path: every lane is evaluated inline with the algorithm below.
   Slow path: lanes whose |x| is NOT less-or-equal (ordered) to
   __sRangeReductionVal are recomputed one at a time through scalar
   sinf and patched into the result vector.

   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" value
      h) Subtract Y*PI from X argument, where PI divided to 3 parts
         (the FMA variants of the constants are used here):
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );

   Stack frame (offsets from the 64-byte aligned %rsp, 1280 bytes):
         0 ..  960   %zmm31 .. %zmm16, 64 bytes each (%zmm31 lowest)
      1024 .. 1048   %k7, %k6, %k5, %k4 in 8-byte slots
      1056, 1064     %rdi, %rsi
      1072 .. 1096   %r15, %r14, %r13, %r12
      1152           copy of the input vector
      1216           result vector
   Nothing in the frame is written unless the slow path is taken.

   Slow-path register roles:
      %r12b  pair counter (two lanes are examined per iteration)
      %r13d  16-bit mask of lanes that need scalar sinf
      %r14d  bit index of the even lane of the current pair
      %r15   zero-extended pair counter, used as the stack index
 */

        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax

/* Check for large and special values: %zmm14 starts as all-ones; the
   in-range lanes are cleared further down.  */
        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
        vmovups   __sAbsMask(%rax), %zmm5
        vmovups   __sInvPI(%rax), %zmm1
        vmovups   __sRShifter(%rax), %zmm2
        vmovups   __sPI1_FMA(%rax), %zmm3
        vmovups   __sA9(%rax), %zmm8

/* b) Remove sign using AND operation: %zmm13 = |x|.  */
        vandps    %zmm5, %zmm0, %zmm13

/*
   a) Grab sign from source argument and save it:
   %zmm12 = x AND NOT __sAbsMask.  It is XORed into the result in
   step 3.
 */
        vandnps   %zmm0, %zmm5, %zmm12

/*
   c) Getting octant Y by 1/Pi multiplication
   d) Add "Right Shifter" value
   %zmm1 = |x| * InvPI + RShifter.
 */
        vfmadd213ps %zmm2, %zmm13, %zmm1

/* Predicate 18 sets %k1 for lanes where |x| is less-or-equal (ordered)
   to the range reduction limit, i.e. the lanes the fast path may
   keep.  */
        vcmpps    $18, __sRangeReductionVal(%rax), %zmm13, %k1

/*
   e) Treat obtained value as integer for destination sign setting.
   Shift first bit of this value to the last (sign) position
 */
        vpslld    $31, %zmm1, %zmm7

/* g) Subtract "Right Shifter" value: %zmm6 = Y.  */
        vsubps    %zmm2, %zmm1, %zmm6

/*
   h) Subtract Y*PI from X argument, where PI divided to 3 parts:
   X = X - Y*PI1 - Y*PI2 - Y*PI3;
   %zmm4 starts as |x| and accumulates the first two subtractions.
 */
        vmovaps   %zmm13, %zmm4
        vfnmadd231ps %zmm6, %zmm3, %zmm4
        vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4
        vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6

/*
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
   a) Calculate X^2 = X * X
   b) Calculate polynomial:
   R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   %zmm6 = reduced X, %zmm9 = X^2, %zmm10 = X with the sign from step e
   applied, %zmm8 = running Horner value starting from A9.
 */
        vmulps    %zmm6, %zmm6, %zmm9
        vxorps    %zmm7, %zmm6, %zmm10
        vfmadd213ps __sA7(%rax), %zmm9, %zmm8
        vfmadd213ps __sA5(%rax), %zmm9, %zmm8
        vfmadd213ps __sA3(%rax), %zmm9, %zmm8
        vmulps    %zmm9, %zmm8, %zmm11
        vfmadd213ps %zmm10, %zmm10, %zmm11

/*
   3) Destination sign setting
   a) Set shifted destination sign using XOR operation:
   R = XOR( R, S );
 */
        vxorps    %zmm12, %zmm11, %zmm1

/* Clear the in-range lanes of %zmm14 (x AND NOT x is zero, merged
   under %k1); the lanes left all-ones need the scalar fallback.
   %ecx = 16-bit mask of those lanes.  */
        vpandnd   %zmm13, %zmm13, %zmm14{%k1}
        vptestmd  %zmm14, %zmm14, %k0
        kmovw     %k0, %ecx
        testl     %ecx, %ecx
        jne       .LBL_2_3

.LBL_2_2:
/* Common exit: result is in %zmm1.  */
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
/* Slow path.  Spill the input and the vector result so individual
   lanes can be read and patched as plain floats.  This label is only
   reached through the jne above, so no further zero check is needed
   here.  */
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm1, 1216(%rsp)

/* Save the registers listed in the frame layout before calling scalar
   sinf.  NOTE(review): %zmm16-%zmm31, %k4-%k7, %rsi and %rdi are
   presumably preserved for the benefit of the vector caller; confirm
   against the vector function ABI.  */
        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 1048(%rsp)
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b            /* pair counter = 0 */
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d           /* lane mask */
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d           /* bit index = 0 */
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

.LBL_2_6:
/* Even lane of the pair: bit %r14d of the mask.  */
        btl       %r14d, %r13d
        jc        .LBL_2_12

.LBL_2_7:
/* Odd lane of the pair: bit %r14d + 1 of the mask.  */
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_2_10

.LBL_2_8:
/* Next pair.  16 lanes at two lanes per iteration is 8 iterations;
   the mask was produced by kmovw, so it has no bits above bit 15 and
   further iterations could never find a lane to process.  */
        incb      %r12b
        addl      $2, %r14d
        cmpb      $8, %r12b
        jb        .LBL_2_6

/* All flagged lanes are patched: restore the saved registers, reload
   the patched result and leave through the common exit.  */
        kmovw     1048(%rsp), %k4
        kmovw     1040(%rsp), %k5
        kmovw     1032(%rsp), %k6
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm1
        movq      1064(%rsp), %rsi
        movq      1056(%rsp), %rdi
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_2_2

.LBL_2_10:
/* Scalar sinf for the odd lane of pair %r12b: input float at
   1156 + 8 * pair, result stored at 1220 + 8 * pair.  The argument is
   loaded once, after vzeroupper, right before the call.  */
        cfi_restore_state
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1220(%rsp,%r15,8)
        jmp       .LBL_2_8

.LBL_2_12:
/* Scalar sinf for the even lane of pair %r12b: input float at
   1152 + 8 * pair, result stored at 1216 + 8 * pair.  The argument is
   loaded once, after vzeroupper, right before the call.  */
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1216(%rsp,%r15,8)
        jmp       .LBL_2_7
END (_ZGVeN16v_sinf_skx)
467

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S */