1/* Function cosf vectorized with AVX2.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19
20#include <sysdep.h>
21#include "svml_s_trig_data.h"
22
/* _ZGVdN8v_cosf_avx2: cosf for eight packed single-precision lanes,
   using AVX2 and FMA instructions.

   In:   %ymm0 = arguments X
   Out:  %ymm0 = cos (X), lane by lane

   Fast path: all lanes are evaluated with the range reduction and
   polynomial described in the algorithm comment below.
   Slow path: every lane for which |X| <= __sRangeReductionVal does
   not hold (the compare is unordered, so NaN lanes are selected as
   well) is recomputed by a call to scalar cosf and patched into the
   vector result.

   Stack frame, offsets from %rsp after it has been aligned down to
   64 bytes and lowered by 448 bytes:
       0 .. 255   %ymm15 .. %ymm8             (slow path only)
     256 .. 271   %rdi, %rsi                  (slow path only)
     272 .. 303   %r15, %r14, %r13, %r12      (slow path only)
     320 .. 351   copy of the input vector    (slow path only)
     384 .. 415   result vector               (slow path only)

   Slow-path register roles:
     %r12b  index of the current lane pair (0..3)
     %r13d  mask of lanes needing scalar cosf (bits 0..7)
     %r14d  bit index of the even lane of the pair (2 * %r12b)
     %r15   zero-extended copy of %r12b used for addressing  */
        .section .text.avx2, "ax", @progbits
ENTRY (_ZGVdN8v_cosf_avx2)
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) We remove sign using AND operation
      b) Add Pi/2 value to argument X for Cos to Sin transformation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift the lowest bit of this value to the sign position
      f) Subtract "Right Shifter" value
      g) Subtract 0.5 from result for octant correction
      h) Subtract Y*PI from X argument, where PI is divided into 3 parts
         in this FMA variant:
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + .....
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax

/* Keep the untouched argument in %ymm2; it is needed for the range
   check, for the reduction and for the scalar fallback.  */
        vmovaps   %ymm0, %ymm2
        vmovups   __sRShifter(%rax), %ymm5
        vmovups   __sPI1_FMA(%rax), %ymm7

/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
        vaddps    __sHalfPI(%rax), %ymm2, %ymm4

/*
   1) Range reduction to [-Pi/2; +Pi/2] interval
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" (0x4B000000) value
   %ymm4 = %ymm4 * InvPI + RShifter
 */
        vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4

/* f) Subtract "Right Shifter" (0x4B000000) value */
        vsubps    %ymm5, %ymm4, %ymm6

/*
   e) Treat obtained value as integer for destination sign setting.
      Shift the lowest bit of this value to the sign position (S << 31)
 */
        vpslld    $31, %ymm4, %ymm0

/* g) Subtract 0.5 from result for octant correction */
        vsubps    __sOneHalf(%rax), %ymm6, %ymm4

/* Check for large and special arguments:
   %ymm1 lane = all ones when !(|X| <= __sRangeReductionVal).  */
        vandps    __sAbsMask(%rax), %ymm2, %ymm3
        vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1

/*
   h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3
   The reduced argument ends up in %ymm4.
 */
        vmovaps   %ymm2, %ymm3
        vfnmadd231ps %ymm4, %ymm7, %ymm3
        vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
        vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4

/* a) Calculate X^2 = X * X */
        vmulps    %ymm4, %ymm4, %ymm5

/*
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
   The sign is applied to the reduced argument before the final FMA,
   so the whole result carries it.
 */
        vxorps    %ymm0, %ymm4, %ymm6
        vmovups   __sA9_FMA(%rax), %ymm0

/*
   b) Calculate polynomial:
      R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
 */
        vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
        vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
        vfmadd213ps __sA3(%rax), %ymm5, %ymm0
        vmulps    %ymm5, %ymm0, %ymm0

/* Collect the per-lane "needs scalar cosf" flags into bits 0..7 of
   %ecx; the remaining bits of %ecx are zero.  */
        vmovmskps %ymm1, %ecx
        vfmadd213ps %ymm6, %ymm6, %ymm0
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: result is in %ymm0.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path: at least one lane is flagged in %ecx.  Spill the input
   and the fast-path result so single lanes can be read and patched.  */
.LBL_1_3:
        cfi_restore_state
        vmovups   %ymm2, 320(%rsp)
        vmovups   %ymm0, 384(%rsp)

/* Save the registers this path restores before returning; the scalar
   cosf calls below may overwrite them.  */
        vmovups   %ymm8, 224(%rsp)
        vmovups   %ymm9, 192(%rsp)
        vmovups   %ymm10, 160(%rsp)
        vmovups   %ymm11, 128(%rsp)
        vmovups   %ymm12, 96(%rsp)
        vmovups   %ymm13, 64(%rsp)
        vmovups   %ymm14, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 264(%rsp)
        movq      %rdi, 256(%rsp)
        movq      %r12, 296(%rsp)
        cfi_offset_rel_rsp (12, 296)
        xorl      %r12d, %r12d          /* lane-pair index = 0 */
        movq      %r13, 288(%rsp)
        cfi_offset_rel_rsp (13, 288)
        movl      %ecx, %r13d           /* lane mask survives the calls */
        movq      %r14, 280(%rsp)
        cfi_offset_rel_rsp (14, 280)
        xorl      %r14d, %r14d          /* bit index of even lane = 0 */
        movq      %r15, 272(%rsp)
        cfi_offset_rel_rsp (15, 272)
        cfi_remember_state

/* One iteration per lane pair: test the even lane, then the odd one.  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
/* The mask has only 8 significant bits (one per lane), so four lane
   pairs cover the whole vector.  */
        cmpb      $4, %r12b
        jb        .LBL_1_6

/* All flagged lanes are patched: restore the saved registers, reload
   the result vector and leave through the common exit.  */
        vmovups   224(%rsp), %ymm8
        vmovups   192(%rsp), %ymm9
        vmovups   160(%rsp), %ymm10
        vmovups   128(%rsp), %ymm11
        vmovups   96(%rsp), %ymm12
        vmovups   64(%rsp), %ymm13
        vmovups   32(%rsp), %ymm14
        vmovups   (%rsp), %ymm15
        vmovups   384(%rsp), %ymm0
        movq      264(%rsp), %rsi
        movq      256(%rsp), %rdi
        movq      296(%rsp), %r12
        cfi_restore (%r12)
        movq      288(%rsp), %r13
        cfi_restore (%r13)
        movq      280(%rsp), %r14
        cfi_restore (%r14)
        movq      272(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_1_2

/* Odd lane of the current pair: input at 324 + 8 * pair, result at
   388 + 8 * pair.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    324(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 388(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Even lane of the current pair: input at 320 + 8 * pair, result at
   384 + 8 * pair.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    320(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 384(%rsp,%r15,8)
        jmp       .LBL_1_7

END (_ZGVdN8v_cosf_avx2)
216

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S */