1/* Function cosf vectorized with SSE4.
2 Copyright (C) 2014-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21
22 .text
23ENTRY (_ZGVbN4v_cosf_sse4)
24/*
25 ALGORITHM DESCRIPTION:
26
27 1) Range reduction to [-Pi/2; +Pi/2] interval
28 a) We remove sign using AND operation
29 b) Add Pi/2 value to argument X for Cos to Sin transformation
30 c) Getting octant Y by 1/Pi multiplication
31 d) Add "Right Shifter" value
32 e) Treat obtained value as integer for destination sign setting.
33 Shift first bit of this value to the last (sign) position
34 f) Subtract "Right Shifter" value
35 g) Subtract 0.5 from result for octant correction
36 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
37 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
38 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
39 a) Calculate X^2 = X * X
40 b) Calculate polynomial:
41 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
42 3) Destination sign setting
43 a) Set shifted destination sign using XOR operation:
44 R = XOR( R, S );
45 */
46 pushq %rbp
47 cfi_adjust_cfa_offset (8)
48 cfi_rel_offset (%rbp, 0)
49 movq %rsp, %rbp
50 cfi_def_cfa_register (%rbp)
51 andq $-64, %rsp
52 subq $320, %rsp
53 movaps %xmm0, %xmm4
54 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
55 movups __sHalfPI(%rax), %xmm1
56 movups __sRShifter(%rax), %xmm5
57
58/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
59 addps %xmm4, %xmm1
60
61/*
62 1) Range reduction to [-Pi/2; +Pi/2] interval
63 c) Getting octant Y by 1/Pi multiplication
64 d) Add "Right Shifter" (0x4B000000) value
65 */
66 mulps __sInvPI(%rax), %xmm1
67 movups __sPI1(%rax), %xmm6
68 addps %xmm5, %xmm1
69
70/*
71 e) Treat obtained value as integer for destination sign setting.
72 Shift first bit of this value to the last (sign) position (S << 31)
73 */
74 movaps %xmm1, %xmm2
75
76/* f) Subtract "Right Shifter" (0x4B000000) value */
77 subps %xmm5, %xmm1
78 movups __sPI2(%rax), %xmm7
79 pslld $31, %xmm2
80 movups __sPI3(%rax), %xmm5
81 movups __sAbsMask(%rax), %xmm3
82
83/* Check for large and special arguments */
84 andps %xmm4, %xmm3
85
86/* g) Subtract 0.5 from result for octant correction */
87 subps __sOneHalf(%rax), %xmm1
88 cmpnleps __sRangeReductionVal(%rax), %xmm3
89
90/*
91 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
92 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
93 */
94 mulps %xmm1, %xmm6
95 mulps %xmm1, %xmm7
96 mulps %xmm1, %xmm5
97 subps %xmm6, %xmm0
98 movmskps %xmm3, %ecx
99 movups __sPI4(%rax), %xmm6
100 subps %xmm7, %xmm0
101 mulps %xmm6, %xmm1
102 subps %xmm5, %xmm0
103 subps %xmm1, %xmm0
104
105/* a) Calculate X^2 = X * X */
106 movaps %xmm0, %xmm1
107 mulps %xmm0, %xmm1
108
109/*
110 3) Destination sign setting
111 a) Set shifted destination sign using XOR operation:
112 R = XOR( R, S );
113 */
114 xorps %xmm2, %xmm0
115 movups __sA9(%rax), %xmm2
116
117/*
118 b) Calculate polynomial:
119 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
120 */
121 mulps %xmm1, %xmm2
122 addps __sA7(%rax), %xmm2
123 mulps %xmm1, %xmm2
124 addps __sA5(%rax), %xmm2
125 mulps %xmm1, %xmm2
126 addps __sA3(%rax), %xmm2
127 mulps %xmm2, %xmm1
128 mulps %xmm0, %xmm1
129 addps %xmm1, %xmm0
130 testl %ecx, %ecx
131 jne .LBL_1_3
132
133.LBL_1_2:
134 cfi_remember_state
135 movq %rbp, %rsp
136 cfi_def_cfa_register (%rsp)
137 popq %rbp
138 cfi_adjust_cfa_offset (-8)
139 cfi_restore (%rbp)
140 ret
141
142.LBL_1_3:
143 cfi_restore_state
144 movups %xmm4, 192(%rsp)
145 movups %xmm0, 256(%rsp)
146 je .LBL_1_2
147
148 xorb %dl, %dl
149 xorl %eax, %eax
150 movups %xmm8, 112(%rsp)
151 movups %xmm9, 96(%rsp)
152 movups %xmm10, 80(%rsp)
153 movups %xmm11, 64(%rsp)
154 movups %xmm12, 48(%rsp)
155 movups %xmm13, 32(%rsp)
156 movups %xmm14, 16(%rsp)
157 movups %xmm15, (%rsp)
158 movq %rsi, 136(%rsp)
159 movq %rdi, 128(%rsp)
160 movq %r12, 168(%rsp)
161 cfi_offset_rel_rsp (12, 168)
162 movb %dl, %r12b
163 movq %r13, 160(%rsp)
164 cfi_offset_rel_rsp (13, 160)
165 movl %ecx, %r13d
166 movq %r14, 152(%rsp)
167 cfi_offset_rel_rsp (14, 152)
168 movl %eax, %r14d
169 movq %r15, 144(%rsp)
170 cfi_offset_rel_rsp (15, 144)
171 cfi_remember_state
172
173.LBL_1_6:
174 btl %r14d, %r13d
175 jc .LBL_1_12
176
177.LBL_1_7:
178 lea 1(%r14), %esi
179 btl %esi, %r13d
180 jc .LBL_1_10
181
182.LBL_1_8:
183 incb %r12b
184 addl $2, %r14d
185 cmpb $16, %r12b
186 jb .LBL_1_6
187
188 movups 112(%rsp), %xmm8
189 movups 96(%rsp), %xmm9
190 movups 80(%rsp), %xmm10
191 movups 64(%rsp), %xmm11
192 movups 48(%rsp), %xmm12
193 movups 32(%rsp), %xmm13
194 movups 16(%rsp), %xmm14
195 movups (%rsp), %xmm15
196 movq 136(%rsp), %rsi
197 movq 128(%rsp), %rdi
198 movq 168(%rsp), %r12
199 cfi_restore (%r12)
200 movq 160(%rsp), %r13
201 cfi_restore (%r13)
202 movq 152(%rsp), %r14
203 cfi_restore (%r14)
204 movq 144(%rsp), %r15
205 cfi_restore (%r15)
206 movups 256(%rsp), %xmm0
207 jmp .LBL_1_2
208
209.LBL_1_10:
210 cfi_restore_state
211 movzbl %r12b, %r15d
212 movss 196(%rsp,%r15,8), %xmm0
213
214 call JUMPTARGET(cosf)
215
216 movss %xmm0, 260(%rsp,%r15,8)
217 jmp .LBL_1_8
218
219.LBL_1_12:
220 movzbl %r12b, %r15d
221 movss 192(%rsp,%r15,8), %xmm0
222
223 call JUMPTARGET(cosf)
224
225 movss %xmm0, 256(%rsp,%r15,8)
226 jmp .LBL_1_7
227END (_ZGVbN4v_cosf_sse4)
228

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S