/* Function cos vectorized with SSE4.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>
#include "svml_d_trig_data.h"

	.section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN2v_cos_sse4)
/* double2 _ZGVbN2v_cos_sse4 (double2 x)
   SysV AMD64 vector ABI ('b' = SSE variant, N = 2 lanes):
   In:   xmm0 = two double arguments.
   Out:  xmm0 = lane-wise cos() of the input.
   Uses __svml_d_trig_data constant table (loaded via GOT into rax).
   Lanes with |x + Pi/2| above __dRangeVal take a slow path that spills
   state to the stack and calls the scalar cos() per affected lane.

   ALGORITHM DESCRIPTION:

   ( low accuracy ( < 4ulp ) or enhanced performance
   ( half of correct mantissa ) implementation )

   Argument representation:
   arg + Pi/2 = (N*Pi + R)

   Result calculation:
   cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
   sin(R) is approximated by corresponding polynomial
 */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 64-byte-aligned 320-byte frame; only the slow path uses it:
	   0..127   spill area for xmm8-xmm15
	   128/136  saved rdi / rsi
	   144..175 saved r15/r14/r13/r12
	   192      copy of the original arguments
	   256      fast-path results (patched per-lane by the slow path).  */
	andq	$-64, %rsp
	subq	$320, %rsp
	movaps	%xmm0, %xmm3	/* xmm3 = original x, kept for the slow path */
	movq	__svml_d_trig_data@GOTPCREL(%rip), %rax
	movups	__dHalfPI(%rax), %xmm2

/* ARGUMENT RANGE REDUCTION:
   Add Pi/2 to argument: X' = X+Pi/2
 */
	addpd	%xmm3, %xmm2	/* xmm2 = X' = x + Pi/2 */
	movups	__dInvPI(%rax), %xmm5
	movups	__dAbsMask(%rax), %xmm4

/* Get absolute argument value: X' = |X'| */
	andps	%xmm2, %xmm4	/* xmm4 = |X'| (sign bit cleared) */

/* Y = X'*InvPi + RS : right shifter add */
	mulpd	%xmm5, %xmm2

/* Check for large arguments path: lane mask where |X'| > RangeVal */
	cmpnlepd __dRangeVal(%rax), %xmm4
	movups	__dRShifter(%rax), %xmm6
	addpd	%xmm6, %xmm2	/* xmm2 = Y; integer part now in low bits */
	movmskpd %xmm4, %ecx	/* ecx bits 0-1: lanes needing scalar cos */

/* N = Y - RS : right shifter sub */
	movaps	%xmm2, %xmm1

/* SignRes = Y<<63 : shift LSB to MSB place for result sign
   ((-1)^N as a sign bit to xor into the final polynomial).  */
	psllq	$63, %xmm2
	subpd	%xmm6, %xmm1	/* xmm1 = N (rounded quotient X'/Pi) */

/* N = N - 0.5 (absorbs the Pi/2 shift into the multiple of Pi) */
	subpd	__dOneHalf(%rax), %xmm1
	movups	__dPI1(%rax), %xmm7

/* R = X - N*Pi1 (Pi split into 4 parts for extra-precise reduction;
   xmm0 still holds the original argument here).  */
	mulpd	%xmm1, %xmm7
	movups	__dPI2(%rax), %xmm4

/* R = R - N*Pi2 */
	mulpd	%xmm1, %xmm4
	subpd	%xmm7, %xmm0
	movups	__dPI3(%rax), %xmm5

/* R = R - N*Pi3 */
	mulpd	%xmm1, %xmm5
	subpd	%xmm4, %xmm0

/* R = R - N*Pi4 */
	movups	__dPI4(%rax), %xmm6
	mulpd	%xmm6, %xmm1
	subpd	%xmm5, %xmm0
	subpd	%xmm1, %xmm0	/* xmm0 = reduced argument R */

/* POLYNOMIAL APPROXIMATION: R2 = R*R
   Horner evaluation of an odd sin polynomial in R.  */
	movaps	%xmm0, %xmm4
	mulpd	%xmm0, %xmm4	/* xmm4 = R2 */
	movups	__dC7(%rax), %xmm1
	mulpd	%xmm4, %xmm1
	addpd	__dC6(%rax), %xmm1
	mulpd	%xmm4, %xmm1
	addpd	__dC5(%rax), %xmm1
	mulpd	%xmm4, %xmm1
	addpd	__dC4(%rax), %xmm1

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
	mulpd	%xmm4, %xmm1
	addpd	__dC3(%rax), %xmm1

/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
	mulpd	%xmm4, %xmm1
	addpd	__dC2(%rax), %xmm1
	mulpd	%xmm4, %xmm1
	addpd	__dC1(%rax), %xmm1
	mulpd	%xmm1, %xmm4
	mulpd	%xmm0, %xmm4
	addpd	%xmm4, %xmm0	/* xmm0 = sin(R) approximation */

/* RECONSTRUCTION:
   Final sign setting: Res = Poly^SignRes ((-1)^N applied via sign bit) */
	xorps	%xmm2, %xmm0
	testl	%ecx, %ecx	/* any lane flagged as large-argument? */
	jne	.LBL_1_3

.LBL_1_2:
	/* Fast-path (and slow-path rejoin) epilogue: result in xmm0.  */
	cfi_remember_state
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	/* Slow path: stash original args and fast-path results; lanes whose
	   mask bit is set get recomputed with the scalar cos below.  */
	cfi_restore_state
	movups	%xmm3, 192(%rsp)	/* original arguments */
	movups	%xmm0, 256(%rsp)	/* fast-path results */
	/* Never taken: we arrive via jne, so ZF is clear and movups does not
	   modify flags.  Kept as generated.  */
	je	.LBL_1_2

	/* dl = pair counter, eax = lane index; save all registers the
	   scalar cos call may clobber (SysV: xmm8-15 are volatile but the
	   outer vector-ABI caller expects them preserved here).  */
	xorb	%dl, %dl
	xorl	%eax, %eax
	movups	%xmm8, 112(%rsp)
	movups	%xmm9, 96(%rsp)
	movups	%xmm10, 80(%rsp)
	movups	%xmm11, 64(%rsp)
	movups	%xmm12, 48(%rsp)
	movups	%xmm13, 32(%rsp)
	movups	%xmm14, 16(%rsp)
	movups	%xmm15, (%rsp)
	movq	%rsi, 136(%rsp)
	movq	%rdi, 128(%rsp)
	movq	%r12, 168(%rsp)
	cfi_offset_rel_rsp (12, 168)
	movb	%dl, %r12b	/* r12b = current pair (iteration) counter */
	movq	%r13, 160(%rsp)
	cfi_offset_rel_rsp (13, 160)
	movl	%ecx, %r13d	/* r13d = lane mask of large arguments */
	movq	%r14, 152(%rsp)
	cfi_offset_rel_rsp (14, 152)
	movl	%eax, %r14d	/* r14d = index of even lane of current pair */
	movq	%r15, 144(%rsp)
	cfi_offset_rel_rsp (15, 144)
	cfi_remember_state

.LBL_1_6:
	/* Per-pair loop: test even lane bit; loop shape is shared with
	   wider-vector variants, so only the low mask bits can be set here
	   and later iterations fall straight through.  */
	btl	%r14d, %r13d
	jc	.LBL_1_12

.LBL_1_7:
	/* Test odd lane of the pair.  */
	lea	1(%r14), %esi
	btl	%esi, %r13d
	jc	.LBL_1_10

.LBL_1_8:
	/* Advance to next pair: one 16-byte stack slot, two lane indices.  */
	incb	%r12b
	addl	$2, %r14d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	/* All lanes handled: restore saved state, reload merged results.  */
	movups	112(%rsp), %xmm8
	movups	96(%rsp), %xmm9
	movups	80(%rsp), %xmm10
	movups	64(%rsp), %xmm11
	movups	48(%rsp), %xmm12
	movups	32(%rsp), %xmm13
	movups	16(%rsp), %xmm14
	movups	(%rsp), %xmm15
	movq	136(%rsp), %rsi
	movq	128(%rsp), %rdi
	movq	168(%rsp), %r12
	cfi_restore (%r12)
	movq	160(%rsp), %r13
	cfi_restore (%r13)
	movq	152(%rsp), %r14
	cfi_restore (%r14)
	movq	144(%rsp), %r15
	cfi_restore (%r15)
	movups	256(%rsp), %xmm0	/* final result vector */
	jmp	.LBL_1_2

.LBL_1_10:
	/* Odd lane of pair r12b: arg at 192+16*pair+8, result at 256+...+8 */
	cfi_restore_state
	movzbl	%r12b, %r15d
	shlq	$4, %r15	/* r15 = pair * 16 bytes (two doubles) */
	movsd	200(%rsp,%r15), %xmm0

	call	JUMPTARGET(cos)	/* scalar libm cos for this lane */

	movsd	%xmm0, 264(%rsp,%r15)
	jmp	.LBL_1_8

.LBL_1_12:
	/* Even lane of pair r12b: arg at 192+16*pair, result at 256+16*pair */
	movzbl	%r12b, %r15d
	shlq	$4, %r15
	movsd	192(%rsp,%r15), %xmm0

	call	JUMPTARGET(cos)	/* scalar libm cos for this lane */

	movsd	%xmm0, 256(%rsp,%r15)
	jmp	.LBL_1_7

END (_ZGVbN2v_cos_sse4)
224

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S