svml_d_sin2_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S]

/* Function sin vectorized with SSE4.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>
#include "svml_d_trig_data.h"

	.section .text.sse4, "ax", @progbits
/* _ZGVbN2v_sin_sse4: sin of two packed doubles.

   In:   %xmm0 = X (two doubles).
   Out:  %xmm0 = sin (X), lane by lane.

   Fast path: range-reduce |X| against Pi (constants read from
   __svml_d_trig_data), evaluate an odd polynomial in R, then re-apply
   the signs.  Any lane for which |X| <= __dRangeVal does not hold
   (cmpnlepd is also true for unordered operands) sets a bit in %ecx
   and is recomputed with the scalar sin in the fallback path below.

   Stack frame (after realigning %rsp to 64 bytes and reserving 320):
       0..127   saved %xmm15..%xmm8
     128..143   saved %rdi, %rsi
     144..175   saved %r15, %r14, %r13, %r12
     192..207   copy of the input vector X
     256..271   result vector.  */
ENTRY (_ZGVbN2v_sin_sse4)
/* ALGORITHM DESCRIPTION:

      ( low accuracy ( < 4ulp ) or enhanced performance
       ( half of correct mantissa ) implementation )

      Argument representation:
      arg = N*Pi + R

      Result calculation:
      sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
      sin(R) is approximated by corresponding polynomial
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $320, %rsp
        /* Keep the untouched input in %xmm5 for the fallback path.  */
        movaps    %xmm0, %xmm5
        movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
        movups    __dAbsMask(%rax), %xmm3

/*
   ARGUMENT RANGE REDUCTION:
   X' = |X|
 */
        movaps    %xmm3, %xmm4

/* SignX - sign bit of X */
        andnps    %xmm5, %xmm3
        movups    __dInvPI(%rax), %xmm2
        andps     %xmm5, %xmm4

/* Y = X'*InvPi + RS : right shifter add */
        mulpd     %xmm4, %xmm2
        movups    __dRShifter(%rax), %xmm6

/* R = X' - N*Pi1 */
        movaps    %xmm4, %xmm0
        addpd     %xmm6, %xmm2
        /* Per-lane all-ones mask where !(X' <= RangeVal).  */
        cmpnlepd  __dRangeVal(%rax), %xmm4

/* N = Y - RS : right shifter sub */
        movaps    %xmm2, %xmm1

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        psllq     $63, %xmm2
        subpd     %xmm6, %xmm1
        /* %ecx bits 0..1 = lanes that need the scalar fallback.  */
        movmskpd  %xmm4, %ecx
        movups    __dPI1(%rax), %xmm7
        mulpd     %xmm1, %xmm7
        movups    __dPI2(%rax), %xmm6

/* R = R - N*Pi2 */
        mulpd     %xmm1, %xmm6
        subpd     %xmm7, %xmm0
        movups    __dPI3(%rax), %xmm7

/* R = R - N*Pi3 */
        mulpd     %xmm1, %xmm7
        subpd     %xmm6, %xmm0
        movups    __dPI4(%rax), %xmm6

/* R = R - N*Pi4 */
        mulpd     %xmm6, %xmm1
        subpd     %xmm7, %xmm0
        subpd     %xmm1, %xmm0

/*
   POLYNOMIAL APPROXIMATION:
   R2 = R*R
 */
        movaps    %xmm0, %xmm1
        mulpd     %xmm0, %xmm1

/* R = R^SignRes : update sign of reduced argument */
        xorps     %xmm2, %xmm0
        movups    __dC7_sin(%rax), %xmm2
        mulpd     %xmm1, %xmm2
        addpd     __dC6_sin(%rax), %xmm2
        mulpd     %xmm1, %xmm2
        addpd     __dC5_sin(%rax), %xmm2
        mulpd     %xmm1, %xmm2
        addpd     __dC4_sin(%rax), %xmm2

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        mulpd     %xmm1, %xmm2
        addpd     __dC3_sin(%rax), %xmm2

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        mulpd     %xmm1, %xmm2
        addpd     __dC2_sin(%rax), %xmm2
        mulpd     %xmm1, %xmm2
        addpd     __dC1_sin(%rax), %xmm2
        mulpd     %xmm2, %xmm1

/* Poly = Poly*R + R */
        mulpd     %xmm0, %xmm1
        addpd     %xmm1, %xmm0

/*
   RECONSTRUCTION:
   Final sign setting: Res = Poly^SignX
 */
        xorps     %xmm3, %xmm0
        /* Any out-of-range lane?  Then take the scalar fallback.  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: result is in %xmm0; tear down the frame.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Scalar fallback.  Reached only through the jne above, so the mask in
   %ecx is known to be non-zero here; no further zero check is needed.  */
.LBL_1_3:
        cfi_restore_state
        /* Spill the input and the fast-path result so individual lanes
           can be reloaded and patched.  */
        movups    %xmm5, 192(%rsp)
        movups    %xmm0, 256(%rsp)

        xorb      %dl, %dl
        xorl      %eax, %eax
        /* Save registers so they hold the same values on return even
           though sin is called in between.  */
        movups    %xmm8, 112(%rsp)
        movups    %xmm9, 96(%rsp)
        movups    %xmm10, 80(%rsp)
        movups    %xmm11, 64(%rsp)
        movups    %xmm12, 48(%rsp)
        movups    %xmm13, 32(%rsp)
        movups    %xmm14, 16(%rsp)
        movups    %xmm15, (%rsp)
        movq      %rsi, 136(%rsp)
        movq      %rdi, 128(%rsp)
        movq      %r12, 168(%rsp)
        cfi_offset_rel_rsp (12, 168)
        /* %r12b = pass counter (one pass handles two mask bits).  */
        movb      %dl, %r12b
        movq      %r13, 160(%rsp)
        cfi_offset_rel_rsp (13, 160)
        /* %r13d = lane mask.  */
        movl      %ecx, %r13d
        movq      %r14, 152(%rsp)
        cfi_offset_rel_rsp (14, 152)
        /* %r14d = index of the even mask bit tested in this pass.  */
        movl      %eax, %r14d
        movq      %r15, 144(%rsp)
        cfi_offset_rel_rsp (15, 144)
        cfi_remember_state

/* NOTE(review): movmskpd yields only mask bits 0 and 1, so every pass
   after the first finds no set bit; the bound of 16 passes is kept
   unchanged from the original code.  */
.LBL_1_6:
        /* Even bit of the pair set?  Recompute that lane.  */
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        /* Odd bit of the pair set?  Recompute that lane.  */
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        /* All passes done: restore saved registers, reload the patched
           result and leave through the common exit.  */
        movups    112(%rsp), %xmm8
        movups    96(%rsp), %xmm9
        movups    80(%rsp), %xmm10
        movups    64(%rsp), %xmm11
        movups    48(%rsp), %xmm12
        movups    32(%rsp), %xmm13
        movups    16(%rsp), %xmm14
        movups    (%rsp), %xmm15
        movq      136(%rsp), %rsi
        movq      128(%rsp), %rdi
        movq      168(%rsp), %r12
        cfi_restore (%r12)
        movq      160(%rsp), %r13
        cfi_restore (%r13)
        movq      152(%rsp), %r14
        cfi_restore (%r14)
        movq      144(%rsp), %r15
        cfi_restore (%r15)
        movups    256(%rsp), %xmm0
        jmp       .LBL_1_2

/* Odd lane of pair %r12b: input at 200 + 16*pair, result at
   264 + 16*pair.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        movsd     200(%rsp,%r15), %xmm0

        call      JUMPTARGET(sin)

        movsd     %xmm0, 264(%rsp,%r15)
        jmp       .LBL_1_8

/* Even lane of pair %r12b: input at 192 + 16*pair, result at
   256 + 16*pair.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        movsd     192(%rsp,%r15), %xmm0

        call      JUMPTARGET(sin)

        movsd     %xmm0, 256(%rsp,%r15)
        jmp       .LBL_1_7

END (_ZGVbN2v_sin_sse4)
230

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core_sse4.S