svml_s_cosf4_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S]

1	/ Function cosf vectorized with SSE4.*
2	Copyright (C) 2014-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <sysdep.h>
20	#include "svml_s_trig_data.h"
21
22	.text
23	ENTRY (_ZGVbN4v_cosf_sse4)
24	/*
25	ALGORITHM DESCRIPTION:
26
27	1) Range reduction to [-Pi/2; +Pi/2] interval
28	a) We remove sign using AND operation
29	b) Add Pi/2 value to argument X for Cos to Sin transformation
30	c) Getting octant Y by 1/Pi multiplication
31	d) Add "Right Shifter" value
32	e) Treat obtained value as integer for destination sign setting.
33	Shift first bit of this value to the last (sign) position
34	f) Subtract "Right Shifter" value
35	g) Subtract 0.5 from result for octant correction
36	h) Subtract YPI from X argument, where PI divided to 4 parts:*
37	X = X - YPI1 - YPI2 - YPI3 - YPI4;
38	2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
39	a) Calculate X^2 = X X*
40	b) Calculate polynomial:
41	R = X + X X^2 * (A3 + x^2 * (A5 + .....*
42	3) Destination sign setting
43	a) Set shifted destination sign using XOR operation:
44	R = XOR( R, S );
45	*/
46	pushq %rbp
47	cfi_adjust_cfa_offset (`8`)
48	cfi_rel_offset (%rbp, `0`)
49	movq %rsp, %rbp
50	cfi_def_cfa_register (%rbp)
51	andq $-`64`, %rsp
52	subq $`320`, %rsp
53	movaps %xmm0, %xmm4
54	movq __svml_s_trig_data@GOTPCREL(%rip), %rax
55	movups __sHalfPI(%rax), %xmm1
56	movups __sRShifter(%rax), %xmm5
57
58	/ b) Add Pi/2 value to argument X for Cos to Sin transformation /
59	addps %xmm4, %xmm1
60
61	/*
62	1) Range reduction to [-Pi/2; +Pi/2] interval
63	c) Getting octant Y by 1/Pi multiplication
64	d) Add "Right Shifter" (0x4B000000) value
65	*/
66	mulps __sInvPI(%rax), %xmm1
67	movups __sPI1(%rax), %xmm6
68	addps %xmm5, %xmm1
69
70	/*
71	e) Treat obtained value as integer for destination sign setting.
72	Shift first bit of this value to the last (sign) position (S << 31)
73	*/
74	movaps %xmm1, %xmm2
75
76	/ f) Subtract "Right Shifter" (0x4B000000) value /
77	subps %xmm5, %xmm1
78	movups __sPI2(%rax), %xmm7
79	pslld $`31`, %xmm2
80	movups __sPI3(%rax), %xmm5
81	movups __sAbsMask(%rax), %xmm3
82
83	/ Check for large and special arguments /
84	andps %xmm4, %xmm3
85
86	/ g) Subtract 0.5 from result for octant correction /
87	subps __sOneHalf(%rax), %xmm1
88	cmpnleps __sRangeReductionVal(%rax), %xmm3
89
90	/*
91	h) Subtract YPI from X argument, where PI divided to 4 parts:*
92	X = X - YPI1 - YPI2 - YPI3 - YPI4;
93	*/
94	mulps %xmm1, %xmm6
95	mulps %xmm1, %xmm7
96	mulps %xmm1, %xmm5
97	subps %xmm6, %xmm0
98	movmskps %xmm3, %ecx
99	movups __sPI4(%rax), %xmm6
100	subps %xmm7, %xmm0
101	mulps %xmm6, %xmm1
102	subps %xmm5, %xmm0
103	subps %xmm1, %xmm0
104
105	/ a) Calculate X^2 = X * X /
106	movaps %xmm0, %xmm1
107	mulps %xmm0, %xmm1
108
109	/*
110	3) Destination sign setting
111	a) Set shifted destination sign using XOR operation:
112	R = XOR( R, S );
113	*/
114	xorps %xmm2, %xmm0
115	movups __sA9(%rax), %xmm2
116
117	/*
118	b) Calculate polynomial:
119	R = X + X X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));*
120	*/
121	mulps %xmm1, %xmm2
122	addps __sA7(%rax), %xmm2
123	mulps %xmm1, %xmm2
124	addps __sA5(%rax), %xmm2
125	mulps %xmm1, %xmm2
126	addps __sA3(%rax), %xmm2
127	mulps %xmm2, %xmm1
128	mulps %xmm0, %xmm1
129	addps %xmm1, %xmm0
130	testl %ecx, %ecx
131	jne .LBL_1_3
132
133	.LBL_1_2:
134	cfi_remember_state
135	movq %rbp, %rsp
136	cfi_def_cfa_register (%rsp)
137	popq %rbp
138	cfi_adjust_cfa_offset (-`8`)
139	cfi_restore (%rbp)
140	ret
141
142	.LBL_1_3:
143	cfi_restore_state
144	movups %xmm4, `192`(%rsp)
145	movups %xmm0, `256`(%rsp)
146	je .LBL_1_2
147
148	xorb %dl, %dl
149	xorl %eax, %eax
150	movups %xmm8, `112`(%rsp)
151	movups %xmm9, `96`(%rsp)
152	movups %xmm10, `80`(%rsp)
153	movups %xmm11, `64`(%rsp)
154	movups %xmm12, `48`(%rsp)
155	movups %xmm13, `32`(%rsp)
156	movups %xmm14, `16`(%rsp)
157	movups %xmm15, (%rsp)
158	movq %rsi, `136`(%rsp)
159	movq %rdi, `128`(%rsp)
160	movq %r12, `168`(%rsp)
161	cfi_offset_rel_rsp (`12`, `168`)
162	movb %dl, %r12b
163	movq %r13, `160`(%rsp)
164	cfi_offset_rel_rsp (`13`, `160`)
165	movl %ecx, %r13d
166	movq %r14, `152`(%rsp)
167	cfi_offset_rel_rsp (`14`, `152`)
168	movl %eax, %r14d
169	movq %r15, `144`(%rsp)
170	cfi_offset_rel_rsp (`15`, `144`)
171	cfi_remember_state
172
173	.LBL_1_6:
174	btl %r14d, %r13d
175	jc .LBL_1_12
176
177	.LBL_1_7:
178	lea `1`(%r14), %esi
179	btl %esi, %r13d
180	jc .LBL_1_10
181
182	.LBL_1_8:
183	incb %r12b
184	addl $`2`, %r14d
185	cmpb $`16`, %r12b
186	jb .LBL_1_6
187
188	movups `112`(%rsp), %xmm8
189	movups `96`(%rsp), %xmm9
190	movups `80`(%rsp), %xmm10
191	movups `64`(%rsp), %xmm11
192	movups `48`(%rsp), %xmm12
193	movups `32`(%rsp), %xmm13
194	movups `16`(%rsp), %xmm14
195	movups (%rsp), %xmm15
196	movq `136`(%rsp), %rsi
197	movq `128`(%rsp), %rdi
198	movq `168`(%rsp), %r12
199	cfi_restore (%r12)
200	movq `160`(%rsp), %r13
201	cfi_restore (%r13)
202	movq `152`(%rsp), %r14
203	cfi_restore (%r14)
204	movq `144`(%rsp), %r15
205	cfi_restore (%r15)
206	movups `256`(%rsp), %xmm0
207	jmp .LBL_1_2
208
209	.LBL_1_10:
210	cfi_restore_state
211	movzbl %r12b, %r15d
212	movss `196`(%rsp,%r15,`8`), %xmm0
213
214	call JUMPTARGET(cosf)
215
216	movss %xmm0, `260`(%rsp,%r15,`8`)
217	jmp .LBL_1_8
218
219	.LBL_1_12:
220	movzbl %r12b, %r15d
221	movss `192`(%rsp,%r15,`8`), %xmm0
222
223	call JUMPTARGET(cosf)
224
225	movss %xmm0, `256`(%rsp,%r15,`8`)
226	jmp .LBL_1_7
227	END (_ZGVbN4v_cosf_sse4)
228

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S