svml_s_cosf8_core_avx2.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S]

/* Function cosf vectorized with AVX2.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19
20	#include <sysdep.h>
21	#include "svml_s_trig_data.h"
22
/* _ZGVdN8v_cosf_avx2 -- cosf on eight packed floats (AVX2 + FMA).

   Syntax: GNU as, AT&T operand order, run through the C preprocessor.

   In:   %ymm0 = X, eight single-precision lanes.
   Out:  %ymm0 = the result for each lane.

   Fast path: one polynomial evaluation covers all eight lanes.
   Slow path: every lane for which "|X| <= __sRangeReductionVal" does
   not hold (too large, or NaN) is recomputed by a scalar call to cosf
   and patched into the vector result.

   Frame: 448 bytes below a 64-byte aligned %rsp, so %rsp stays aligned
   at each call.  Only the slow path stores to it:
       0 .. 255   %ymm15 .. %ymm8, 32 bytes each, %ymm15 at offset 0
     256, 264     %rdi, %rsi
     272 .. 296   %r15, %r14, %r13, %r12
     320 .. 351   copy of the input vector
     384 .. 415   result vector; flagged lanes are overwritten in place

   All constants are read from the __svml_s_trig_data table.  */

	.section .text.avx2, "ax", @progbits
ENTRY (_ZGVdN8v_cosf_avx2)
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) |X| is formed with an AND mask; in this variant it is used
         only for the large/special argument test
      b) Add Pi/2 value to argument X for Cos to Sin transformation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Subtract "Right Shifter" value
      g) Subtract 0.5 from result for octant correction
      h) Subtract Y*PI from X argument, where PI is divided into
         three parts in this FMA variant:
           X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
           R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * A9)))
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
           R = XOR( R, S );
 */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$448, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	vmovaps	%ymm0, %ymm2			/* ymm2 = X, kept for the slow path.  */
	vmovups	__sRShifter(%rax), %ymm5
	vmovups	__sPI1_FMA(%rax), %ymm7

	/* b) Add Pi/2 value to argument X for Cos to Sin transformation.  */
	vaddps	__sHalfPI(%rax), %ymm2, %ymm4

	/* c) Getting octant Y by 1/Pi multiplication
	   d) Add "Right Shifter" (0x4B000000) value:
	      ymm4 = ymm4 * InvPI + RShifter.  */
	vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4

	/* f) Subtract "Right Shifter" (0x4B000000) value.  */
	vsubps	%ymm5, %ymm4, %ymm6

	/* e) Treat obtained value as integer for destination sign setting.
	      Shift first bit of this value to the last (sign) position
	      (S << 31).  */
	vpslld	$31, %ymm4, %ymm0

	/* g) Subtract 0.5 from result for octant correction; ymm4 = Y.  */
	vsubps	__sOneHalf(%rax), %ymm6, %ymm4

	/* Check for large and special arguments.  The predicate is
	   "not (|X| <= limit)", unordered, so NaN lanes are flagged too.  */
	vandps	__sAbsMask(%rax), %ymm2, %ymm3
	vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1

	/* h) Subtract Y*PI from X argument, PI split into three parts:
	      X = X - Y*PI1 - Y*PI2 - Y*PI3.  The last step leaves the
	      reduced argument in ymm4.  */
	vmovaps	%ymm2, %ymm3
	vfnmadd231ps %ymm4, %ymm7, %ymm3
	vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
	vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4

	/* 2a) Calculate X^2 = X * X.  */
	vmulps	%ymm4, %ymm4, %ymm5

	/* 3a) Set shifted destination sign using XOR operation:
	       R = XOR( R, S ).  Applied to the reduced argument before
	       the final multiply-add, so ymm6 = signed X.  */
	vxorps	%ymm0, %ymm4, %ymm6
	vmovups	__sA9_FMA(%rax), %ymm0

	/* 2b) Calculate polynomial (Horner form in X^2):
	       R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * A9))).  */
	vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
	vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
	vfmadd213ps __sA3(%rax), %ymm5, %ymm0
	vmulps	%ymm5, %ymm0, %ymm0
	vmovmskps %ymm1, %ecx			/* ecx = 8-bit mask of flagged lanes.  */
	vfmadd213ps %ymm6, %ymm6, %ymm0		/* ymm0 = ymm6 * ymm0 + ymm6.  */
	testl	%ecx, %ecx
	jne	.LBL_1_3			/* Some lane needs scalar cosf.  */

.LBL_1_2:
	/* Common exit; the result is in ymm0.  */
	cfi_remember_state
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	/* Slow path.  This label is reached only through the jne above,
	   so the lane mask in ecx is known to be non-zero here.  */
	cfi_restore_state
	vmovups	%ymm2, 320(%rsp)		/* Input lanes.  */
	vmovups	%ymm0, 384(%rsp)		/* Fast-path results.  */

	xorb	%dl, %dl
	xorl	%eax, %eax

	/* Each call is preceded by vzeroupper and may clobber vector
	   registers, so ymm8-ymm15 are kept on the stack for the caller.  */
	vmovups	%ymm8, 224(%rsp)
	vmovups	%ymm9, 192(%rsp)
	vmovups	%ymm10, 160(%rsp)
	vmovups	%ymm11, 128(%rsp)
	vmovups	%ymm12, 96(%rsp)
	vmovups	%ymm13, 64(%rsp)
	vmovups	%ymm14, 32(%rsp)
	vmovups	%ymm15, (%rsp)
	movq	%rsi, 264(%rsp)
	movq	%rdi, 256(%rsp)
	movq	%r12, 296(%rsp)
	cfi_offset_rel_rsp (12, 296)
	movb	%dl, %r12b			/* r12b = lane-pair index, from 0.  */
	movq	%r13, 288(%rsp)
	cfi_offset_rel_rsp (13, 288)
	movl	%ecx, %r13d			/* r13d = mask of flagged lanes.  */
	movq	%r14, 280(%rsp)
	cfi_offset_rel_rsp (14, 280)
	movl	%eax, %r14d			/* r14d = bit index = 2 * pair.  */
	movq	%r15, 272(%rsp)
	cfi_offset_rel_rsp (15, 272)
	cfi_remember_state

.LBL_1_6:
	/* Even lane of the current pair.  */
	btl	%r14d, %r13d
	jc	.LBL_1_12

.LBL_1_7:
	/* Odd lane of the current pair.  */
	lea	1(%r14), %esi
	btl	%esi, %r13d
	jc	.LBL_1_10

.LBL_1_8:
	incb	%r12b
	addl	$2, %r14d
	/* vmovmskps on a ymm source sets only mask bits 0-7, so four
	   lane pairs cover every bit that can be set.  */
	cmpb	$4, %r12b
	jb	.LBL_1_6

	/* All flagged lanes patched: restore saved registers and reload
	   the merged result vector.  */
	vmovups	224(%rsp), %ymm8
	vmovups	192(%rsp), %ymm9
	vmovups	160(%rsp), %ymm10
	vmovups	128(%rsp), %ymm11
	vmovups	96(%rsp), %ymm12
	vmovups	64(%rsp), %ymm13
	vmovups	32(%rsp), %ymm14
	vmovups	(%rsp), %ymm15
	vmovups	384(%rsp), %ymm0
	movq	264(%rsp), %rsi
	movq	256(%rsp), %rdi
	movq	296(%rsp), %r12
	cfi_restore (%r12)
	movq	288(%rsp), %r13
	cfi_restore (%r13)
	movq	280(%rsp), %r14
	cfi_restore (%r14)
	movq	272(%rsp), %r15
	cfi_restore (%r15)
	jmp	.LBL_1_2

.LBL_1_10:
	/* Odd lane: input at 324 + 8 * pair, result at 388 + 8 * pair.  */
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	324(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 388(%rsp,%r15,8)
	jmp	.LBL_1_8

.LBL_1_12:
	/* Even lane: input at 320 + 8 * pair, result at 384 + 8 * pair.  */
	movzbl	%r12b, %r15d
	vmovss	320(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 384(%rsp,%r15,8)
	jmp	.LBL_1_7

END (_ZGVdN8v_cosf_avx2)
216

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S