1/* Function log vectorized with SSE4.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_d_log_data.h"
21
	.section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN2v_log_sse4)
/*
   Vector log for two packed doubles in %xmm0 (SysV vector ABI name
   _ZGVbN2v_log_sse4: 'b' = SSE4 ISA level, 'N2' = 2 lanes, 'v' = one
   vector argument).  Result returned in %xmm0.

   ALGORITHM DESCRIPTION:

   log(x) = -log(Rcp) + log(Rcp*x),
   where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding
   HW approximation to 1+9 mantissa bits)

   Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial

   log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
   -log(mantissa_Rcp) is obtained from a lookup table,
   accessed by a 9-bit index
 */
	pushq %rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq %rsp, %rbp
	cfi_def_cfa_register (%rbp)
/* 64-byte-align the stack and reserve 320 bytes of scratch: spill area
   for the slow (special-case) path below — saved xmm8-xmm15, rsi, rdi,
   r12-r15, plus the input vector at 192(%rsp) and result at 256(%rsp). */
	andq $-64, %rsp
	subq $320, %rsp
	movaps %xmm0, %xmm6
/* r8 = base of the shared double-precision log data tables. */
	movq __svml_dlog_data@GOTPCREL(%rip), %r8
	movaps %xmm6, %xmm3
	movaps %xmm6, %xmm2

/* isolate exponent bits */
	movaps %xmm6, %xmm1
	psrlq $20, %xmm1
	movups _ExpMask(%r8), %xmm5

/* preserve mantissa, set input exponent to 2^(-10) */
	andps %xmm6, %xmm5
	orps _Two10(%r8), %xmm5

/* reciprocal approximation good to at least 11 bits */
	cvtpd2ps %xmm5, %xmm7
/* xmm3 = mask of lanes with x < MinNorm (zero/negative/denormal);
   xmm2 = mask of lanes with !(x <= MaxNorm) (Inf/NaN/too large). */
	cmpltpd _MinNorm(%r8), %xmm3
	cmpnlepd _MaxNorm(%r8), %xmm2
/* duplicate the two single-precision values into both 64-bit halves
   so rcpps produces valid results for cvtps2pd below. */
	movlhps %xmm7, %xmm7

/* combine and get argument value range mask */
	orps %xmm2, %xmm3
	rcpps %xmm7, %xmm0
/* eax = 2-bit lane mask; nonzero means some lane needs the scalar
   fallback path. */
	movmskpd %xmm3, %eax
	movups _HalfMask(%r8), %xmm2

/* argument reduction started: R = Mantissa*Rcp - 1 */
	andps %xmm5, %xmm2
	cvtps2pd %xmm0, %xmm4
/* split mantissa: xmm2 = high part, xmm5 = low part (exact remainder),
   so the products below can be combined without rounding error. */
	subpd %xmm2, %xmm5

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
	roundpd $0, %xmm4, %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm5
/* R = (hi*Rcp - 1) + lo*Rcp */
	subpd _One(%r8), %xmm2
	addpd %xmm2, %xmm5
	movups _Threshold(%r8), %xmm2

/* calculate index for table lookup */
	movaps %xmm4, %xmm3
/* xmm2 = mask of lanes where Rcp exceeds _Threshold, i.e. lanes whose
   mantissa fell in the table's "large" half — used below to adjust the
   biased exponent (see Bias/Bias1 note). */
	cmpltpd %xmm4, %xmm2
/* gather the two high dwords (exponent words) of both lanes into the
   low qword of xmm7 (shuffle selector 221 = 0b11011101 -> dwords 1,3,1,3). */
	pshufd $221, %xmm1, %xmm7
/* table index: top mantissa bits of the rounded reciprocal, shifted so
   the index is pre-scaled for 8-byte _LogRcp_lookup entries.  */
	psrlq $40, %xmm3

/* convert biased exponent to DP format */
	cvtdq2pd %xmm7, %xmm0
/* edx = lookup index for lane 0 */
	movd %xmm3, %edx
	movups _poly_coeff_1(%r8), %xmm4

/* polynomial computation */
	mulpd %xmm5, %xmm4
	andps _Bias(%r8), %xmm2
	orps _Bias1(%r8), %xmm2

/*
   Table stores -log(0.5*mantissa) for larger mantissas,
   adjust exponent accordingly
 */
	subpd %xmm2, %xmm0
	addpd _poly_coeff_2(%r8), %xmm4

/* exponent*log(2.0) */
	mulpd _L2(%r8), %xmm0
/* xmm2 = R^2 */
	movaps %xmm5, %xmm2
	mulpd %xmm5, %xmm2
	movups _poly_coeff_3(%r8), %xmm7
	mulpd %xmm5, %xmm7
	mulpd %xmm2, %xmm4
	addpd _poly_coeff_4(%r8), %xmm7
/* poly = R^2 * ((c1*R + c2)*R^2 + (c3*R + c4)) */
	addpd %xmm4, %xmm7
	mulpd %xmm7, %xmm2
	movslq %edx, %rdx
/* ecx = lookup index for lane 1 (dword 2 of xmm3) */
	pextrd $2, %xmm3, %ecx

/*
   reconstruction:
   (exponent*log(2)) + (LogRcp + (R+poly))
 */
	addpd %xmm2, %xmm5
	movslq %ecx, %rcx
/* build the -log(mantissa_Rcp) vector from the two table entries:
   lane 0 into the low half, lane 1 into the high half. */
	movsd _LogRcp_lookup(%r8,%rdx), %xmm1
	movhpd _LogRcp_lookup(%r8,%rcx), %xmm1
	addpd %xmm5, %xmm1
	addpd %xmm1, %xmm0
/* any special-case lanes?  (mask computed before the main path ran) */
	testl %eax, %eax
	jne .LBL_1_3

.LBL_1_2:
/* Fast-path return: result already complete in xmm0. */
	cfi_remember_state
	movq %rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq %rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
/* Spill input vector and the (partially wrong) fast-path result; the
   scalar fallback overwrites only the offending lanes in place. */
	movups %xmm6, 192(%rsp)
	movups %xmm0, 256(%rsp)
/* NOTE(review): flags still hold testl %eax,%eax from above and we only
   reach here via jne, so ZF=0 and this je is never taken — appears to be
   dead code from the generated SVML template.  */
	je .LBL_1_2

/* cl = lane-pair counter, edx = current bit index into the lane mask. */
	xorb %cl, %cl
	xorl %edx, %edx
/* Save all xmm regs and the GPRs clobbered by the scalar-call loop;
   xmm8-xmm15 are caller-saved in the base ABI but must be preserved
   under the vector-function ABI this entry point follows. */
	movups %xmm8, 112(%rsp)
	movups %xmm9, 96(%rsp)
	movups %xmm10, 80(%rsp)
	movups %xmm11, 64(%rsp)
	movups %xmm12, 48(%rsp)
	movups %xmm13, 32(%rsp)
	movups %xmm14, 16(%rsp)
	movups %xmm15, (%rsp)
	movq %rsi, 136(%rsp)
	movq %rdi, 128(%rsp)
	movq %r12, 168(%rsp)
	cfi_offset_rel_rsp (12, 168)
	movb %cl, %r12b
	movq %r13, 160(%rsp)
	cfi_offset_rel_rsp (13, 160)
/* r13d = lane mask of special cases; r14d = bit index; r12b = iteration
   counter. */
	movl %eax, %r13d
	movq %r14, 152(%rsp)
	cfi_offset_rel_rsp (14, 152)
	movl %edx, %r14d
	movq %r15, 144(%rsp)
	cfi_offset_rel_rsp (15, 144)
	cfi_remember_state

/* Scalar-fallback loop: each iteration tests two mask bits (an even/odd
   lane pair) and calls scalar log() for any lane that is set. */
.LBL_1_6:
	btl %r14d, %r13d
	jc .LBL_1_12

.LBL_1_7:
	lea 1(%r14), %esi
	btl %esi, %r13d
	jc .LBL_1_10

.LBL_1_8:
	incb %r12b
	addl $2, %r14d
/* NOTE(review): the bound of 16 pair-iterations comes from the shared
   SVML template; for this 2-lane kernel only bits 0-1 of r13d can be
   set, so iterations past the first perform no calls.  */
	cmpb $16, %r12b
	jb .LBL_1_6

/* Restore everything saved above and reload the patched result vector. */
	movups 112(%rsp), %xmm8
	movups 96(%rsp), %xmm9
	movups 80(%rsp), %xmm10
	movups 64(%rsp), %xmm11
	movups 48(%rsp), %xmm12
	movups 32(%rsp), %xmm13
	movups 16(%rsp), %xmm14
	movups (%rsp), %xmm15
	movq 136(%rsp), %rsi
	movq 128(%rsp), %rdi
	movq 168(%rsp), %r12
	cfi_restore (%r12)
	movq 160(%rsp), %r13
	cfi_restore (%r13)
	movq 152(%rsp), %r14
	cfi_restore (%r14)
	movq 144(%rsp), %r15
	cfi_restore (%r15)
	movups 256(%rsp), %xmm0
	jmp .LBL_1_2

.LBL_1_10:
	cfi_restore_state
/* Odd lane of the pair: element offset = r12b * 16 bytes; +8 selects the
   high double of the 16-byte slot (192+8 = 200 input, 256+8 = 264 output). */
	movzbl %r12b, %r15d
	shlq $4, %r15
	movsd 200(%rsp,%r15), %xmm0

	call JUMPTARGET(log)

	movsd %xmm0, 264(%rsp,%r15)
	jmp .LBL_1_8

.LBL_1_12:
/* Even lane of the pair: low double of the 16-byte slot. */
	movzbl %r12b, %r15d
	shlq $4, %r15
	movsd 192(%rsp,%r15), %xmm0

	call JUMPTARGET(log)

	movsd %xmm0, 256(%rsp,%r15)
	jmp .LBL_1_7

END (_ZGVbN2v_log_sse4)
230

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S