1/* Function log vectorized with SSE4.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_d_log_data.h"
21
	.section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN2v_log_sse4)
/*
   Vector log for two packed doubles in %xmm0 (SysV vector ABI name
   _ZGVbN2v_log_sse4: 'b' = SSE4 ISA level, 'N2' = 2 lanes, 'v' = one
   vector argument).  Result returned in %xmm0.

   ALGORITHM DESCRIPTION:

   log(x) = -log(Rcp) + log(Rcp*x),
   where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding
   HW approximation to 1+9 mantissa bits)

   Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial

   log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
   -log(mantissa_Rcp) is obtained from a lookup table,
   accessed by a 9-bit index
 */
	pushq %rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq %rsp, %rbp
	cfi_def_cfa_register (%rbp)
/* 64-byte-align the stack and reserve 320 bytes of scratch: spill area
   for the slow (special-case) path below — saved xmm8-xmm15, rsi, rdi,
   r12-r15, plus the input vector at 192(%rsp) and result at 256(%rsp). */
	andq $-64, %rsp
	subq $320, %rsp
	movaps %xmm0, %xmm6
/* r8 = base of the shared double-precision log data tables. */
	movq __svml_dlog_data@GOTPCREL(%rip), %r8
	movaps %xmm6, %xmm3
	movaps %xmm6, %xmm2

/* isolate exponent bits */
	movaps %xmm6, %xmm1
	psrlq $20, %xmm1
	movups _ExpMask(%r8), %xmm5

/* preserve mantissa, set input exponent to 2^(-10) */
	andps %xmm6, %xmm5
	orps _Two10(%r8), %xmm5

/* reciprocal approximation good to at least 11 bits */
	cvtpd2ps %xmm5, %xmm7
/* xmm3 = mask of lanes with x < MinNorm (zero/negative/denormal);
   xmm2 = mask of lanes with !(x <= MaxNorm) (Inf/NaN/too large). */
	cmpltpd _MinNorm(%r8), %xmm3
	cmpnlepd _MaxNorm(%r8), %xmm2
/* duplicate the two single-precision values into both 64-bit halves
   so rcpps produces valid results for cvtps2pd below. */
	movlhps %xmm7, %xmm7

/* combine and get argument value range mask */
	orps %xmm2, %xmm3
	rcpps %xmm7, %xmm0
/* eax = 2-bit lane mask; nonzero means some lane needs the scalar
   fallback path. */
	movmskpd %xmm3, %eax
	movups _HalfMask(%r8), %xmm2

/* argument reduction started: R = Mantissa*Rcp - 1 */
	andps %xmm5, %xmm2
	cvtps2pd %xmm0, %xmm4
/* split mantissa: xmm2 = high part, xmm5 = low part (exact remainder),
   so the products below can be combined without rounding error. */
	subpd %xmm2, %xmm5

/* round reciprocal to nearest integer, will have 1+9 mantissa bits */
	roundpd $0, %xmm4, %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm5
/* R = (hi*Rcp - 1) + lo*Rcp */
	subpd _One(%r8), %xmm2
	addpd %xmm2, %xmm5
	movups _Threshold(%r8), %xmm2

/* calculate index for table lookup */
	movaps %xmm4, %xmm3
/* xmm2 = mask of lanes where Rcp exceeds _Threshold, i.e. lanes whose
   mantissa fell in the table's "large" half — used below to adjust the
   biased exponent (see Bias/Bias1 note). */
	cmpltpd %xmm4, %xmm2
/* gather the two high dwords (exponent words) of both lanes into the
   low qword of xmm7 (shuffle selector 221 = 0b11011101 -> dwords 1,3,1,3). */
	pshufd $221, %xmm1, %xmm7
/* table index: top mantissa bits of the rounded reciprocal, shifted so
   the index is pre-scaled for 8-byte _LogRcp_lookup entries.  */
	psrlq $40, %xmm3

/* convert biased exponent to DP format */
	cvtdq2pd %xmm7, %xmm0
/* edx = lookup index for lane 0 */
	movd %xmm3, %edx
	movups _poly_coeff_1(%r8), %xmm4

/* polynomial computation */
	mulpd %xmm5, %xmm4
	andps _Bias(%r8), %xmm2
	orps _Bias1(%r8), %xmm2

/*
   Table stores -log(0.5*mantissa) for larger mantissas,
   adjust exponent accordingly
 */
	subpd %xmm2, %xmm0
	addpd _poly_coeff_2(%r8), %xmm4

/* exponent*log(2.0) */
	mulpd _L2(%r8), %xmm0
/* xmm2 = R^2 */
	movaps %xmm5, %xmm2
	mulpd %xmm5, %xmm2
	movups _poly_coeff_3(%r8), %xmm7
	mulpd %xmm5, %xmm7
	mulpd %xmm2, %xmm4
	addpd _poly_coeff_4(%r8), %xmm7
/* poly = R^2 * ((c1*R + c2)*R^2 + (c3*R + c4)) */
	addpd %xmm4, %xmm7
	mulpd %xmm7, %xmm2
	movslq %edx, %rdx
/* ecx = lookup index for lane 1 (dword 2 of xmm3) */
	pextrd $2, %xmm3, %ecx

/*
   reconstruction:
   (exponent*log(2)) + (LogRcp + (R+poly))
 */
	addpd %xmm2, %xmm5
	movslq %ecx, %rcx
/* build the -log(mantissa_Rcp) vector from the two table entries:
   lane 0 into the low half, lane 1 into the high half. */
	movsd _LogRcp_lookup(%r8,%rdx), %xmm1
	movhpd _LogRcp_lookup(%r8,%rcx), %xmm1
	addpd %xmm5, %xmm1
	addpd %xmm1, %xmm0
/* any special-case lanes?  (mask computed before the main path ran) */
	testl %eax, %eax
	jne .LBL_1_3

.LBL_1_2:
/* Fast-path return: result already complete in xmm0. */
	cfi_remember_state
	movq %rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq %rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
/* Spill input vector and the (partially wrong) fast-path result; the
   scalar fallback overwrites only the offending lanes in place. */
	movups %xmm6, 192(%rsp)
	movups %xmm0, 256(%rsp)
/* NOTE(review): flags still hold testl %eax,%eax from above and we only
   reach here via jne, so ZF=0 and this je is never taken — appears to be
   dead code from the generated SVML template.  */
	je .LBL_1_2

/* cl = lane-pair counter, edx = current bit index into the lane mask. */
	xorb %cl, %cl
	xorl %edx, %edx
/* Save all xmm regs and the GPRs clobbered by the scalar-call loop;
   xmm8-xmm15 are caller-saved in the base ABI but must be preserved
   under the vector-function ABI this entry point follows. */
	movups %xmm8, 112(%rsp)
	movups %xmm9, 96(%rsp)
	movups %xmm10, 80(%rsp)
	movups %xmm11, 64(%rsp)
	movups %xmm12, 48(%rsp)
	movups %xmm13, 32(%rsp)
	movups %xmm14, 16(%rsp)
	movups %xmm15, (%rsp)
	movq %rsi, 136(%rsp)
	movq %rdi, 128(%rsp)
	movq %r12, 168(%rsp)
	cfi_offset_rel_rsp (12, 168)
	movb %cl, %r12b
	movq %r13, 160(%rsp)
	cfi_offset_rel_rsp (13, 160)
/* r13d = lane mask of special cases; r14d = bit index; r12b = iteration
   counter. */
	movl %eax, %r13d
	movq %r14, 152(%rsp)
	cfi_offset_rel_rsp (14, 152)
	movl %edx, %r14d
	movq %r15, 144(%rsp)
	cfi_offset_rel_rsp (15, 144)
	cfi_remember_state

/* Scalar-fallback loop: each iteration tests two mask bits (an even/odd
   lane pair) and calls scalar log() for any lane that is set. */
.LBL_1_6:
	btl %r14d, %r13d
	jc .LBL_1_12

.LBL_1_7:
	lea 1(%r14), %esi
	btl %esi, %r13d
	jc .LBL_1_10

.LBL_1_8:
	incb %r12b
	addl $2, %r14d
/* NOTE(review): the bound of 16 pair-iterations comes from the shared
   SVML template; for this 2-lane kernel only bits 0-1 of r13d can be
   set, so iterations past the first perform no calls.  */
	cmpb $16, %r12b
	jb .LBL_1_6

/* Restore everything saved above and reload the patched result vector. */
	movups 112(%rsp), %xmm8
	movups 96(%rsp), %xmm9
	movups 80(%rsp), %xmm10
	movups 64(%rsp), %xmm11
	movups 48(%rsp), %xmm12
	movups 32(%rsp), %xmm13
	movups 16(%rsp), %xmm14
	movups (%rsp), %xmm15
	movq 136(%rsp), %rsi
	movq 128(%rsp), %rdi
	movq 168(%rsp), %r12
	cfi_restore (%r12)
	movq 160(%rsp), %r13
	cfi_restore (%r13)
	movq 152(%rsp), %r14
	cfi_restore (%r14)
	movq 144(%rsp), %r15
	cfi_restore (%r15)
	movups 256(%rsp), %xmm0
	jmp .LBL_1_2

.LBL_1_10:
	cfi_restore_state
/* Odd lane of the pair: element offset = r12b * 16 bytes; +8 selects the
   high double of the 16-byte slot (192+8 = 200 input, 256+8 = 264 output). */
	movzbl %r12b, %r15d
	shlq $4, %r15
	movsd 200(%rsp,%r15), %xmm0

	call JUMPTARGET(log)

	movsd %xmm0, 264(%rsp,%r15)
	jmp .LBL_1_8

.LBL_1_12:
/* Even lane of the pair: low double of the 16-byte slot. */
	movzbl %r12b, %r15d
	shlq $4, %r15
	movsd 192(%rsp,%r15), %xmm0

	call JUMPTARGET(log)

	movsd %xmm0, 256(%rsp,%r15)
	jmp .LBL_1_7

END (_ZGVbN2v_log_sse4)
230

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core_sse4.S