/* Function exp vectorized with SSE4.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_exp_data.h"

        .section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN2v_exp_sse4)
/* ALGORITHM DESCRIPTION:

   Argument representation:
   N = rint(X*2^k/ln2) = 2^k*M+j
   X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
   then -ln2/2^(k+1) < r < ln2/2^(k+1)
   Alternatively:
   N = trunc(X*2^k/ln2)
   then 0 < r < ln2/2^k

   Result calculation:
   exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
          = 2^M * 2^(j/2^k) * exp(r)
   2^M is calculated by bit manipulation
   2^(j/2^k) is stored in table
   exp(r) is approximated by polynomial.

   The table lookup is skipped if k = 0.  */
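
/* For reference, a scalar C sketch of the scheme above.  This is an
   illustration only, not glibc code: T, K and the coefficients are
   placeholders (K = 10 is inferred from the psllq $42 = 52-K below,
   and Taylor values stand in for the __dPC0..__dPC2 coefficients).

     #include <math.h>
     #include <stdint.h>
     #include <string.h>

     #define K 10
     extern const double T[1 << K];            // T[j] = 2^(j/2^K)

     static double exp_sketch (double x)
     {
       double n = rint (x * (1 << K) / M_LN2); // N = 2^K*M + j
       int64_t ni = (int64_t) n;
       int64_t j = ni & ((1 << K) - 1);        // low K bits: table index
       int64_t m = ni >> K;                    // high bits: binary exponent
       double r = x - n * (M_LN2 / (1 << K));  // reduced argument
       double p = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0)));
       double y = T[j] * p;                    // 2^(j/2^K) * exp(r)
       uint64_t u;
       memcpy (&u, &y, 8);
       u += (uint64_t) m << 52;                // scale by 2^M in the
       memcpy (&y, &u, 8);                     // exponent field
       return y;
     }
 */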

        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $320, %rsp
        movaps %xmm0, %xmm3
        movq __svml_dexp_data@GOTPCREL(%rip), %r8

/* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
        pshufd $221, %xmm3, %xmm7
        movups __dbInvLn2(%r8), %xmm0

/* dK = X*dbInvLn2 */
        mulpd %xmm3, %xmm0
        movq __iAbsMask(%r8), %xmm5
        movq __iDomainRange(%r8), %xmm6

/* iAbsX = iAbsX&iAbsMask */
        pand %xmm5, %xmm7

/* iRangeMask = (iAbsX>iDomainRange) */
        pcmpgtd %xmm6, %xmm7

/* Mask = iRangeMask?1:0, set mask for overflow/underflow */
        movmskps %xmm7, %eax
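
/* In C, the masking and compare above amount to this per-lane check
   (sketch; DOMAIN_RANGE is a stand-in for the __iDomainRange value,
   and 0x7fffffff for __iAbsMask):

     static int needs_special (double x)
     {
       uint64_t u;
       memcpy (&u, &x, 8);
       uint32_t hi = (uint32_t) (u >> 32) & 0x7fffffff;
       return (int32_t) hi > DOMAIN_RANGE;     // pcmpgtd is signed
     }

   Testing only the high word catches overflow, underflow, NaN and Inf
   alike, since all of them have a large biased exponent.  */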

/* dN = rint(X*2^k/Ln2) */
        xorps %xmm7, %xmm7
        movups __dbLn2hi(%r8), %xmm5
        movups __dbLn2lo(%r8), %xmm6
        roundpd $0, %xmm0, %xmm7

/* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */
        mulpd %xmm7, %xmm5

/* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */
        mulpd %xmm6, %xmm7
        movups __dbShifter(%r8), %xmm4

/* dM = X*dbInvLn2+dbShifter */
        addpd %xmm0, %xmm4
        movaps %xmm3, %xmm0
        subpd %xmm5, %xmm0
        subpd %xmm7, %xmm0
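
/* The two subtractions above complete a classic Cody-Waite
   reduction.  In C (sketch):

     double r = x - n * Ln2hi;  // exact: Ln2hi carries enough trailing
     r -= n * Ln2lo;            // zeros for n*Ln2hi to fit a double

   where Ln2hi + Ln2lo together approximate ln2/2^k to well beyond
   double precision, so the reduction error stays far below the final
   ulp.  */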
        movups __dPC2(%r8), %xmm5

/* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
        mulpd %xmm0, %xmm5
        addpd __dPC1(%r8), %xmm5
        mulpd %xmm0, %xmm5
        movups __dPC0(%r8), %xmm6
        addpd %xmm6, %xmm5
        mulpd %xmm5, %xmm0
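
/* The multiply/add chain above is Horner's rule.  In C, with b0..b2
   naming __dPC0..__dPC2:

     double q = r * (b0 + r * (b1 + r * b2));  // xmm0 on exit

   The final b0 + q is deferred to the addpd below so that it overlaps
   with the table-index extraction.  */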
        movdqu __lIndexMask(%r8), %xmm2

/* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
        movdqa %xmm2, %xmm1

/* lM = (*(longlong*)&dM)&(~lIndexMask) */
        pandn %xmm4, %xmm2
        pand %xmm4, %xmm1

/* lM = lM<<(52-K), 2^M */
        psllq $42, %xmm2
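
/* The dbShifter trick: adding the large dbShifter constant to
   X*dbInvLn2 pushes the fraction out of the mantissa, leaving
   N = rint(X*2^K/ln2) in the low mantissa bits of dM.  In C (sketch,
   K = 10 matching the 52-K = 42 shift):

     uint64_t lm;
     memcpy (&lm, &dm, 8);                     // dm = x*InvLn2 + Shifter
     uint64_t j = lm & ((1u << K) - 1);        // lIndex: table slot
     uint64_t e = (lm & ~(uint64_t) ((1u << K) - 1)) << (52 - K);

   The shift drops the shifter's own upper bits off the top and leaves
   M aligned with the exponent field, ready to be added to the
   result's bit pattern with the paddq below.  */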

/* table lookup for dT[j] = 2^(j/2^k) */
        movd %xmm1, %edx
        pextrw $4, %xmm1, %ecx
        addpd %xmm0, %xmm6
        shll $3, %edx
        shll $3, %ecx
        movq (%r8,%rdx), %xmm0
        andl $3, %eax
        movhpd (%r8,%rcx), %xmm0
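
/* SSE4 has no gather, so each index travels through a GPR: movd reads
   lane 0, pextrw $4 reads the low word of lane 1 (the index fits in
   16 bits since it is below 2^K), shll $3 scales by sizeof (double),
   and movq/movhpd assemble the two loads into xmm0.  Per lane in C:

     double t = ((const double *) tbl)[j];     // dT[j] = 2^(j/2^K)
 */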

/* 2^(j/2^k) * exp(r) */
        mulpd %xmm6, %xmm0

/* multiply by 2^M through integer add */
        paddq %xmm2, %xmm0

/* flags are still set by the andl $3 above: branch out if any lane
   was flagged for over/underflow */
        jne .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state

/* spill the input vector and the current result so individual lanes
   can be reworked */
        movups %xmm3, 192(%rsp)
        movups %xmm0, 256(%rsp)
        je .LBL_1_2

        xorb %cl, %cl
        xorl %edx, %edx

/* save the registers that must survive the scalar exp calls */
        movups %xmm8, 112(%rsp)
        movups %xmm9, 96(%rsp)
        movups %xmm10, 80(%rsp)
        movups %xmm11, 64(%rsp)
        movups %xmm12, 48(%rsp)
        movups %xmm13, 32(%rsp)
        movups %xmm14, 16(%rsp)
        movups %xmm15, (%rsp)
        movq %rsi, 136(%rsp)
        movq %rdi, 128(%rsp)
        movq %r12, 168(%rsp)
        cfi_offset_rel_rsp (12, 168)
        movb %cl, %r12b
        movq %r13, 160(%rsp)
        cfi_offset_rel_rsp (13, 160)
        movl %eax, %r13d
        movq %r14, 152(%rsp)
        cfi_offset_rel_rsp (14, 152)
        movl %edx, %r14d
        movq %r15, 144(%rsp)
        cfi_offset_rel_rsp (15, 144)
        cfi_remember_state
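
/* The loop below scans the range mask two bits per iteration and
   redoes each flagged lane with the scalar exp.  In C terms (sketch;
   in/out stand for the spill slots at 192(%rsp) and 256(%rsp)):

     for (int i = 0; i < 2; i++)
       if (range_mask & (1 << i))
         out[i] = exp (in[i]);

   The template is generic for a 32-bit mask (16 iterations of two
   bits); with a two-element vector only the first pair of bits can
   ever be set because of the earlier andl $3.  */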

.LBL_1_6:
/* even lane of the current pair */
        btl %r14d, %r13d
        jc .LBL_1_12

.LBL_1_7:
/* odd lane of the current pair */
        lea 1(%r14), %esi
        btl %esi, %r13d
        jc .LBL_1_10

.LBL_1_8:
        incb %r12b
        addl $2, %r14d
        cmpb $16, %r12b
        jb .LBL_1_6

/* all mask bits scanned: restore registers and return through the
   fast-path epilogue with the patched result */
        movups 112(%rsp), %xmm8
        movups 96(%rsp), %xmm9
        movups 80(%rsp), %xmm10
        movups 64(%rsp), %xmm11
        movups 48(%rsp), %xmm12
        movups 32(%rsp), %xmm13
        movups 16(%rsp), %xmm14
        movups (%rsp), %xmm15
        movq 136(%rsp), %rsi
        movq 128(%rsp), %rdi
        movq 168(%rsp), %r12
        cfi_restore (%r12)
        movq 160(%rsp), %r13
        cfi_restore (%r13)
        movq 152(%rsp), %r14
        cfi_restore (%r14)
        movq 144(%rsp), %r15
        cfi_restore (%r15)
        movups 256(%rsp), %xmm0
        jmp .LBL_1_2

.LBL_1_10:
        cfi_restore_state

/* odd lane: scalar exp on the spilled element, result patched back */
        movzbl %r12b, %r15d
        shlq $4, %r15
        movsd 200(%rsp,%r15), %xmm0

        call JUMPTARGET(exp)

        movsd %xmm0, 264(%rsp,%r15)
        jmp .LBL_1_8

.LBL_1_12:

/* even lane */
        movzbl %r12b, %r15d
        shlq $4, %r15
        movsd 192(%rsp,%r15), %xmm0

        call JUMPTARGET(exp)

        movsd %xmm0, 256(%rsp,%r15)
        jmp .LBL_1_7

END (_ZGVbN2v_exp_sse4)
