/* Function exp vectorized with SSE4.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_exp_data.h"
        .section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN2v_exp_sse4)
/*
   ALGORITHM DESCRIPTION:

   Argument representation:
   N = rint(X*2^k/ln2) = 2^k*M+j
   X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
   then -ln2/2^(k+1) < r < ln2/2^(k+1)
   Alternatively:
   N = trunc(X*2^k/ln2)
   then 0 < r < ln2/2^k

   Result calculation:
   exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
          = 2^M * 2^(j/2^k) * exp(r)
   2^M is calculated by bit manipulation,
   2^(j/2^k) is stored in a table,
   exp(r) is approximated by a polynomial.

   The table lookup is skipped if k = 0.  */
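
/* For illustration, a scalar C sketch of the same scheme.  The index width
   K, the table initialisation and the polynomial degree below are
   assumptions chosen for readability; they are not the exact constants
   encoded in __svml_dexp_data or used by the code that follows.

     #include <math.h>
     #include <stdint.h>

     #define K   4                            // assumed index width
     #define TBL (1 << K)

     static double dT[TBL];                   // dT[j] = 2^(j/2^K)

     static void
     init_table (void)
     {
       for (int j = 0; j < TBL; j++)
         dT[j] = exp2 (j / (double) TBL);
     }

     static double
     exp_sketch (double x)
     {
       double n = rint (x * TBL / M_LN2);     // N = rint(X*2^K/ln2)
       int64_t ln = (int64_t) n;
       int64_t j = ln & (TBL - 1);            // low K bits: table index
       int64_t m = ln >> K;                   // remaining bits: 2^M scale
       double r = x - n * (M_LN2 / TBL);      // reduced argument
       double p = 1.0 + r + 0.5 * r * r;      // low-degree exp(r) approximation
       union { double d; uint64_t u; } v = { .d = dT[j] * p };
       v.u += (uint64_t) m << 52;             // multiply by 2^M via the exponent field
       return v.d;
     }

   The kernel below follows the same outline, but performs the reduction with
   a two-piece ln2 (__dbLn2hi/__dbLn2lo), extracts N with the __dbShifter
   trick instead of a conversion, and diverts out-of-range lanes to the
   scalar exp.  */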

        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $320, %rsp
        movaps %xmm0, %xmm3
        movq __svml_dexp_data@GOTPCREL(%rip), %r8

        /* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
        pshufd $221, %xmm3, %xmm7
        movups __dbInvLn2(%r8), %xmm0

        /* dK = X*dbInvLn2 */
        mulpd %xmm3, %xmm0
        movq __iAbsMask(%r8), %xmm5
        movq __iDomainRange(%r8), %xmm6

        /* iAbsX = iAbsX&iAbsMask */
        pand %xmm5, %xmm7

        /* iRangeMask = (iAbsX>iDomainRange) */
        pcmpgtd %xmm6, %xmm7

        /* Mask = iRangeMask?1:0, set mask for overflow/underflow */
        movmskps %xmm7, %eax

        /* dN = rint(X*2^k/Ln2) */
        xorps %xmm7, %xmm7
        movups __dbLn2hi(%r8), %xmm5
        movups __dbLn2lo(%r8), %xmm6
        roundpd $0, %xmm0, %xmm7

        /* dR = X - dN*dbLn2hi; dbLn2hi holds the high (52-8-k) bits of ln2/2^k.  */
        mulpd %xmm7, %xmm5

        /* dR = dR - dN*dbLn2lo; dbLn2lo holds bits 40..94 of the low part of ln2/2^k.  */
        mulpd %xmm6, %xmm7
        movups __dbShifter(%r8), %xmm4

        /* dM = X*dbInvLn2+dbShifter */
        addpd %xmm0, %xmm4
        movaps %xmm3, %xmm0
        subpd %xmm5, %xmm0
        subpd %xmm7, %xmm0
        movups __dPC2(%r8), %xmm5

        /* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
        mulpd %xmm0, %xmm5
        addpd __dPC1(%r8), %xmm5
        mulpd %xmm0, %xmm5
        movups __dPC0(%r8), %xmm6
        addpd %xmm6, %xmm5
        mulpd %xmm5, %xmm0
        movdqu __lIndexMask(%r8), %xmm2

        /* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
        movdqa %xmm2, %xmm1

        /* lM = (*(longlong*)&dM)&(~lIndexMask) */
        pandn %xmm4, %xmm2
        pand %xmm4, %xmm1

        /* lM = lM<<(52-K), 2^M */
        psllq $42, %xmm2

        /* table lookup for dT[j] = 2^(j/2^k) */
        movd %xmm1, %edx
        pextrw $4, %xmm1, %ecx
        addpd %xmm0, %xmm6
        shll $3, %edx
        shll $3, %ecx
        movq (%r8,%rdx), %xmm0
        andl $3, %eax
        movhpd (%r8,%rcx), %xmm0

        /* 2^(j/2^k) * exp(r) */
        mulpd %xmm6, %xmm0

        /* multiply by 2^M through integer add */
        paddq %xmm2, %xmm0
        jne .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

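/* Special-case path: the range mask flagged at least one lane as a possible
   overflow/underflow.  Spill the input and the fast-path result, recompute
   each flagged lane with the scalar exp, then merge and return.  */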
.LBL_1_3:
        cfi_restore_state
        movups %xmm3, 192(%rsp)
        movups %xmm0, 256(%rsp)
        je .LBL_1_2

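        /* Initialise the lane counters and save the registers that are live
           across the calls to the scalar exp.  */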
        xorb %cl, %cl
        xorl %edx, %edx
        movups %xmm8, 112(%rsp)
        movups %xmm9, 96(%rsp)
        movups %xmm10, 80(%rsp)
        movups %xmm11, 64(%rsp)
        movups %xmm12, 48(%rsp)
        movups %xmm13, 32(%rsp)
        movups %xmm14, 16(%rsp)
        movups %xmm15, (%rsp)
        movq %rsi, 136(%rsp)
        movq %rdi, 128(%rsp)
        movq %r12, 168(%rsp)
        cfi_offset_rel_rsp (12, 168)
        movb %cl, %r12b
        movq %r13, 160(%rsp)
        cfi_offset_rel_rsp (13, 160)
        movl %eax, %r13d
        movq %r14, 152(%rsp)
        cfi_offset_rel_rsp (14, 152)
        movl %edx, %r14d
        movq %r15, 144(%rsp)
        cfi_offset_rel_rsp (15, 144)
        cfi_remember_state

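        /* Scalar fallback loop: %r13d holds the range mask, %r14d the mask
           bit index of the current lane pair, and %r12b the iteration
           count.  Lanes whose mask bit is set are recomputed with the
           scalar exp.  */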
.LBL_1_6:
        btl %r14d, %r13d
        jc .LBL_1_12

.LBL_1_7:
        lea 1(%r14), %esi
        btl %esi, %r13d
        jc .LBL_1_10

.LBL_1_8:
        incb %r12b
        addl $2, %r14d
        cmpb $16, %r12b
        jb .LBL_1_6

        movups 112(%rsp), %xmm8
        movups 96(%rsp), %xmm9
        movups 80(%rsp), %xmm10
        movups 64(%rsp), %xmm11
        movups 48(%rsp), %xmm12
        movups 32(%rsp), %xmm13
        movups 16(%rsp), %xmm14
        movups (%rsp), %xmm15
        movq 136(%rsp), %rsi
        movq 128(%rsp), %rdi
        movq 168(%rsp), %r12
        cfi_restore (%r12)
        movq 160(%rsp), %r13
        cfi_restore (%r13)
        movq 152(%rsp), %r14
        cfi_restore (%r14)
        movq 144(%rsp), %r15
        cfi_restore (%r15)
        movups 256(%rsp), %xmm0
        jmp .LBL_1_2

.LBL_1_10:
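        /* Recompute the odd-indexed lane of the current pair with the
           scalar exp and store it back into the spilled result.  */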
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        movsd 200(%rsp,%r15), %xmm0

        call JUMPTARGET(exp)

        movsd %xmm0, 264(%rsp,%r15)
        jmp .LBL_1_8

.LBL_1_12:
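        /* Recompute the even-indexed lane of the current pair with the
           scalar exp and store it back into the spilled result.  */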
        movzbl %r12b, %r15d
        shlq $4, %r15
        movsd 192(%rsp,%r15), %xmm0

        call JUMPTARGET(exp)

        movsd %xmm0, 256(%rsp,%r15)
        jmp .LBL_1_7

END (_ZGVbN2v_exp_sse4)