svml_s_expf4_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S]

/* Function expf vectorized with SSE4.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19	#include <sysdep.h>
20	#include "svml_s_expf_data.h"
21
22	.section .text.sse4, "ax", @progbits
ENTRY (_ZGVbN4v_expf_sse4)
/*
   ALGORITHM DESCRIPTION:

     Argument representation:
     M = rint(X*2^k/ln2) = 2^k*N+j
     X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
     then -ln2/2^(k+1) < r < ln2/2^(k+1)
     Alternatively:
     M = trunc(X*2^k/ln2)
     then 0 < r < ln2/2^k

     Result calculation:
     exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
     = 2^N * 2^(j/2^k) * exp(r)
     2^N is calculated by bit manipulation
     2^(j/2^k) is computed from table lookup
     exp(r) is approximated by polynomial

     The table lookup is skipped if k = 0.
     For low accuracy approximation, exp(r) ~ 1 or 1+r.  */

/*
   INTERFACE:

     In:   %xmm0 = four packed single-precision arguments x[0..3]
     Out:  %xmm0 = four packed single-precision results

   FAST PATH REGISTER ROLES:

     %rax  = address of __svml_sexp_data (constant table)
     %xmm5 = copy of the input vector x
     %xmm0 = m, then the 2^N scale factor, then the final result
     %xmm2 = n, then n*ln2_lo
     %xmm7 = |x| bits / out-of-range lane mask, then the reduced argument r
     %xmm3 = polynomial accumulator
     %ecx  = 4-bit mask (from movmskps) of lanes that need the scalar
             fallback

   STACK FRAME (offsets from the 64-byte aligned %rsp, 320 bytes):

       0..127   spill area for %xmm15 .. %xmm8 (16 bytes each)
     128, 136   saved %rdi, %rsi
     144..168   saved %r15, %r14, %r13, %r12
     192..207   copy of the input vector (scalar fallback arguments)
     256..271   result vector (patched lane by lane by the fallback)

   Lanes flagged in %ecx are recomputed one at a time by calling the
   scalar expf; all other lanes keep the vector result.  */

        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $320, %rsp
        movaps    %xmm0, %xmm5
        movq      __svml_sexp_data@GOTPCREL(%rip), %rax
        movups    __sInvLn2(%rax), %xmm0

/* m = x*2^k/ln2 + shifter */
        mulps     %xmm5, %xmm0
        movups    __sShifter(%rax), %xmm6
        movups    __sLn2hi(%rax), %xmm4
        addps     %xmm6, %xmm0

/* n = m - shifter = rint(x*2^k/ln2) */
        movaps    %xmm0, %xmm2

/* remove sign of x by "and" operation */
        movdqu    __iAbsMask(%rax), %xmm7
        subps     %xmm6, %xmm2

/* r = x-n*ln2_hi/2^k */
        mulps     %xmm2, %xmm4
        pand      %xmm5, %xmm7

/* compare against threshold */
        pcmpgtd   __iDomainRange(%rax), %xmm7
        movups    __sLn2lo(%rax), %xmm1

/* set mask for overflow/underflow */
        movmskps  %xmm7, %ecx
        movaps    %xmm5, %xmm7
        movups    __sPC5(%rax), %xmm3
        subps     %xmm4, %xmm7

/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
        mulps     %xmm1, %xmm2

/* compute 2^N with "shift" */
        movdqu    __iBias(%rax), %xmm6
        subps     %xmm2, %xmm7

/* c5*r+c4 */
        mulps     %xmm7, %xmm3
        paddd     %xmm6, %xmm0
        pslld     $23, %xmm0
        addps     __sPC4(%rax), %xmm3

/* (c5*r+c4)*r+c3 */
        mulps     %xmm7, %xmm3
        addps     __sPC3(%rax), %xmm3

/* ((c5*r+c4)*r+c3)*r+c2 */
        mulps     %xmm7, %xmm3
        addps     __sPC2(%rax), %xmm3

/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
        mulps     %xmm7, %xmm3
        addps     __sPC1(%rax), %xmm3

/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
        mulps     %xmm3, %xmm7
        addps     __sPC0(%rax), %xmm7

/* 2^N*exp(r) */
        mulps     %xmm7, %xmm0

/* Any lane outside the fast-path domain?  Take the scalar fallback.  */
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: tear down the frame and return the vector in %xmm0.  */
.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Scalar fallback.  Only reached through the jne above, so %ecx is
   known to be non-zero here; the stores below do not modify flags, so
   no second zero test of the mask is needed.  */
.LBL_1_3:
        cfi_restore_state
        movups    %xmm5, 192(%rsp)      /* arguments for the scalar calls */
        movups    %xmm0, 256(%rsp)      /* vector result, patched below */

        xorb      %dl, %dl
        xorl      %eax, %eax

/* Spill %xmm8-%xmm15, %rsi and %rdi so they can be reloaded after the
   scalar calls, and save the callee-saved registers used as loop
   state.  */
        movups    %xmm8, 112(%rsp)
        movups    %xmm9, 96(%rsp)
        movups    %xmm10, 80(%rsp)
        movups    %xmm11, 64(%rsp)
        movups    %xmm12, 48(%rsp)
        movups    %xmm13, 32(%rsp)
        movups    %xmm14, 16(%rsp)
        movups    %xmm15, (%rsp)
        movq      %rsi, 136(%rsp)
        movq      %rdi, 128(%rsp)
        movq      %r12, 168(%rsp)
        cfi_offset_rel_rsp (12, 168)
        movb      %dl, %r12b            /* r12b = lane-pair index, starts at 0 */
        movq      %r13, 160(%rsp)
        cfi_offset_rel_rsp (13, 160)
        movl      %ecx, %r13d           /* r13d = special-lane mask */
        movq      %r14, 152(%rsp)
        cfi_offset_rel_rsp (14, 152)
        movl      %eax, %r14d           /* r14d = mask bit of the even lane, starts at 0 */
        movq      %r15, 144(%rsp)
        cfi_offset_rel_rsp (15, 144)
        cfi_remember_state

/* Each iteration handles one pair of lanes: the even lane (mask bit
   r14d) and the odd lane (mask bit r14d+1).  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
/* movmskps only produces mask bits 0-3, so two lane pairs cover every
   lane that can be flagged.  */
        cmpb      $2, %r12b
        jb        .LBL_1_6

/* All flagged lanes patched: reload the spilled registers and return
   the patched result vector through the common exit.  */
        movups    112(%rsp), %xmm8
        movups    96(%rsp), %xmm9
        movups    80(%rsp), %xmm10
        movups    64(%rsp), %xmm11
        movups    48(%rsp), %xmm12
        movups    32(%rsp), %xmm13
        movups    16(%rsp), %xmm14
        movups    (%rsp), %xmm15
        movq      136(%rsp), %rsi
        movq      128(%rsp), %rdi
        movq      168(%rsp), %r12
        cfi_restore (%r12)
        movq      160(%rsp), %r13
        cfi_restore (%r13)
        movq      152(%rsp), %r14
        cfi_restore (%r14)
        movq      144(%rsp), %r15
        cfi_restore (%r15)
        movups    256(%rsp), %xmm0
        jmp       .LBL_1_2

/* Odd lane of the current pair: argument at 196+8*pair, result at
   260+8*pair.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        movss     196(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(expf)

        movss     %xmm0, 260(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Even lane of the current pair: argument at 192+8*pair, result at
   256+8*pair.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        movss     192(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(expf)

        movss     %xmm0, 256(%rsp,%r15,8)
        jmp       .LBL_1_7

END (_ZGVbN4v_expf_sse4)
213

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core_sse4.S