/* Function expf vectorized with AVX2.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_expf_data.h"

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_expf_avx2)
/*
   ALGORITHM DESCRIPTION:

   Argument representation:
   M = rint(X*2^k/ln2) = 2^k*N+j
   X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
   then -ln2/2^(k+1) < r < ln2/2^(k+1)
   Alternatively:
   M = trunc(X*2^k/ln2)
   then 0 < r < ln2/2^k

   Result calculation:
   exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
   = 2^N * 2^(j/2^k) * exp(r)
   2^N is calculated by bit manipulation
   2^(j/2^k) is computed from table lookup
   exp(r) is approximated by polynomial

   The table lookup is skipped if k = 0.
   For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
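
/* For reference, a minimal scalar C sketch of the k = 0 path described
   above (no table lookup).  The reduction constants are the usual
   binary32 split of ln2; the Taylor coefficients c0..c5 are illustrative
   stand-ins for the tuned minimax constants __sPC0..__sPC5, so this
   sketches the technique rather than the exact kernel:

       #include <stdint.h>

       static float
       expf_sketch (float x)
       {
         const float inv_ln2 = 0x1.715476p+0f;   // 1/ln2
         const float shifter = 0x1.8p+23f;       // 1.5*2^23
         const float ln2_hi  = 0x1.62e4p-1f;     // ln2, high part
         const float ln2_lo  = 0x1.7f7d1cp-20f;  // ln2, low part
         const float c0 = 1.0f, c1 = 1.0f, c2 = 0x1p-1f,
                     c3 = 0x1.555556p-3f, c4 = 0x1.555556p-5f,
                     c5 = 0x1.111112p-7f;

         // m = x/ln2 + shifter: the add rounds x/ln2 to the nearest
         // integer n and leaves n in the low mantissa bits of m.
         float m = x * inv_ln2 + shifter;
         float n = m - shifter;

         // r = x - n*ln2, with ln2 split in two for extra precision.
         float r = (x - n * ln2_hi) - n * ln2_lo;

         // Degree-5 polynomial approximation of exp(r), Horner form.
         float p = ((((c5 * r + c4) * r + c3) * r + c2) * r + c1) * r + c0;

         // 2^n by bit manipulation: add the exponent bias to the integer
         // held in m's low bits, then shift it into bits 30:23.
         union { float f; uint32_t u; } mb = { .f = m };
         union { uint32_t u; float f; } two_n = { .u = (mb.u + 127) << 23 };

         return two_n.f * p;   // 2^n * exp(r); no over/underflow handling
       }
*/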

        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $448, %rsp
        movq      __svml_sexp_data@GOTPCREL(%rip), %rax
        vmovaps   %ymm0, %ymm2
        vmovups   __sInvLn2(%rax), %ymm7
        vmovups   __sShifter(%rax), %ymm4
        vmovups   __sLn2hi(%rax), %ymm3
        vmovups   __sPC5(%rax), %ymm1

/* m = x*2^k/ln2 + shifter */
        vfmadd213ps %ymm4, %ymm2, %ymm7

/* n = m - shifter = rint(x*2^k/ln2) */
        vsubps    %ymm4, %ymm7, %ymm0
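
/* The shifter is a large constant (for binary32, typically 1.5*2^23)
   whose ulp is 1, so the FP add above rounds x*2^k/ln2 to the nearest
   integer n and leaves n in the low mantissa bits of m (still in ymm7).
   The integer add of __iBias below therefore yields n plus the exponent
   bias, ready to be shifted into the exponent field.  */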
        vpaddd    __iBias(%rax), %ymm7, %ymm4

/* remove sign of x by "and" operation */
        vandps    __iAbsMask(%rax), %ymm2, %ymm5

/* compare against threshold */
        vpcmpgtd  __iDomainRange(%rax), %ymm5, %ymm6

/* r = x-n*ln2_hi/2^k */
        vmovaps   %ymm2, %ymm5
        vfnmadd231ps %ymm0, %ymm3, %ymm5

/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
        vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0

/* c5*r+c4 */
        vfmadd213ps __sPC4(%rax), %ymm0, %ymm1

/* (c5*r+c4)*r+c3 */
        vfmadd213ps __sPC3(%rax), %ymm0, %ymm1

/* ((c5*r+c4)*r+c3)*r+c2 */
        vfmadd213ps __sPC2(%rax), %ymm0, %ymm1

/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
        vfmadd213ps __sPC1(%rax), %ymm0, %ymm1

/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
        vfmadd213ps __sPC0(%rax), %ymm0, %ymm1

/* set mask for overflow/underflow */
        vmovmskps %ymm6, %ecx

/* compute 2^N with "shift" */
        vpslld    $23, %ymm4, %ymm6
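
/* The shift above moves the biased exponent formed by the __iBias add
   into bits 30:23 of each lane, so ymm6 reinterpreted as packed floats
   equals 2^N.  */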

/* 2^N*exp(r) */
        vmulps    %ymm1, %ymm6, %ymm0
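
/* ecx holds one bit per lane whose |x| exceeded __iDomainRange; those
   lanes may overflow or underflow and are recomputed with scalar expf
   below.  */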
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
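/* Spill the original arguments and the fast-path results: the scalar
   loop reads inputs from 320(%rsp) and patches results at 384(%rsp),
   one lane at a time.  */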
        vmovups   %ymm2, 320(%rsp)
        vmovups   %ymm0, 384(%rsp)
        je        .LBL_1_2

        xorb      %dl, %dl
        xorl      %eax, %eax
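
/* Save ymm8-ymm15 across the calls to scalar expf, along with the
   callee-saved r12-r15, which serve as loop state below.  */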
        vmovups   %ymm8, 224(%rsp)
        vmovups   %ymm9, 192(%rsp)
        vmovups   %ymm10, 160(%rsp)
        vmovups   %ymm11, 128(%rsp)
        vmovups   %ymm12, 96(%rsp)
        vmovups   %ymm13, 64(%rsp)
        vmovups   %ymm14, 32(%rsp)
        vmovups   %ymm15, (%rsp)
        movq      %rsi, 264(%rsp)
        movq      %rdi, 256(%rsp)
        movq      %r12, 296(%rsp)
        cfi_offset_rel_rsp (12, 296)
        movb      %dl, %r12b
        movq      %r13, 288(%rsp)
        cfi_offset_rel_rsp (13, 288)
        movl      %ecx, %r13d
        movq      %r14, 280(%rsp)
        cfi_offset_rel_rsp (14, 280)
        movl      %eax, %r14d
        movq      %r15, 272(%rsp)
        cfi_offset_rel_rsp (15, 272)
        cfi_remember_state

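/* Loop over lane pairs: bit r14 of the mask covers lane 2*r12, bit
   r14+1 the odd lane of the pair; flagged lanes go to the scalar
   fallbacks below.  */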
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

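/* All flagged lanes handled: restore the saved registers, reload the
   patched result vector and return through the fast-path epilogue.  */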
        vmovups   224(%rsp), %ymm8
        vmovups   192(%rsp), %ymm9
        vmovups   160(%rsp), %ymm10
        vmovups   128(%rsp), %ymm11
        vmovups   96(%rsp), %ymm12
        vmovups   64(%rsp), %ymm13
        vmovups   32(%rsp), %ymm14
        vmovups   (%rsp), %ymm15
        vmovups   384(%rsp), %ymm0
        movq      264(%rsp), %rsi
        movq      256(%rsp), %rdi
        movq      296(%rsp), %r12
        cfi_restore (%r12)
        movq      288(%rsp), %r13
        cfi_restore (%r13)
        movq      280(%rsp), %r14
        cfi_restore (%r14)
        movq      272(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_1_2

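/* Scalar fallback for the odd lane of the current pair: its input sits
   4 bytes past the even lane at 320(%rsp,%r15,8).  */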
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    324(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(expf)

        vmovss    %xmm0, 388(%rsp,%r15,8)
        jmp       .LBL_1_8

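/* Scalar fallback for the even lane of the current pair.  */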
.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    320(%rsp,%r15,8), %xmm0
        vzeroupper

        call      JUMPTARGET(expf)

        vmovss    %xmm0, 384(%rsp,%r15,8)
        jmp       .LBL_1_7

END(_ZGVdN8v_expf_avx2)