1/* Function tanhf vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
 * NOTE: Since the hyperbolic tangent function is odd
 * (tanh(x) = -tanh(-x)), the algorithm below deals with the absolute
 * value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
25 *
26 * We use a table lookup method to compute tanh(|x|).
27 * The basic idea is to split the input range into a number of subintervals
28 * and to approximate tanh(.) with a polynomial on each of them.
29 *
30 * IEEE SPECIAL CONDITIONS:
31 * x = [+, -]0, r = [+, -]0
32 * x = +Inf, r = +1
33 * x = -Inf, r = -1
34 * x = QNaN, r = QNaN
35 * x = SNaN, r = QNaN
36 *
37 *
38 * ALGORITHM DETAILS
39 * We handle special values in a callout function, aside from main path
40 * computations. "Special" for this algorithm are:
41 * INF, NAN, |x| > HUGE_THRESHOLD
42 *
43 *
44 * Main path computations are organized as follows:
45 * Actually we split the interval [0, SATURATION_THRESHOLD)
46 * into a number of subintervals. On each subinterval we approximate tanh(.)
47 * with a minimax polynomial of pre-defined degree. Polynomial coefficients
48 * are computed beforehand and stored in table. We also use
49 *
50 * y := |x| + B,
51 *
52 * here B depends on subinterval and is used to make argument
53 * closer to zero.
54 * We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
55 * where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
56 * preserve main path computation logic but return 1.0 for all arguments.
57 *
58 * Hence reconstruction looks as follows:
59 * we extract proper polynomial and range reduction coefficients
60 * (Pj and B), corresponding to subinterval, to which |x| belongs,
61 * and return
62 *
63 * r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
64 *
65 * NOTE: we use multiprecision technique to multiply and sum the first
66 * K terms of the polynomial. So Pj, j = 0..K are stored in
67 * table each as a pair of target precision numbers (Pj and PLj) to
68 * achieve wider than target precision.
69 *
70 *
71 */
72
73
74#include <sysdep.h>
75
76/* tanhf data tables for avx2 and sse4 implementations defined here.
77 */
78#define ONLY_DECL_OFFSET
79#include "svml_s_tanhf_rodata.S"
80
81 .section .text.sse4, "ax", @progbits
82ENTRY(_ZGVbN4v_tanhf_sse4)
83 /* Save copy of input in xmm12. */
84 movaps %xmm0, %xmm12
85
86 /* Here huge arguments, INF and NaNs are filtered out to callout. */
87 movdqu TANHF_DATA(_iExpMantMask)(%rip), %xmm3
88 pand %xmm0, %xmm3
89
90
91 /* Selection of arguments between [0, 0x04280000] into xmm3. */
92 pxor %xmm7, %xmm7
93 /* Save xmm3 for special values check at end. */
94 movdqa %xmm3, %xmm8
95 psubd TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
96 pmaxsd %xmm7, %xmm3
97 pminsd TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
98 psrld $14, %xmm3
99
100 movq %xmm3, %rcx
101 movl %ecx, %edx
102 shrq $32, %rcx
103
104 pshufd $0x0e, %xmm3, %xmm3
105 movq %xmm3, %rdi
106 movl %edi, %esi
107 shrq $32, %rdi
108
109 movaps TANHF_DATA(_sAbsMask)(%rip), %xmm1
110 andps %xmm1, %xmm0
111
112 leaq TANHF_DATA(_lookupTable)(%rip), %rax
113 movups (%rdx, %rax), %xmm2
114 movups (%rcx, %rax), %xmm6
115
116 /*
117 * small table specific variables *
118 * Constant loading
119 */
120 movaps %xmm2, %xmm4
121 movlhps %xmm6, %xmm4
122 unpckhpd %xmm6, %xmm2
123
124 cvtps2pd %xmm0, %xmm6
125 movhlps %xmm0, %xmm0
126 cvtps2pd %xmm0, %xmm0
127
128 movups 16(%rdx, %rax), %xmm5
129 movups 16(%rsi, %rax), %xmm13
130
131 movaps %xmm5, %xmm10
132 movaps %xmm13, %xmm11
133
134 movups 16(%rcx, %rax), %xmm7
135 movups 16(%rdi, %rax), %xmm3
136
137 unpckhpd %xmm7, %xmm5
138 unpckhpd %xmm3, %xmm13
139
140 mulpd %xmm6, %xmm5
141 mulpd %xmm0, %xmm13
142
143 movlhps %xmm7, %xmm10
144 movlhps %xmm3, %xmm11
145
146 addpd %xmm10, %xmm5
147 addpd %xmm11, %xmm13
148
149 mulpd %xmm6, %xmm5
150 mulpd %xmm0, %xmm13
151
152 addpd %xmm2, %xmm5
153
154 movups (%rsi, %rax), %xmm2
155 movups (%rdi, %rax), %xmm7
156
157 movaps %xmm2, %xmm3
158
159 unpckhpd %xmm7, %xmm2
160 movlhps %xmm7, %xmm3
161
162 addpd %xmm13, %xmm2
163
164 mulpd %xmm5, %xmm6
165 addpd %xmm4, %xmm6
166
167 mulpd %xmm2, %xmm0
168 addpd %xmm3, %xmm0
169
170 cvtpd2ps %xmm0, %xmm2
171 cvtpd2ps %xmm6, %xmm0
172
173 movlhps %xmm2, %xmm0
174 andnps %xmm12, %xmm1
175 orps %xmm1, %xmm0
176
177 /* xmm8 contains mask of special values. */
178 pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
179
180 movmskps %xmm8, %edx
181 testl %edx, %edx
182
183 /* Go to special inputs processing branch */
184 jne L(SPECIAL_VALUES_BRANCH)
185 # LOE rbx rbp r12 r13 r14 r15 xmm0
186 /* No stack restoration on the fastpath. */
187 ret
188
189 /* Cold case. edx has 1s where there was a special value that
190 needs to be handled by a tanhf call. Optimize for code size
191 more so than speed here. */
192L(SPECIAL_VALUES_BRANCH):
193 # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
194 /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
195 call entry will be 16-byte aligned. */
196 subq $56, %rsp
197 cfi_def_cfa_offset(64)
198 movups %xmm0, 24(%rsp)
199 movups %xmm12, 40(%rsp)
200
201 /* Use rbx/rbp for callee save registers as they get short
202 encoding for many instructions (as compared with r12/r13). */
203 movq %rbx, (%rsp)
204 cfi_offset(rbx, -64)
205 movq %rbp, 8(%rsp)
206 cfi_offset(rbp, -56)
207 /* edx has 1s where there was a special value that needs to be handled
208 by a tanhf call. */
209 movl %edx, %ebx
210L(SPECIAL_VALUES_LOOP):
211 # LOE rbx rbp r12 r13 r14 r15
212 /* use rbp as index for special value that is saved across calls to
213 tanhf. We technically don't need a callee save register here as offset
214 to rsp is always [0, 12] so we can restore rsp by realigning to 64.
215 Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
216 in the loop. */
217 xorl %ebp, %ebp
218 bsfl %ebx, %ebp
219
220 /* Scalar math function call to process special input. */
221 movss 40(%rsp, %rbp, 4), %xmm0
222 call tanhf@PLT
223 /* No good way to avoid the store-forwarding fault this will cause on
224 return. `lfence` avoids the SF fault but at greater cost as it
225 serialized stack/callee save restoration. */
226 movss %xmm0, 24(%rsp, %rbp, 4)
227
228 leal -1(%rbx), %eax
229 andl %eax, %ebx
230 jnz L(SPECIAL_VALUES_LOOP)
231 # LOE r12 r13 r14 r15
232 /* All results have been written to 24(%rsp). */
233 movups 24(%rsp), %xmm0
234 movq (%rsp), %rbx
235 cfi_restore(rbx)
236 movq 8(%rsp), %rbp
237 cfi_restore(rbp)
238 addq $56, %rsp
239 cfi_def_cfa_offset(8)
240 ret
241END(_ZGVbN4v_tanhf_sse4)
242

/* source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S */