/* Function tanhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   NOTE: Since the hyperbolic tangent function is odd
 *   (tanh(x) = -tanh(-x)), the algorithm below deals with the absolute
 *   value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
 *
 *   We use a table lookup method to compute tanh(|x|).
 *   The basic idea is to split the input range into a number of subintervals
 *   and to approximate tanh(.) with a polynomial on each of them.
 *
 *   IEEE SPECIAL CONDITIONS:
 *   x = [+, -]0, r = [+, -]0
 *   x = +Inf,    r = +1
 *   x = -Inf,    r = -1
 *   x = QNaN,    r = QNaN
 *   x = SNaN,    r = QNaN
 *
 *
 *   ALGORITHM DETAILS
 *   We handle special values in a callout function, apart from the main
 *   path computations.  "Special" for this algorithm are:
 *   INF, NaN, |x| > HUGE_THRESHOLD
 *
 *
 *   Main path computations are organized as follows:
 *   we split the interval [0, SATURATION_THRESHOLD) into a number of
 *   subintervals.  On each subinterval we approximate tanh(.) with a
 *   minimax polynomial of pre-defined degree.  Polynomial coefficients
 *   are computed beforehand and stored in a table.  We also use a
 *   shifted argument
 *
 *   y := |x| + B,
 *
 *   where B depends on the subinterval and is chosen to bring the
 *   argument closer to zero.
 *   We also add a large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
 *   for which the stored coefficients are 1.0 + 0.0*y + 0.0*y^2 + ... - just
 *   to preserve the main path computation logic while returning 1.0 for all
 *   arguments in that range.
 *
 *   Hence the reconstruction looks as follows:
 *   we extract the polynomial and range reduction coefficients
 *   (Pj and B) corresponding to the subinterval to which |x| belongs,
 *   and return
 *
 *   r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
 *
 *   NOTE: we use a multiprecision technique to multiply and sum the first
 *   K terms of the polynomial, so each Pj, j = 0..K, is stored in the
 *   table as a pair of target-precision numbers (Pj and PLj) to achieve
 *   wider-than-target precision.
 *
 *
 */
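
/* Illustrative only, not part of the build: a scalar C sketch of the
   scheme described above, assuming <math.h>, <stdint.h> and <string.h>.
   The names HUGE_THRESHOLD_BITS, special_tanhf, subinterval_index, B[]
   and P*[] are hypothetical stand-ins - they do not reflect the actual
   layout of _lookupTable - and the degree-3 polynomial is only an
   example of the "pre-defined degree" mentioned above.

	static float
	tanhf_sketch (float x)
	{
	  uint32_t ax;
	  memcpy (&ax, &x, sizeof ax);		// bit pattern of x
	  ax &= 0x7fffffff;			// bits of |x|
	  if (ax > HUGE_THRESHOLD_BITS)		// Inf, NaN, |x| > HUGE_THRESHOLD
	    return special_tanhf (x);		// handled in the callout
	  unsigned i = subinterval_index (ax);	// subinterval of |x|
	  double y = (double) fabsf (x) + B[i];	// range reduction: y := |x| + B
	  double p = ((P3[i] * y + P2[i]) * y	// Horner evaluation of the
		      + P1[i]) * y + P0[i];	// minimax polynomial
	  return copysignf ((float) p, x);	// tanh(x) = sign(x) * tanh(|x|)
	}
*/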

#include <sysdep.h>

/* tanhf data tables for avx2 and sse4 implementations defined here.  */
#include "svml_s_tanhf_rodata.S"

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanhf_avx2)
	/* Here huge arguments, INF and NaNs are filtered out to the callout.  */
	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2

	/* Clamp the offset argument bits to [0, 0x04280000] in ymm2.  */
	vpxor	%ymm3, %ymm3, %ymm3
	vpmaxsd	%ymm3, %ymm2, %ymm2
	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2

	/* Small-table specific variables; constant loading.  */
	vpsrld	$14, %ymm2, %ymm1
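
	/* Per lane, the byte offset computed above is roughly (C-like
	   sketch; the mask names mirror the data-file symbols, and the
	   32-byte row size is implied by the paired 16-byte loads below):

	     int32_t ofs = (bits & _iExpMantMask) - _iMinIdxOfsMask;
	     ofs = ofs < 0 ? 0 : ofs;				// vpmaxsd with 0
	     ofs = ofs > _iMaxIdxMask ? _iMaxIdxMask : ofs;	// vpminsd
	     ofs = (uint32_t) ofs >> 14;	// byte offset of a table row

	   so each lane's value already scales to the row size and no extra
	   multiply is needed before the lookup.  */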

	/* We are splitting the eight indices in ymm1 across GPRs.  This may
	   be faster to do with store/load as we could take advantage of
	   store-forwarding.  */
	vmovq	%xmm1, %r8
	/* We have eliminated all negative values for ymm1 so there is no
	   need to sign extend.  */
	movl	%r8d, %r9d
	shrq	$32, %r8

	/* Store the base of the lookup table in rax.  */
	leaq	TANHF_DATA(_lookupTable)(%rip), %rax

	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
	   with a memory operand.  This helps alleviate the bottleneck on
	   port 5.  */
	vmovupd	16(%r9, %rax), %xmm5

	vpextrq	$1, %xmm1, %rsi
	movl	%esi, %edi
	shrq	$32, %rsi

	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5

	vextracti128 $1, %ymm1, %xmm2
	vmovq	%xmm2, %rdx
	movl	%edx, %ecx
	shrq	$32, %rdx

	vmovupd	(%rcx, %rax), %xmm6

	vpextrq	$1, %xmm2, %r10
	movl	%r10d, %r11d
	shrq	$32, %r10

	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6

	vmovupd	16(%r8, %rax), %xmm1
	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
	vmovupd	(%rdx, %rax), %xmm3
	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3

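	/* The unpacks below transpose the gathered table rows from
	   row-major order (one row per lane) into one ymm vector per
	   polynomial coefficient, so each FMA can consume a full vector
	   of like coefficients.  */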
	vunpcklpd %ymm3, %ymm6, %ymm7
	vunpckhpd %ymm3, %ymm6, %ymm6

	vunpcklpd %ymm1, %ymm5, %ymm3
	vunpckhpd %ymm1, %ymm5, %ymm1

	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
	/* Store the special-case mask in ymm15.  */
	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15

	vandps	%ymm11, %ymm0, %ymm4

	vcvtps2pd %xmm4, %ymm5

	vextractf128 $1, %ymm4, %xmm4
	vcvtps2pd %xmm4, %ymm4

	vmovupd	16(%rcx, %rax), %xmm2
	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2

	vfmadd213pd %ymm3, %ymm5, %ymm1

	vmovupd	16(%rdx, %rax), %xmm3
	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3

	vunpcklpd %ymm3, %ymm2, %ymm10
	vunpckhpd %ymm3, %ymm2, %ymm2

	vfmadd213pd %ymm10, %ymm4, %ymm2
	vfmadd213pd %ymm6, %ymm4, %ymm2
	vfmadd213pd %ymm7, %ymm4, %ymm2
	vcvtpd2ps %ymm2, %xmm2

	vmovupd	(%r9, %rax), %xmm7
	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7

	vmovupd	(%r8, %rax), %xmm3
	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3

	vunpckhpd %ymm3, %ymm7, %ymm4
	vunpcklpd %ymm3, %ymm7, %ymm7

	vfmadd213pd %ymm4, %ymm5, %ymm1
	vfmadd213pd %ymm7, %ymm5, %ymm1

	vcvtpd2ps %ymm1, %xmm1
	vinsertf128 $1, %xmm2, %ymm1, %ymm1

	vmovmskps %ymm15, %edx
	vandnps	%ymm0, %ymm11, %ymm2
	testl	%edx, %edx
	/* Go to the special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Wait until after the branch to write over ymm0.  */
	vorps	%ymm2, %ymm1, %ymm0
	/* No stack restoration on the fastpath.  */
	ret
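
	/* The sign splice above is, per lane, roughly (C-like sketch;
	   poly_bits stands for the bit pattern of the nonnegative
	   polynomial result tanh(|x|)):

	     uint32_t r = poly_bits | (x_bits & 0x80000000);

	   i.e. ymm2 holds the original sign bits (x & ~_sAbsMask), which are
	   OR'd into the polynomial result to give sign(x) * tanh(|x|).  */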

	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by a tanhf call.  Optimize for code size
	   rather than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Use r13 to save/restore the stack.  This allows us to use rbp as
	   a callee-save register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* We need to save callee-saved registers to preserve state across
	   the tanhf calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align the stack and make room for 2x ymm vectors.  */
	andq	$-32, %rsp
	addq	$-64, %rsp

	/* Save the already computed results.  */
	vorps	%ymm2, %ymm1, %ymm1
	vmovaps	%ymm1, (%rsp)
	/* Save the original input (ymm0 unchanged up to this point).  */
	vmovaps	%ymm0, 32(%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be
	   handled by a tanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special value that is saved across the
	   calls to tanhf.  We technically don't need a callee-save register
	   here as the offset to rsp is always [0, 28], so we could restore
	   rsp by realigning to 64.  Essentially the tradeoff is 1 extra
	   save/restore vs 2 extra instructions in the loop.  Realigning
	   also costs more code size.  */
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process the special input.  */
	vmovss	32(%rsp, %rbp, 4), %xmm0
	call	tanhf@PLT

	/* There is no good way to avoid the store-forwarding fault this
	   will cause on return.  `lfence` avoids the SF fault but at
	   greater cost as it serializes the stack/callee-save
	   restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15
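
	/* The loop above is the usual iteration over set mask bits,
	   roughly (C sketch, assuming the BMI intrinsics from
	   <immintrin.h>; in[] and out[] stand for the two stack slots):

	     while (mask)			// 1s mark special lanes
	       {
	         unsigned i = _tzcnt_u32 (mask);	// lowest special lane
	         out[i] = tanhf (in[i]);	// scalar call for that lane
	         mask = _blsr_u32 (mask);	// clear the lowest set bit
	       }
	*/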

	/* All results have been written to (%rsp).  */
	vmovups	(%rsp), %ymm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee-save registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_tanhf_avx2)