1/* Function tanhf vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
 * NOTE: Since the hyperbolic tangent function is odd
 * (tanh(x) = -tanh(-x)), the algorithm below deals with the absolute
 * value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
25 *
26 * We use a table lookup method to compute tanh(|x|).
27 * The basic idea is to split the input range into a number of subintervals
28 * and to approximate tanh(.) with a polynomial on each of them.
29 *
30 * IEEE SPECIAL CONDITIONS:
31 * x = [+, -]0, r = [+, -]0
32 * x = +Inf, r = +1
33 * x = -Inf, r = -1
34 * x = QNaN, r = QNaN
35 * x = SNaN, r = QNaN
36 *
37 *
38 * ALGORITHM DETAILS
39 * We handle special values in a callout function, aside from main path
40 * computations. "Special" for this algorithm are:
41 * INF, NAN, |x| > HUGE_THRESHOLD
42 *
43 *
44 * Main path computations are organized as follows:
45 * Actually we split the interval [0, SATURATION_THRESHOLD)
46 * into a number of subintervals. On each subinterval we approximate tanh(.)
47 * with a minimax polynomial of pre-defined degree. Polynomial coefficients
48 * are computed beforehand and stored in table. We also use
49 *
50 * y := |x| + B,
51 *
52 * here B depends on subinterval and is used to make argument
53 * closer to zero.
54 * We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
55 * where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
56 * preserve main path computation logic but return 1.0 for all arguments.
57 *
58 * Hence reconstruction looks as follows:
59 * we extract proper polynomial and range reduction coefficients
60 * (Pj and B), corresponding to subinterval, to which |x| belongs,
61 * and return
62 *
63 * r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
64 *
65 * NOTE: we use multiprecision technique to multiply and sum the first
66 * K terms of the polynomial. So Pj, j = 0..K are stored in
67 * table each as a pair of target precision numbers (Pj and PLj) to
68 * achieve wider than target precision.
69 *
70 *
71 */
72
73
74#include <sysdep.h>
75
76/* tanhf data tables for avx2 and sse4 implementations defined here.
77 */
78#define ONLY_DECL_OFFSET
79#include "svml_s_tanhf_rodata.S"
80
81 .section .text.sse4, "ax", @progbits
82ENTRY(_ZGVbN4v_tanhf_sse4)
83 /* Save copy of input in xmm12. */
84 movaps %xmm0, %xmm12
85
86 /* Here huge arguments, INF and NaNs are filtered out to callout. */
87 movdqu TANHF_DATA(_iExpMantMask)(%rip), %xmm3
88 pand %xmm0, %xmm3
89
90
91 /* Selection of arguments between [0, 0x04280000] into xmm3. */
92 pxor %xmm7, %xmm7
93 /* Save xmm3 for special values check at end. */
94 movdqa %xmm3, %xmm8
95 psubd TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
96 pmaxsd %xmm7, %xmm3
97 pminsd TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
98 psrld $14, %xmm3
99
100 movq %xmm3, %rcx
101 movl %ecx, %edx
102 shrq $32, %rcx
103
104 pshufd $0x0e, %xmm3, %xmm3
105 movq %xmm3, %rdi
106 movl %edi, %esi
107 shrq $32, %rdi
108
109 movaps TANHF_DATA(_sAbsMask)(%rip), %xmm1
110 andps %xmm1, %xmm0
111
112 leaq TANHF_DATA(_lookupTable)(%rip), %rax
113 movups (%rdx, %rax), %xmm2
114 movups (%rcx, %rax), %xmm6
115
116 /*
117 * small table specific variables *
118 * Constant loading
119 */
120 movaps %xmm2, %xmm4
121 movlhps %xmm6, %xmm4
122 unpckhpd %xmm6, %xmm2
123
124 cvtps2pd %xmm0, %xmm6
125 movhlps %xmm0, %xmm0
126 cvtps2pd %xmm0, %xmm0
127
128 movups 16(%rdx, %rax), %xmm5
129 movups 16(%rsi, %rax), %xmm13
130
131 movaps %xmm5, %xmm10
132 movaps %xmm13, %xmm11
133
134 movups 16(%rcx, %rax), %xmm7
135 movups 16(%rdi, %rax), %xmm3
136
137 unpckhpd %xmm7, %xmm5
138 unpckhpd %xmm3, %xmm13
139
140 mulpd %xmm6, %xmm5
141 mulpd %xmm0, %xmm13
142
143 movlhps %xmm7, %xmm10
144 movlhps %xmm3, %xmm11
145
146 addpd %xmm10, %xmm5
147 addpd %xmm11, %xmm13
148
149 mulpd %xmm6, %xmm5
150 mulpd %xmm0, %xmm13
151
152 addpd %xmm2, %xmm5
153
154 movups (%rsi, %rax), %xmm2
155 movups (%rdi, %rax), %xmm7
156
157 movaps %xmm2, %xmm3
158
159 unpckhpd %xmm7, %xmm2
160 movlhps %xmm7, %xmm3
161
162 addpd %xmm13, %xmm2
163
164 mulpd %xmm5, %xmm6
165 addpd %xmm4, %xmm6
166
167 mulpd %xmm2, %xmm0
168 addpd %xmm3, %xmm0
169
170 cvtpd2ps %xmm0, %xmm2
171 cvtpd2ps %xmm6, %xmm0
172
173 movlhps %xmm2, %xmm0
174 andnps %xmm12, %xmm1
175 orps %xmm1, %xmm0
176
177 /* xmm8 contains mask of special values. */
178 pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
179
180 movmskps %xmm8, %edx
181 testl %edx, %edx
182
183 /* Go to special inputs processing branch */
184 jne L(SPECIAL_VALUES_BRANCH)
185 # LOE rbx rbp r12 r13 r14 r15 xmm0
186 /* No stack restoration on the fastpath. */
187 ret
188
189 /* Cold case. edx has 1s where there was a special value that
190 needs to be handled by a tanhf call. Optimize for code size
191 more so than speed here. */
192L(SPECIAL_VALUES_BRANCH):
193 # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
194 /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
195 call entry will be 16-byte aligned. */
196 subq $56, %rsp
197 cfi_def_cfa_offset(64)
198 movups %xmm0, 24(%rsp)
199 movups %xmm12, 40(%rsp)
200
201 /* Use rbx/rbp for callee save registers as they get short
202 encoding for many instructions (as compared with r12/r13). */
203 movq %rbx, (%rsp)
204 cfi_offset(rbx, -64)
205 movq %rbp, 8(%rsp)
206 cfi_offset(rbp, -56)
207 /* edx has 1s where there was a special value that needs to be handled
208 by a tanhf call. */
209 movl %edx, %ebx
210L(SPECIAL_VALUES_LOOP):
211 # LOE rbx rbp r12 r13 r14 r15
212 /* use rbp as index for special value that is saved across calls to
213 tanhf. We technically don't need a callee save register here as offset
214 to rsp is always [0, 12] so we can restore rsp by realigning to 64.
215 Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
216 in the loop. */
217 xorl %ebp, %ebp
218 bsfl %ebx, %ebp
219
220 /* Scalar math function call to process special input. */
221 movss 40(%rsp, %rbp, 4), %xmm0
222 call tanhf@PLT
223 /* No good way to avoid the store-forwarding fault this will cause on
224 return. `lfence` avoids the SF fault but at greater cost as it
225 serialized stack/callee save restoration. */
226 movss %xmm0, 24(%rsp, %rbp, 4)
227
228 leal -1(%rbx), %eax
229 andl %eax, %ebx
230 jnz L(SPECIAL_VALUES_LOOP)
231 # LOE r12 r13 r14 r15
232 /* All results have been written to 24(%rsp). */
233 movups 24(%rsp), %xmm0
234 movq (%rsp), %rbx
235 cfi_restore(rbx)
236 movq 8(%rsp), %rbp
237 cfi_restore(rbp)
238 addq $56, %rsp
239 cfi_def_cfa_offset(8)
240 ret
241END(_ZGVbN4v_tanhf_sse4)
242

/* source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S */