1 | /* Function tanhf vectorized with SSE4. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * NOTE: Since the hyperbolic tangent function is odd |
23 | * (tanh(x) = -tanh(-x)), below algorithm deals with the absolute |
24 | * value of the argument |x|: tanh(x) = sign(x) * tanh(|x|) |
25 | * |
26 | * We use a table lookup method to compute tanh(|x|). |
27 | * The basic idea is to split the input range into a number of subintervals |
28 | * and to approximate tanh(.) with a polynomial on each of them. |
29 | * |
30 | * IEEE SPECIAL CONDITIONS: |
31 | * x = [+, -]0, r = [+, -]0 |
32 | * x = +Inf, r = +1 |
33 | * x = -Inf, r = -1 |
34 | * x = QNaN, r = QNaN |
35 | * x = SNaN, r = QNaN |
36 | * |
37 | * |
38 | * ALGORITHM DETAILS |
39 | * We handle special values in a callout function, aside from main path |
40 | * computations. "Special" for this algorithm are: |
41 | * INF, NAN, |x| > HUGE_THRESHOLD |
42 | * |
43 | * |
44 | * Main path computations are organized as follows: |
45 | * Actually we split the interval [0, SATURATION_THRESHOLD) |
46 | * into a number of subintervals. On each subinterval we approximate tanh(.) |
47 | * with a minimax polynomial of pre-defined degree. Polynomial coefficients |
48 | * are computed beforehand and stored in table. We also use |
49 | * |
50 | * y := |x| + B, |
51 | * |
52 | * here B depends on subinterval and is used to make argument |
53 | * closer to zero. |
54 | * We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD], |
55 | * where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to |
56 | * preserve main path computation logic but return 1.0 for all arguments. |
57 | * |
58 | * Hence reconstruction looks as follows: |
59 | * we extract proper polynomial and range reduction coefficients |
60 | * (Pj and B), corresponding to subinterval, to which |x| belongs, |
61 | * and return |
62 | * |
63 | * r := sign(x) * (P0 + P1 * y + ... + Pn * y^n) |
64 | * |
65 | * NOTE: we use multiprecision technique to multiply and sum the first |
66 | * K terms of the polynomial. So Pj, j = 0..K are stored in |
67 | * table each as a pair of target precision numbers (Pj and PLj) to |
68 | * achieve wider than target precision. |
69 | * |
70 | * |
71 | */ |
72 | |
73 | |
74 | #include <sysdep.h> |
75 | |
76 | /* tanhf data tables for avx2 and sse4 implementations defined here. |
77 | */ |
78 | #define ONLY_DECL_OFFSET |
79 | #include "svml_s_tanhf_rodata.S" |
80 | |
81 | .section .text.sse4, "ax" , @progbits |
82 | ENTRY(_ZGVbN4v_tanhf_sse4) |
83 | /* Save copy of input in xmm12. */ |
84 | movaps %xmm0, %xmm12 |
85 | |
86 | /* Here huge arguments, INF and NaNs are filtered out to callout. */ |
87 | movdqu TANHF_DATA(_iExpMantMask)(%rip), %xmm3 |
88 | pand %xmm0, %xmm3 |
89 | |
90 | |
91 | /* Selection of arguments between [0, 0x04280000] into xmm3. */ |
92 | pxor %xmm7, %xmm7 |
93 | /* Save xmm3 for special values check at end. */ |
94 | movdqa %xmm3, %xmm8 |
95 | psubd TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3 |
96 | pmaxsd %xmm7, %xmm3 |
97 | pminsd TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3 |
98 | psrld $14, %xmm3 |
99 | |
100 | movq %xmm3, %rcx |
101 | movl %ecx, %edx |
102 | shrq $32, %rcx |
103 | |
104 | pshufd $0x0e, %xmm3, %xmm3 |
105 | movq %xmm3, %rdi |
106 | movl %edi, %esi |
107 | shrq $32, %rdi |
108 | |
109 | movaps TANHF_DATA(_sAbsMask)(%rip), %xmm1 |
110 | andps %xmm1, %xmm0 |
111 | |
112 | leaq TANHF_DATA(_lookupTable)(%rip), %rax |
113 | movups (%rdx, %rax), %xmm2 |
114 | movups (%rcx, %rax), %xmm6 |
115 | |
116 | /* |
117 | * small table specific variables * |
118 | * Constant loading |
119 | */ |
120 | movaps %xmm2, %xmm4 |
121 | movlhps %xmm6, %xmm4 |
122 | unpckhpd %xmm6, %xmm2 |
123 | |
124 | cvtps2pd %xmm0, %xmm6 |
125 | movhlps %xmm0, %xmm0 |
126 | cvtps2pd %xmm0, %xmm0 |
127 | |
128 | movups 16(%rdx, %rax), %xmm5 |
129 | movups 16(%rsi, %rax), %xmm13 |
130 | |
131 | movaps %xmm5, %xmm10 |
132 | movaps %xmm13, %xmm11 |
133 | |
134 | movups 16(%rcx, %rax), %xmm7 |
135 | movups 16(%rdi, %rax), %xmm3 |
136 | |
137 | unpckhpd %xmm7, %xmm5 |
138 | unpckhpd %xmm3, %xmm13 |
139 | |
140 | mulpd %xmm6, %xmm5 |
141 | mulpd %xmm0, %xmm13 |
142 | |
143 | movlhps %xmm7, %xmm10 |
144 | movlhps %xmm3, %xmm11 |
145 | |
146 | addpd %xmm10, %xmm5 |
147 | addpd %xmm11, %xmm13 |
148 | |
149 | mulpd %xmm6, %xmm5 |
150 | mulpd %xmm0, %xmm13 |
151 | |
152 | addpd %xmm2, %xmm5 |
153 | |
154 | movups (%rsi, %rax), %xmm2 |
155 | movups (%rdi, %rax), %xmm7 |
156 | |
157 | movaps %xmm2, %xmm3 |
158 | |
159 | unpckhpd %xmm7, %xmm2 |
160 | movlhps %xmm7, %xmm3 |
161 | |
162 | addpd %xmm13, %xmm2 |
163 | |
164 | mulpd %xmm5, %xmm6 |
165 | addpd %xmm4, %xmm6 |
166 | |
167 | mulpd %xmm2, %xmm0 |
168 | addpd %xmm3, %xmm0 |
169 | |
170 | cvtpd2ps %xmm0, %xmm2 |
171 | cvtpd2ps %xmm6, %xmm0 |
172 | |
173 | movlhps %xmm2, %xmm0 |
174 | andnps %xmm12, %xmm1 |
175 | orps %xmm1, %xmm0 |
176 | |
177 | /* xmm8 contains mask of special values. */ |
178 | pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8 |
179 | |
180 | movmskps %xmm8, %edx |
181 | testl %edx, %edx |
182 | |
183 | /* Go to special inputs processing branch */ |
184 | jne L(SPECIAL_VALUES_BRANCH) |
185 | # LOE rbx rbp r12 r13 r14 r15 xmm0 |
186 | /* No stack restoration on the fastpath. */ |
187 | ret |
188 | |
189 | /* Cold case. edx has 1s where there was a special value that |
190 | needs to be handled by a tanhf call. Optimize for code size |
191 | more so than speed here. */ |
192 | L(SPECIAL_VALUES_BRANCH): |
193 | # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12 |
194 | /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on |
195 | call entry will be 16-byte aligned. */ |
196 | subq $56, %rsp |
197 | cfi_def_cfa_offset(64) |
198 | movups %xmm0, 24(%rsp) |
199 | movups %xmm12, 40(%rsp) |
200 | |
201 | /* Use rbx/rbp for callee save registers as they get short |
202 | encoding for many instructions (as compared with r12/r13). */ |
203 | movq %rbx, (%rsp) |
204 | cfi_offset(rbx, -64) |
205 | movq %rbp, 8(%rsp) |
206 | cfi_offset(rbp, -56) |
207 | /* edx has 1s where there was a special value that needs to be handled |
208 | by a tanhf call. */ |
209 | movl %edx, %ebx |
210 | L(SPECIAL_VALUES_LOOP): |
211 | # LOE rbx rbp r12 r13 r14 r15 |
212 | /* use rbp as index for special value that is saved across calls to |
213 | tanhf. We technically don't need a callee save register here as offset |
214 | to rsp is always [0, 12] so we can restore rsp by realigning to 64. |
215 | Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions |
216 | in the loop. */ |
217 | xorl %ebp, %ebp |
218 | bsfl %ebx, %ebp |
219 | |
220 | /* Scalar math function call to process special input. */ |
221 | movss 40(%rsp, %rbp, 4), %xmm0 |
222 | call tanhf@PLT |
223 | /* No good way to avoid the store-forwarding fault this will cause on |
224 | return. `lfence` avoids the SF fault but at greater cost as it |
225 | serialized stack/callee save restoration. */ |
226 | movss %xmm0, 24(%rsp, %rbp, 4) |
227 | |
228 | leal -1(%rbx), %eax |
229 | andl %eax, %ebx |
230 | jnz L(SPECIAL_VALUES_LOOP) |
231 | # LOE r12 r13 r14 r15 |
232 | /* All results have been written to 24(%rsp). */ |
233 | movups 24(%rsp), %xmm0 |
234 | movq (%rsp), %rbx |
235 | cfi_restore(rbx) |
236 | movq 8(%rsp), %rbp |
237 | cfi_restore(rbp) |
238 | addq $56, %rsp |
239 | cfi_def_cfa_offset(8) |
240 | ret |
241 | END(_ZGVbN4v_tanhf_sse4) |
242 | |