/* Function tanhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 * NOTE: Since the hyperbolic tangent function is odd
 *       (tanh(x) = -tanh(-x)), the algorithm below works with the absolute
 *       value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
 *
 * We use a table lookup method to compute tanh(|x|).
 * The basic idea is to split the input range into a number of subintervals
 * and to approximate tanh(.) with a polynomial on each of them.
 *
 * IEEE SPECIAL CONDITIONS:
 * x = [+, -]0, r = [+, -]0
 * x = +Inf,    r = +1
 * x = -Inf,    r = -1
 * x = QNaN,    r = QNaN
 * x = SNaN,    r = QNaN
 *
 *
 * ALGORITHM DETAILS
 * Special values are handled in a callout function, apart from the main
 * path computations.  "Special" for this algorithm means:
 * INF, NaN, |x| > HUGE_THRESHOLD
 *
 *
 * Main path computations are organized as follows:
 * We split the interval [0, SATURATION_THRESHOLD) into a number of
 * subintervals.  On each subinterval we approximate tanh(.) with a
 * minimax polynomial of pre-defined degree.  The polynomial coefficients
 * are computed beforehand and stored in a table.  We also use
 *
 *     y := |x| + B,
 *
 * where B depends on the subinterval and is used to bring the argument
 * closer to zero.
 * We also add a large artificial interval
 * [SATURATION_THRESHOLD, HUGE_THRESHOLD], whose stored coefficients are
 * 1.0 + 0.0*y + 0.0*y^2 + ... - this preserves the main path computation
 * logic while returning 1.0 for all arguments in that range.
 *
 * Hence the reconstruction looks as follows:
 * we fetch the polynomial and range reduction coefficients (Pj and B)
 * corresponding to the subinterval to which |x| belongs, and return
 *
 *     r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
 *
 * NOTE: we use a multiprecision technique to multiply and sum the first
 *       K terms of the polynomial.  To achieve wider-than-target precision,
 *       each Pj, j = 0..K, is stored in the table as a pair of
 *       target-precision numbers (Pj and PLj).
 *
 */
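
/* For exposition only (not part of the build): a scalar C sketch of the
   algorithm described above, not a line-by-line transcription of the
   vector code below.  The names `entries', `subinterval_index' and
   `NCOEFF' are illustrative assumptions; the real per-subinterval data
   is _lookupTable in svml_s_tanhf_rodata.S.

       #include <math.h>

       #define NCOEFF 4   // assumed polynomial degree + 1

       struct entry
       {
         double B;           // per-subinterval argument shift
         double P[NCOEFF];   // minimax polynomial coefficients
       };
       extern const struct entry entries[];
       // Hypothetical helper; the assembly below instead derives a byte
       // offset from the clamped exponent/mantissa bits of |x|.
       extern unsigned int subinterval_index (float ax);

       float
       tanhf_sketch (float x)
       {
         float ax = fabsf (x);
         const struct entry *e = &entries[subinterval_index (ax)];
         double y = (double) ax + e->B;   // range reduction: y := |x| + B
         double r = e->P[NCOEFF - 1];     // Horner evaluation
         for (int j = NCOEFF - 2; j >= 0; j--)
           r = r * y + e->P[j];
         return copysignf ((float) r, x); // tanh is odd: restore sign(x)
       }
 */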

#include <sysdep.h>

/* The tanhf data tables used by the AVX2 and SSE4 implementations are
   defined here.  */
#include "svml_s_tanhf_rodata.S"

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanhf_avx2)
	/* Here huge arguments, INF and NaNs are filtered out to the
	   callout.  */
	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2

	/* Clamp the offsets into the range [0, 0x04280000] in ymm2.  */
	vpxor	%ymm3, %ymm3, %ymm3
	vpmaxsd	%ymm3, %ymm2, %ymm2
	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2

	/* Small-table-specific variables; constant loading.  */
	vpsrld	$14, %ymm2, %ymm1

	/* We are splitting ymm1 into 8 GPR values.  This may be faster to do
	   with store/load as we could take advantage of store-forwarding.  */
	vmovq	%xmm1, %r8
	/* We have eliminated all negative values from ymm1, so there is no
	   need to sign-extend.  */
	movl	%r8d, %r9d
	shrq	$32, %r8

	/* Store the base of the lookup table in rax.  */
	leaq	TANHF_DATA(_lookupTable)(%rip), %rax

	/* Instead of using cross-lane permutes on ymm vectors, use
	   vinsertf128 with a memory operand.  This helps alleviate the
	   bottleneck on port 5.  */
	vmovupd	16(%r9, %rax), %xmm5

	vpextrq	$1, %xmm1, %rsi
	movl	%esi, %edi
	shrq	$32, %rsi

	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5

	vextracti128 $1, %ymm1, %xmm2
	vmovq	%xmm2, %rdx
	movl	%edx, %ecx
	shrq	$32, %rdx

	vmovupd	(%rcx, %rax), %xmm6

	vpextrq	$1, %xmm2, %r10
	movl	%r10d, %r11d
	shrq	$32, %r10

	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6

	vmovupd	16(%r8, %rax), %xmm1
	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
	vmovupd	(%rdx, %rax), %xmm3
	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3

	vunpcklpd %ymm3, %ymm6, %ymm7
	vunpckhpd %ymm3, %ymm6, %ymm6

	vunpcklpd %ymm1, %ymm5, %ymm3
	vunpckhpd %ymm1, %ymm5, %ymm1

	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
	/* Store the special-case mask in ymm15.  */
	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15

	vandps	%ymm11, %ymm0, %ymm4

	vcvtps2pd %xmm4, %ymm5

	vextractf128 $1, %ymm4, %xmm4
	vcvtps2pd %xmm4, %ymm4

	vmovupd	16(%rcx, %rax), %xmm2
	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2

	vfmadd213pd %ymm3, %ymm5, %ymm1

	vmovupd	16(%rdx, %rax), %xmm3
	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3

	vunpcklpd %ymm3, %ymm2, %ymm10
	vunpckhpd %ymm3, %ymm2, %ymm2

	vfmadd213pd %ymm10, %ymm4, %ymm2
	vfmadd213pd %ymm6, %ymm4, %ymm2
	vfmadd213pd %ymm7, %ymm4, %ymm2
	vcvtpd2ps %ymm2, %xmm2

	vmovupd	(%r9, %rax), %xmm7
	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7

	vmovupd	(%r8, %rax), %xmm3
	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3

	vunpckhpd %ymm3, %ymm7, %ymm4
	vunpcklpd %ymm3, %ymm7, %ymm7

	vfmadd213pd %ymm4, %ymm5, %ymm1
	vfmadd213pd %ymm7, %ymm5, %ymm1

	vcvtpd2ps %ymm1, %xmm1
	vinsertf128 $1, %xmm2, %ymm1, %ymm1

	vmovmskps %ymm15, %edx
	vandnps	%ymm0, %ymm11, %ymm2
	testl	%edx, %edx
	/* Go to the special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Wait until after the branch to write over ymm0.  */
	vorps	%ymm2, %ymm1, %ymm0
	/* No stack restoration on the fastpath.  */
	ret

	/* Cold case.  edx has 1s where there was a special value that needs
	   to be handled by a tanhf call.  Optimize more for code size than
	   for speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Use r13 to save/restore the stack.  This allows us to use rbp as a
	   callee-saved register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* We need callee-saved registers to preserve state across the tanhf
	   calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align the stack and make room for 2x ymm vectors.  */
	andq	$-32, %rsp
	addq	$-64, %rsp

	/* Save the results computed so far.  */
	vorps	%ymm2, %ymm1, %ymm1
	vmovaps	%ymm1, (%rsp)
	/* Save the original input (ymm0 is unchanged up to this point).  */
	vmovaps	%ymm0, 32(%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be handled
	   by a tanhf call.  */
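	/* In C terms, the loop below processes the special lanes as in this
	   illustrative sketch (`res' and `src' stand for the two stack
	   slots saved above; the names are for exposition only):

	       unsigned int mask = edx;
	       while (mask != 0)
	         {
	           int i = __builtin_ctz (mask);  // tzcntl
	           res[i] = tanhf (src[i]);       // scalar callout
	           mask &= mask - 1;              // blsrl: clear lowest bit
	         }
	 */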
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special value; it is preserved across
	   the calls to tanhf.  Technically we don't need a callee-saved
	   register here, as the offset from rsp is always in [0, 28], so we
	   could restore rsp by re-aligning to 64.  Essentially the tradeoff
	   is one extra save/restore versus two extra instructions in the
	   loop.  Re-aligning also costs more code size.  */
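	/* Zeroing ebp first breaks the false output dependency that tzcnt
	   has on some CPUs.  */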
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process a special input.  */
	vmovss	32(%rsp, %rbp, 4), %xmm0
	call	tanhf@PLT

	/* No good way to avoid the store-forwarding fault this will cause on
	   return.  `lfence` avoids the SF fault but at greater cost as it
	   serializes the stack/callee-save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15

	/* All results have been written to (%rsp).  */
	vmovups	(%rsp), %ymm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore the callee-saved registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_tanhf_avx2)