/* Function tanhf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   NOTE: Since the hyperbolic tangent function is odd
 *   (tanh(x) = -tanh(-x)), the algorithm below deals with the absolute
 *   value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
 *
 *   We use a table lookup method to compute tanh(|x|).
 *   The basic idea is to split the input range into a number of subintervals
 *   and to approximate tanh(.) with a polynomial on each of them.
 *
 *   IEEE SPECIAL CONDITIONS:
 *   x = [+, -]0, r = [+, -]0
 *   x = +Inf,    r = +1
 *   x = -Inf,    r = -1
 *   x = QNaN,    r = QNaN
 *   x = SNaN,    r = QNaN
 *
 *
 *   ALGORITHM DETAILS
 *   We handle special values in a callout function, apart from the main
 *   path computations.  "Special" for this algorithm are:
 *   INF, NaN, |x| > HUGE_THRESHOLD
 *
 *
 *   Main path computations are organized as follows:
 *   we split the interval [0, SATURATION_THRESHOLD) into a number of
 *   subintervals.  On each subinterval we approximate tanh(.) with a
 *   minimax polynomial of pre-defined degree.  Polynomial coefficients
 *   are computed beforehand and stored in a table.  We also use a
 *   shifted argument
 *
 *   y := |x| + B,
 *
 *   where B depends on the subinterval and is chosen to bring the
 *   argument closer to zero.
 *   We also add a large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
 *   for which the stored coefficients are 1.0 + 0.0*y + 0.0*y^2 + ... - just
 *   to preserve the main path computation logic while returning 1.0 for all
 *   arguments in that range.
 *
 *   Hence the reconstruction looks as follows:
 *   we extract the polynomial and range reduction coefficients
 *   (Pj and B) corresponding to the subinterval to which |x| belongs,
 *   and return
 *
 *   r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
 *
 *   NOTE: we use a multiprecision technique to multiply and sum the first
 *   K terms of the polynomial, so each Pj, j = 0..K, is stored in the
 *   table as a pair of target-precision numbers (Pj and PLj) to achieve
 *   wider-than-target precision.
 *
 *
 */
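
/* Illustrative only, not part of the build: a scalar C sketch of the
   scheme described above, assuming <math.h>, <stdint.h> and <string.h>.
   The names HUGE_THRESHOLD_BITS, special_tanhf, subinterval_index, B[]
   and P*[] are hypothetical stand-ins - they do not reflect the actual
   layout of _lookupTable - and the degree-3 polynomial is only an
   example of the "pre-defined degree" mentioned above.

	static float
	tanhf_sketch (float x)
	{
	  uint32_t ax;
	  memcpy (&ax, &x, sizeof ax);		// bit pattern of x
	  ax &= 0x7fffffff;			// bits of |x|
	  if (ax > HUGE_THRESHOLD_BITS)		// Inf, NaN, |x| > HUGE_THRESHOLD
	    return special_tanhf (x);		// handled in the callout
	  unsigned i = subinterval_index (ax);	// subinterval of |x|
	  double y = (double) fabsf (x) + B[i];	// range reduction: y := |x| + B
	  double p = ((P3[i] * y + P2[i]) * y	// Horner evaluation of the
		      + P1[i]) * y + P0[i];	// minimax polynomial
	  return copysignf ((float) p, x);	// tanh(x) = sign(x) * tanh(|x|)
	}
*/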

#include <sysdep.h>

/* tanhf data tables for avx2 and sse4 implementations defined here.  */
#include "svml_s_tanhf_rodata.S"

	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanhf_avx2)
	/* Here huge arguments, INF and NaNs are filtered out to the callout.  */
	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2

	/* Clamp the offset argument bits to [0, 0x04280000] in ymm2.  */
	vpxor	%ymm3, %ymm3, %ymm3
	vpmaxsd	%ymm3, %ymm2, %ymm2
	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2

	/* Small-table specific variables; constant loading.  */
	vpsrld	$14, %ymm2, %ymm1
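
	/* Per lane, the byte offset computed above is roughly (C-like
	   sketch; the mask names mirror the data-file symbols, and the
	   32-byte row size is implied by the paired 16-byte loads below):

	     int32_t ofs = (bits & _iExpMantMask) - _iMinIdxOfsMask;
	     ofs = ofs < 0 ? 0 : ofs;				// vpmaxsd with 0
	     ofs = ofs > _iMaxIdxMask ? _iMaxIdxMask : ofs;	// vpminsd
	     ofs = (uint32_t) ofs >> 14;	// byte offset of a table row

	   so each lane's value already scales to the row size and no extra
	   multiply is needed before the lookup.  */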

	/* We are splitting the eight indices in ymm1 across GPRs.  This may
	   be faster to do with store/load as we could take advantage of
	   store-forwarding.  */
	vmovq	%xmm1, %r8
	/* We have eliminated all negative values for ymm1 so there is no
	   need to sign extend.  */
	movl	%r8d, %r9d
	shrq	$32, %r8

	/* Store the base of the lookup table in rax.  */
	leaq	TANHF_DATA(_lookupTable)(%rip), %rax

	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
	   with a memory operand.  This helps alleviate the bottleneck on
	   port 5.  */
	vmovupd	16(%r9, %rax), %xmm5

	vpextrq	$1, %xmm1, %rsi
	movl	%esi, %edi
	shrq	$32, %rsi

	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5

	vextracti128 $1, %ymm1, %xmm2
	vmovq	%xmm2, %rdx
	movl	%edx, %ecx
	shrq	$32, %rdx

	vmovupd	(%rcx, %rax), %xmm6

	vpextrq	$1, %xmm2, %r10
	movl	%r10d, %r11d
	shrq	$32, %r10

	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6

	vmovupd	16(%r8, %rax), %xmm1
	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
	vmovupd	(%rdx, %rax), %xmm3
	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3

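	/* The unpacks below transpose the gathered table rows from
	   row-major order (one row per lane) into one ymm vector per
	   polynomial coefficient, so each FMA can consume a full vector
	   of like coefficients.  */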
	vunpcklpd %ymm3, %ymm6, %ymm7
	vunpckhpd %ymm3, %ymm6, %ymm6

	vunpcklpd %ymm1, %ymm5, %ymm3
	vunpckhpd %ymm1, %ymm5, %ymm1

	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
	/* Store the special-case mask in ymm15.  */
	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15

	vandps	%ymm11, %ymm0, %ymm4

	vcvtps2pd %xmm4, %ymm5

	vextractf128 $1, %ymm4, %xmm4
	vcvtps2pd %xmm4, %ymm4

	vmovupd	16(%rcx, %rax), %xmm2
	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2

	vfmadd213pd %ymm3, %ymm5, %ymm1

	vmovupd	16(%rdx, %rax), %xmm3
	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3

	vunpcklpd %ymm3, %ymm2, %ymm10
	vunpckhpd %ymm3, %ymm2, %ymm2

	vfmadd213pd %ymm10, %ymm4, %ymm2
	vfmadd213pd %ymm6, %ymm4, %ymm2
	vfmadd213pd %ymm7, %ymm4, %ymm2
	vcvtpd2ps %ymm2, %xmm2

	vmovupd	(%r9, %rax), %xmm7
	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7

	vmovupd	(%r8, %rax), %xmm3
	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3

	vunpckhpd %ymm3, %ymm7, %ymm4
	vunpcklpd %ymm3, %ymm7, %ymm7

	vfmadd213pd %ymm4, %ymm5, %ymm1
	vfmadd213pd %ymm7, %ymm5, %ymm1

	vcvtpd2ps %ymm1, %xmm1
	vinsertf128 $1, %xmm2, %ymm1, %ymm1

	vmovmskps %ymm15, %edx
	vandnps	%ymm0, %ymm11, %ymm2
	testl	%edx, %edx
	/* Go to the special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Wait until after the branch to write over ymm0.  */
	vorps	%ymm2, %ymm1, %ymm0
	/* No stack restoration on the fastpath.  */
	ret
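
	/* The sign splice above is, per lane, roughly (C-like sketch;
	   poly_bits stands for the bit pattern of the nonnegative
	   polynomial result tanh(|x|)):

	     uint32_t r = poly_bits | (x_bits & 0x80000000);

	   i.e. ymm2 holds the original sign bits (x & ~_sAbsMask), which are
	   OR'd into the polynomial result to give sign(x) * tanh(|x|).  */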

	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by a tanhf call.  Optimize for code size
	   rather than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
	/* Use r13 to save/restore the stack.  This allows us to use rbp as
	   a callee-save register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* We need to save callee-saved registers to preserve state across
	   the tanhf calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align the stack and make room for 2x ymm vectors.  */
	andq	$-32, %rsp
	addq	$-64, %rsp

	/* Save the already computed results.  */
	vorps	%ymm2, %ymm1, %ymm1
	vmovaps	%ymm1, (%rsp)
	/* Save the original input (ymm0 unchanged up to this point).  */
	vmovaps	%ymm0, 32(%rsp)

	vzeroupper

	/* edx has 1s where there was a special value that needs to be
	   handled by a tanhf call.  */
	movl	%edx, %ebx
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index of the special value that is saved across the
	   calls to tanhf.  We technically don't need a callee-save register
	   here as the offset to rsp is always [0, 28], so we could restore
	   rsp by realigning to 64.  Essentially the tradeoff is 1 extra
	   save/restore vs 2 extra instructions in the loop.  Realigning
	   also costs more code size.  */
	xorl	%ebp, %ebp
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process the special input.  */
	vmovss	32(%rsp, %rbp, 4), %xmm0
	call	tanhf@PLT

	/* There is no good way to avoid the store-forwarding fault this
	   will cause on return.  `lfence` avoids the SF fault but at
	   greater cost as it serializes the stack/callee-save
	   restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15
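
	/* The loop above is the usual iteration over set mask bits,
	   roughly (C sketch, assuming the BMI intrinsics from
	   <immintrin.h>; in[] and out[] stand for the two stack slots):

	     while (mask)			// 1s mark special lanes
	       {
	         unsigned i = _tzcnt_u32 (mask);	// lowest special lane
	         out[i] = tanhf (in[i]);	// scalar call for that lane
	         mask = _blsr_u32 (mask);	// clear the lowest set bit
	       }
	*/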

	/* All results have been written to (%rsp).  */
	vmovups	(%rsp), %ymm0
	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)
	/* Restore callee-save registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVdN8v_tanhf_avx2)