/* Function atan vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */

/* Byte offsets of the fields of the constant table
   __svml_datan_data_internal_avx512 defined at the end of this file.
   Each broadcast constant occupies one 32-byte slot (four doubles),
   Tbl_H and Tbl_L are 32-entry tables of doubles (256 bytes each) and
   coeff is a run of six consecutive 32-byte vectors, so the offsets
   below must stay in step with the order of the table's contents.  */
#define AbsMask				0
#define Shifter				32
#define MaxThreshold			64
#define MOne				96
#define One				128
#define LargeX				160
#define Zero				192
#define Tbl_H				224
#define Tbl_L				480
#define dIndexMed			736
#define Pi2				768
#define Pi2_low				800
#define coeff				832

#include <sysdep.h>

/* Vector atan for four packed doubles (AVX2 + FMA instructions).

   In:   ymm0 = x (four doubles).
   Out:  ymm0 = atan (x), computed lane by lane.
   Uses: rax, rcx, rdx, rsi, rdi and ymm1-ymm15 as scratch; the stack
	 is not touched.

   What the code below computes, with a = |x|:

     a <  MaxThreshold:  c = a rounded to a multiple of 1/4 (obtained by
			 adding and then subtracting Shifter),
			 R = (a - c) / (1 + a * c),   T = Tbl_H[4 * c]
     a >= MaxThreshold:  R = -1 / min (a, LargeX),    T = Pi2

     result = (T + R + R^3 * P (R^2)) with the sign of x re-applied,

   where P is the polynomial whose six coefficient vectors are stored
   at coeff, highest-order term first.  */
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN4v_atan_avx2)
	/* rdi = &Tbl_H[16].  The scalar loads below reach entries 0-15
	   with a -128 displacement and entries 16-31 with displacement 0.  */
	lea	Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi
	vmovupd	Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4
	vmovupd	One+__svml_datan_data_internal_avx512(%rip), %ymm9

	/* saturate X range */
	vmovupd	LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6
	/* ymm7 = |x|  (AbsMask is the field at offset 0 of the table).  */
	vandpd	__svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7
	/* ymm2 = |x| + Shifter: the low mantissa bits now hold the table
	   index.  */
	vaddpd	%ymm4, %ymm7, %ymm2
	/* ymm3 = all-ones in lanes where |x| >= MaxThreshold ("large").  */
	vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3
	/* ymm10 = min (LargeX, |x|): denominator for the large lanes.  */
	vminpd	%ymm7, %ymm6, %ymm10
	/* ymm5 = c, the rounded break-point (|x| + Shifter) - Shifter.  */
	vsubpd	%ymm4, %ymm2, %ymm5

	/*
	 * table lookup sequence
	 * VPERMUTE not available
	 */
	/* ymm13 = index bits scaled by 8 (size of one table entry).  */
	vpsllq	$3, %ymm2, %ymm13
	/* ymm8 = DiffX = |x| - c.  */
	vsubpd	%ymm5, %ymm7, %ymm8
	/* ymm2 = all-ones in lanes whose index is >= 16, i.e. lanes that
	   take their value from the upper half of Tbl_H.  */
	vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2
	/* ymm9 = Y = 1 + |x| * c.  */
	vfmadd231pd %ymm7, %ymm5, %ymm9
	/* ymm14 = byte offset inside a 16-entry half table, (index & 15) * 8;
	   the mask keeps every load below inside Tbl_H.  */
	vpand	.FLT_11(%rip), %ymm13, %ymm14
	/* Numerator: -1 for large lanes, DiffX otherwise.  */
	vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11
	/* Denominator: min (LargeX, |x|) for large lanes, Y otherwise.  */
	vblendvpd %ymm3, %ymm10, %ymm9, %ymm12
	/* ymm1 = x ^ |x| = the sign bit of every lane.  */
	vxorpd	%ymm0, %ymm7, %ymm1

	/* R = DiffX / Y  (or -1 / min (LargeX, |x|) for large lanes) */
	vdivpd	%ymm12, %ymm11, %ymm0

	/* Move the four byte offsets to general registers:
	   rax = lane 0, rdx = lane 1, rcx = lane 2, rsi = lane 3.  */
	vextractf128 $1, %ymm14, %xmm4
	vmovd	%xmm14, %eax
	vmovd	%xmm4, %ecx
	movslq	%eax, %rax
	vpextrd	$2, %xmm14, %edx
	movslq	%ecx, %rcx
	vpextrd	$2, %xmm4, %esi
	movslq	%edx, %rdx
	movslq	%esi, %rsi

	/* Load, for every lane, the candidate from the lower half
	   (displacement -128) and from the upper half (displacement 0)
	   of Tbl_H; the right one is selected later with the ymm2 mask.  */
	vmovsd	-128(%rax, %rdi), %xmm15
	vmovsd	(%rdi, %rax), %xmm7
	vmovsd	-128(%rcx, %rdi), %xmm5
	vmovsd	(%rdi, %rcx), %xmm9
	vmovhpd	-128(%rdx, %rdi), %xmm15, %xmm15
	vmovhpd	(%rdi, %rdx), %xmm7, %xmm8
	vmovhpd	-128(%rsi, %rdi), %xmm5, %xmm6
	vmovhpd	(%rdi, %rsi), %xmm9, %xmm10

	/* polynomial evaluation */
	/* ymm5 = R^2, ymm4 = R^4.  */
	vmulpd	%ymm0, %ymm0, %ymm5
	vmulpd	%ymm5, %ymm5, %ymm4
	/* ymm11 = lower-half candidates, ymm12 = upper-half candidates,
	   ymm13 = the table value actually selected for each lane.  */
	vinsertf128 $1, %xmm6, %ymm15, %ymm11
	vinsertf128 $1, %xmm10, %ymm8, %ymm12
	vblendvpd %ymm2, %ymm12, %ymm11, %ymm13
	/* Three independent pairs c[2k] * R^2 + c[2k+1] are formed first and
	   then combined with R^4 (Estrin-like grouping).  */
	vmovupd	coeff+__svml_datan_data_internal_avx512(%rip), %ymm8
	vmovupd	coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2
	/* ymm6 = R^3.  */
	vmulpd	%ymm5, %ymm0, %ymm6
	/* ymm8 = c0 * R^2 + c1,  ymm2 = c2 * R^2 + c3.  */
	vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8
	vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2

	/* set table value to Pi/2 for large X */
	vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7
	vmovupd	coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3
	/* ymm8 = (c0 * R^2 + c1) * R^4 + (c2 * R^2 + c3).  */
	vfmadd213pd %ymm2, %ymm4, %ymm8
	/* ymm5 = c4 * R^2 + c5  (R^2 is no longer needed afterwards).  */
	vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5
	/* ymm8 = P (R^2).  */
	vfmadd213pd %ymm5, %ymm4, %ymm8
	/* ymm8 = R + R^3 * P (R^2).  */
	vfmadd213pd %ymm0, %ymm6, %ymm8
	/* Add the table value (or Pi/2) and put the sign of x back.  */
	vaddpd	%ymm8, %ymm7, %ymm0
	vxorpd	%ymm1, %ymm0, %ymm0
	ret

END(_ZGVdN4v_atan_avx2)

/* Read-only constants used by _ZGVdN4v_atan_avx2.  */
	.section .rodata, "a"
	.align	32

/* Four 64-bit lanes, each equal to 0x78.  ANDed with the shifted index
   bits it leaves (index & 15) * 8, a byte offset into a 16-entry table
   of doubles.  */
.FLT_11:
	.long	0x00000078, 0x00000000, 0x00000078, 0x00000000, 0x00000078, 0x00000000, 0x00000078, 0x00000000
	.type	.FLT_11, @object
	.size	.FLT_11, 32
	.align	32

/* Constant table for the vector atan routine in this file.  The field
   order and sizes must match the offset macros defined near the top of
   the file.  The typedef below only documents the layout; it is
   compiled out unless __svml_datan_data_internal_avx512_typedef is
   defined.  */
#ifdef __svml_datan_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 AbsMask[4][2];
	__declspec(align(32)) VUINT32 Shifter[4][2];
	__declspec(align(32)) VUINT32 MaxThreshold[4][2];
	__declspec(align(32)) VUINT32 MOne[4][2];
	__declspec(align(32)) VUINT32 One[4][2];
	__declspec(align(32)) VUINT32 LargeX[4][2];
	__declspec(align(32)) VUINT32 Zero[4][2];
	__declspec(align(32)) VUINT32 Tbl_H[32][2];
	__declspec(align(32)) VUINT32 Tbl_L[32][2];
	__declspec(align(32)) VUINT32 dIndexMed[4][2];
	__declspec(align(32)) VUINT32 Pi2[4][2];
	__declspec(align(32)) VUINT32 Pi2_low[4][2];
	__declspec(align(32)) VUINT32 coeff[6][4][2];
} __svml_datan_data_internal_avx512;
#endif
__svml_datan_data_internal_avx512:
	/* AbsMask: clears the sign bit of a double.  */
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
	/* Shifter: 1.5 * 2^50, whose unit in the last place is 1/4.  */
	.align	32
	.quad	0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
	/* MaxThreshold: 7.875.  */
	.align	32
	.quad	0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
	/* MOne: -1.0.  */
	.align	32
	.quad	0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
	/* One: 1.0.  */
	.align	32
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* LargeX: 2^128.  */
	.align	32
	.quad	0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
	/* Zero: 0.0 (not referenced by the routine in this file).  */
	.align	32
	.quad	0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
	/* Tbl_H: 32 doubles, entry k being atan (k / 4) rounded to double
	   (entry 0 is 0, entry 4 is Pi/4).  */
	.align	32
	.quad	0x0000000000000000, 0x3fcf5b75f92c80dd
	.quad	0x3fddac670561bb4f, 0x3fe4978fa3269ee1
	.quad	0x3fe921fb54442d18, 0x3fecac7c57846f9e
	.quad	0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
	.quad	0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
	.quad	0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
	.quad	0x3ff3fc176b7a8560, 0x3ff45b54837351a0
	.quad	0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
	.quad	0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
	.quad	0x3ff5a25052114e60, 0x3ff5d013c41adabd
	.quad	0x3ff5f97315254857, 0x3ff61f06c6a92b89
	.quad	0x3ff6414d44094c7c, 0x3ff660b02c736a06
	.quad	0x3ff67d8863bc99bd, 0x3ff698213a9d5053
	.quad	0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
	.quad	0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
	.quad	0x3ff7030cf9403197, 0x3ff7145eac2088a4
	/* Tbl_L: 32 doubles, apparently the low-order correction terms
	   matching Tbl_H (entry 4 is half of Pi2_low); not referenced by
	   the routine in this file.  */
	.align	32
	.quad	0x0000000000000000, 0x3c68ab6e3cf7afbd
	.quad	0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
	.quad	0x3c81a62633145c07, 0x3c80dae13ad18a6b
	.quad	0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
	.quad	0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
	.quad	0x3c96254cb03bb199, 0xbc812c77e8a80f5c
	.quad	0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
	.quad	0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
	.quad	0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
	.quad	0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
	.quad	0xbc831151a43b51ca, 0xbc8487d50bceb1a5
	.quad	0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
	.quad	0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
	.quad	0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
	.quad	0xbc929c86447928e7, 0xbc8957a7170df016
	.quad	0xbc7cbe1896221608, 0xbc9fda5797b32a0b
	/* dIndexMed: the Shifter bit pattern plus 16, i.e. the value of
	   |x| + Shifter at which the table index reaches 16.  */
	.align	32
	.quad	0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
	/* Pi2: Pi/2 rounded to double.  */
	.align	32
	.quad	0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
	/* Pi2_low: low-order part of Pi/2 (not referenced by the routine in
	   this file).  */
	.align	32
	.quad	0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
	/* coeff: six broadcast polynomial coefficients, highest-order term
	   first; the last one (close to -1/3) multiplies R^3.  */
	.align	32
	.quad	0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
	.quad	0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
	.quad	0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
	.quad	0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
	.quad	0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
	.quad	0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
	.align	32
	.type	__svml_datan_data_internal_avx512, @object
	.size	__svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512


/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S */