1/* Function atan vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
31/* Offsets for data table __svml_datan_data_internal_avx512
 *
 * Each name is a byte offset from the table label and is used in the code
 * as  Name+__svml_datan_data_internal_avx512(%rip).
 *
 * Every constant occupies one 64-byte row (8 x .quad), so the offsets
 * advance by 64.  The exception is Tbl_H, which holds 32 quadwords
 * (256 bytes); that is why dIndexMed follows at 448 + 256 = 704.
 * coeff_1 .. coeff_6 address the six consecutive 64-byte rows stored
 * under the "coeff6" label, in storage order.
 *
 * These numbers are hard-coded: they must be kept in step with the order
 * and size of the rows in the table definition.
32 */
33#define AbsMask 0
34#define Shifter 64
35#define MaxThreshold 128
36#define MOne 192
37#define One 256
38#define LargeX 320
39#define Zero 384
40#define Tbl_H 448
41#define dIndexMed 704
42#define Pi2 768
43#define coeff_1 832
44#define coeff_2 896
45#define coeff_3 960
46#define coeff_4 1024
47#define coeff_5 1088
48#define coeff_6 1152
49
50#include <sysdep.h>
51
52 .section .text.evex512, "ax", @progbits
/*
 * _ZGVeN8v_atan_skx
 *
 * Input:  %zmm0 = eight doubles x.
 * Output: %zmm0 = atan(x) per lane.
 * Writes: %zmm0-%zmm15, %k1, %k2.  Straight-line code: no branches and no
 *         stack accesses; per-lane cases are selected with mask registers.
 *
 * Outline of what the instructions below compute (AT&T operand order,
 * destination last):
 *
 *   ax   = x & AbsMask                 sign = x ^ ax
 *   c    = (ax + Shifter) - Shifter    -- ax rounded to a multiple of 0.25
 *   idx  = low bits of the bit pattern of (ax + Shifter)
 *   num  = ax - c  (vreducepd)         den  = 1 + c*ax
 *   base = Tbl_H[idx]
 *
 *   lanes with ax >= MaxThreshold (mask %k2) use instead
 *   num  = -1                          den  = min(ax, LargeX)
 *   base = Pi2
 *
 *   r    = num / den, formed from mantissas and exponents separately:
 *          vrcp14pd estimate, FMA refinement, then vscalefpd
 *   res  = base + r + r^3 * P(r^2)     -- P uses coeff_1 .. coeff_6
 *   return res ^ sign
 *
 * NOTE(review): the ALGORITHM DESCRIPTION comment at the top of the file
 * lists five fixed break points; the code here uses the 32-entry Tbl_H
 * with a step of 0.25 instead.
 */
53ENTRY(_ZGVeN8v_atan_skx)
54 vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4 /* zmm4 = Shifter */
55 vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3 /* zmm3 = MaxThreshold */
56 vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9 /* zmm9 = 1.0; becomes den below */
57
58 /* saturate X range */
59 vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7 /* zmm7 = LargeX */
60 vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8 /* zmm8 = ax = x & AbsMask (AbsMask is offset 0) */
61
62 /* R+Rl = DiffX/Y */
63 vbroadcastsd .FLT_10(%rip), %zmm15 /* zmm15 = 1.0 in every lane */
64 vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2 /* zmm2 = ax + Shifter; its low mantissa bits are idx */
65 vxorpd %zmm0, %zmm8, %zmm1 /* zmm1 = x ^ ax = sign bit of x */
66 vcmppd $29, {sae}, %zmm3, %zmm8, %k2 /* k2 = (ax >= MaxThreshold); predicate 29 is ordered >=, false for NaN */
67
68 /* round to 2 bits after binary point */
69 vreducepd $40, {sae}, %zmm8, %zmm6 /* zmm6 = DiffX = ax - (ax rounded to 2 fraction bits); imm 40 = 0x28 */
70 vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5 /* zmm5 = c = (ax + Shifter) - Shifter */
71
72 /*
73 * if|X|>=MaxThreshold, set DiffX=-1
74 * VMSUB(D, DiffX, LargeMask, Zero, One);
75 */
76 vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2} /* zmm10 = num = k2 ? -1.0 : DiffX */
77 vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9 /* zmm9 = den = c*ax + 1.0 */
78 vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5 /* zmm5 = dIndexMed (c is no longer needed) */
79
80 /* table lookup sequence */
81 vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6 /* zmm6 = Tbl_H[0..7] */
82 vgetmantpd $0, {sae}, %zmm10, %zmm14 /* zmm14 = mantissa of num */
83 vgetexppd {sae}, %zmm10, %zmm11 /* zmm11 = exponent of num, as a double */
84 vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10 /* zmm10 = coeff_5 (num now lives in zmm14/zmm11) */
85
86 /*
87 * if|X|>=MaxThreshold, set Y=X
88 * VMADD(D, Y, LargeMask, X, Zero);
89 */
90 vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2} /* k2 lanes: den = min(LargeX, ax); other lanes keep 1 + c*ax */
91 vcmppd $29, {sae}, %zmm5, %zmm2, %k1 /* k1 = (ax + Shifter >= dIndexMed): upper half of Tbl_H */
92 vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7 /* zmm7 = Tbl_H[16..23] */
93 vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8 /* zmm8 = coeff_1 (ax is dead from here) */
94 vgetmantpd $0, {sae}, %zmm9, %zmm3 /* zmm3 = mantissa of den */
95 vgetexppd {sae}, %zmm9, %zmm12 /* zmm12 = exponent of den, as a double */
96 vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9 /* zmm9 = coeff_3 */
97 vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6 /* zmm6 = pick from Tbl_H[0..15], indexed by zmm2 */
98 vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4 /* zmm4 = exponent(num) - exponent(den) */
99 vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7 /* zmm7 = pick from Tbl_H[16..31], indexed by zmm2 */
100 vrcp14pd %zmm3, %zmm13 /* zmm13 = rcp, approximate 1 / mantissa(den) */
101 vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12 /* zmm12 = coeff_4 */
102 vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11 /* zmm11 = coeff_6 */
103 vblendmpd %zmm7, %zmm6, %zmm2{%k1} /* zmm2 = table value = k1 ? upper-half pick : lower-half pick */
104 vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0 /* zmm0 = q0 = mantissa(num) * rcp */
105 vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15 /* zmm15 = e = 1.0 - rcp * mantissa(den) */
106 vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3 /* zmm3 = rem = mantissa(num) - q0 * mantissa(den) */
107 vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15 /* zmm15 = e + e^2 */
108 vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15 /* zmm15 = rcp + rcp*(e + e^2), refined reciprocal */
109 vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3 /* zmm3 = q0 + rem * refined reciprocal */
110 vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0 /* zmm0 = r = zmm3 scaled by 2^(exponent difference) */
111
112 /* set table value to Pi/2 for large X */
113 vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2} /* zmm3 = base = k2 ? Pi2 : table value */
114 vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2 /* zmm2 = coeff_2 */
115
116 /* polynomial evaluation */
117 vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14 /* zmm14 = r^2 */
118 vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13 /* zmm13 = r^4 */
119 vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15 /* zmm15 = r^3 */
120 vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2 /* zmm2 = coeff_1*r^2 + coeff_2 */
121 vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12 /* zmm12 = coeff_3*r^2 + coeff_4 */
122 vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14 /* zmm14 = coeff_5*r^2 + coeff_6 */
123 vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2 /* zmm2 = zmm2*r^4 + zmm12 */
124 vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2 /* zmm2 = zmm2*r^4 + zmm14 = P(r^2) */
125 vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2 /* zmm2 = r + r^3 * P(r^2) */
126 vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0 /* zmm0 = base + zmm2 */
127 vxorpd %zmm1, %zmm0, %zmm0 /* put the sign of x back on the result */
128 ret
129
130END(_ZGVeN8v_atan_skx)
131
/*
 * Constant table for _ZGVeN8v_atan_skx.  Each constant is replicated in
 * all eight lanes of a 64-byte row so it can be loaded straight into a
 * zmm register.  The code addresses the rows through hard-coded byte
 * offsets (the AbsMask .. coeff_6 macros), so row order and row size
 * must not change.
 */
132 .section .rodata, "a"
133 .align 64
134
/*
 * C-style description of the layout below.  It is only seen by the
 * preprocessor output if __svml_datan_data_internal_avx512_typedef is
 * defined; nothing in this file defines it.
 */
135#ifdef __svml_datan_data_internal_avx512_typedef
136typedef unsigned int VUINT32;
137typedef struct {
138 __declspec(align(64)) VUINT32 AbsMask[8][2];
139 __declspec(align(64)) VUINT32 Shifter[8][2];
140 __declspec(align(64)) VUINT32 MaxThreshold[8][2];
141 __declspec(align(64)) VUINT32 MOne[8][2];
142 __declspec(align(64)) VUINT32 One[8][2];
143 __declspec(align(64)) VUINT32 LargeX[8][2];
144 __declspec(align(64)) VUINT32 Zero[8][2];
145 __declspec(align(64)) VUINT32 Tbl_H[32][2];
146 __declspec(align(64)) VUINT32 dIndexMed[8][2];
147 __declspec(align(64)) VUINT32 Pi2[8][2];
148 __declspec(align(64)) VUINT32 coeff[6][8][2];
149} __svml_datan_data_internal_avx512;
150#endif
151__svml_datan_data_internal_avx512:
152 /* AbsMask: every bit except the sign bit; x & AbsMask = |x| */
153 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
154 /* Shifter: 1.5 * 2^50.  One ulp at this magnitude is 0.25, so adding
 it to |x| rounds |x| to a multiple of 0.25 and leaves that multiple
 in the low mantissa bits, where it serves as the Tbl_H index. */
155 .align 64
156 .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
157 /* MaxThreshold: 7.875; lanes with |x| at or above it take the
 "large X" path (-1/|x| with base Pi2) instead of the table. */
158 .align 64
159 .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
160 /* MOne: -1.0, the numerator used for large |x| */
161 .align 64
162 .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
163 /* One: 1.0, the additive term of the denominator 1 + c*|x| */
164 .align 64
165 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
166 /* LargeX: 2^128, upper clamp applied to |x| on the large-X path */
167 .align 64
168 .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
169 /* Zero: 0.0.  Not loaded by the code visible in this file; the row
 still has to stay so that the later offsets remain valid. */
170 .align 64
171 .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
172 /* Tbl_H: 32 doubles, four 64-byte groups of eight, selected with two
 vpermt2pd lookups and a blend.  The entries are consistent with
 atan(i * 0.25) for i = 0..31 (entry 0 is 0.0, entry 4 is pi/4). */
173 .align 64
174 .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
175 .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
176 .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
177 .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
178 .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
179 .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
180 .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
181 .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
182 .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
183 .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
184 .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
185 .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
186 .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
187 .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
188 .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
189 .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
190 /* dIndexMed: bit pattern of Shifter plus 16, i.e. Shifter + 4.0.
 Comparing (|x| + Shifter) against it tells whether the index is
 16 or more, which selects the upper half of Tbl_H. */
191 .align 64
192 .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
193 /* Pi2: pi/2, the base value for the large-X path */
194 .align 64
195 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
196 /* coeff6: six rows addressed as coeff_1 .. coeff_6, in this order.
 The code evaluates coeff_1*t^5 + coeff_2*t^4 + ... + coeff_5*t + coeff_6
 with t = r^2, so the first row is the highest-degree coefficient and
 the last row (about -1/3) is the constant term. */
197 .align 64
198 .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
199 .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
200 .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
201 .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
202 .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
203 .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
204 .align 64
205 .type __svml_datan_data_internal_avx512, @object
206 .size __svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512
/*
 * .FLT_10: one 8-byte scalar, emitted as two 32-bit words with the low
 * word first, i.e. the double 0x3ff0000000000000 = 1.0.
 * _ZGVeN8v_atan_skx broadcasts it into %zmm15 with vbroadcastsd and uses
 * it as the 1.0 in  e = 1.0 - rcp * mantissa(den)  while refining the
 * reciprocal estimate.
 */
207 .align 8
208
209.FLT_10:
210 .long 0x00000000, 0x3ff00000
211 .type .FLT_10, @object
212 .size .FLT_10, 8
213

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S