1/* Function atanf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
31/* Byte offsets of the fields of the data table
   __svml_satan_data_internal_avx512 defined at the end of this file.
   Each field is a 64-byte vector holding 16 copies of one 32-bit
   constant, except Tbl_H (32 distinct entries, 128 bytes).  The three
   coefficient vectors are stored back to back.
32 */
33#define AbsMask 0 /* 0x7fffffff: every bit except the sign bit */
34#define Shifter 64 /* 0x4a000000 = 2^21 */
35#define MaxThreshold 128 /* 0x40f80000 = 7.75 */
36#define MOne 192 /* 0xbf800000 = -1.0 */
37#define One 256 /* 0x3f800000 = 1.0 */
38#define LargeX 320 /* 0x4f800000 = 2^32 */
39#define Zero 384 /* 0.0; not referenced by the code in this file */
40#define Tbl_H 448 /* 32-entry lookup table, two 64-byte halves */
41#define Pi2 576 /* 0x3fc90fdb, Pi/2 rounded to single precision */
42#define coeff_1 640 /* first coefficient vector (0xbe0fa8de) */
43#define coeff_2 704 /* second coefficient vector (0x3e4cc8e2) */
44#define coeff_3 768 /* third coefficient vector (0xbeaaaaaa) */
45
46#include <sysdep.h>
47
48 .section .text.evex512, "ax", @progbits
49ENTRY(_ZGVeN16v_atanf_skx)
 /* Input:  %zmm0 = 16 packed single-precision values x.
    Output: %zmm0 = 16 packed single-precision results.
    Writes %zmm0-%zmm15 and %k1.  Straight-line code: no branches,
    no calls, no stack accesses.

    Outline of what the instructions below compute, per lane:
      ax    = |x|
      base  = ax rounded to a multiple of 0.25
      DiffX = ax - base
      Y     = 1 + base * ax
      R     = DiffX / Y
      res   = Tbl_H[idx] + R + R^3 * (coeff_3 + R^2 * (coeff_2 + R^2 * coeff_1))
    Lanes with ax >= MaxThreshold use DiffX = -1, Y = min (ax, LargeX)
    and Pi2 instead of the table value.  The sign bit of x is XORed
    back into the result at the end.  */
50 vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7 /* zmm7 = |x| (AbsMask is at offset 0) */
51 vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3 /* zmm3 = MaxThreshold */
52 vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8 /* zmm8 = 1.0, accumulator for Y */
53
54 /* round to 2 bits after binary point: imm8 = 40 keeps 2 fraction
    bits, rounds to nearest even and suppresses the precision
    exception, so zmm5 = DiffX = |x| - base */
55 vreduceps $40, {sae}, %zmm7, %zmm5
56
57 /* saturate X range */
58 vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6 /* zmm6 = LargeX */
59 vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2 /* zmm2 = Shifter */
60 vcmpps $29, {sae}, %zmm3, %zmm7, %k1 /* k1 = |x| >= MaxThreshold (predicate 29 = GE_OQ, false for NaN) */
61
62 /* table lookup sequence */
63 vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3 /* zmm3 = first 16 entries of Tbl_H */
64 vsubps {rn-sae}, %zmm5, %zmm7, %zmm4 /* zmm4 = base = |x| - DiffX */
 /* Adding Shifter (2^21, whose unit in the last place is 0.25)
    leaves round (4 * |x|) in the low mantissa bits of zmm1; those
    bits are the table index consumed by vpermt2ps below.  */
65 vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
66 vxorps %zmm0, %zmm7, %zmm0 /* zmm0 = x ^ |x| = sign bit of x only */
67 vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8 /* zmm8 = Y = 1.0 + base * |x| */
68 vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4 /* zmm4 = coeff_2 (base is no longer needed) */
69
70 /* if |X| >= MaxThreshold, set DiffX = -1; zmm9 = k1 ? MOne : DiffX */
71 vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
72 vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5 /* zmm5 = coeff_3 */
73
74 /* if |X| >= MaxThreshold, set Y = min (LargeX, |X|); other lanes
    keep Y because the mask merges into zmm8 */
75 vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
76
77 /* R+Rl = DiffX/Y.  The quotient is formed from the normalized
    mantissas, and the exponent difference is applied with vscalefps
    at the end.  */
78 vgetmantps $0, {sae}, %zmm9, %zmm12 /* zmm12 = mantissa of DiffX */
79 vgetexpps {sae}, %zmm9, %zmm10 /* zmm10 = exponent of DiffX, as a float */
 /* zmm3 = Tbl_H[idx]: zmm3 holds entries 0-15, the memory operand
    holds entries 16-31 and zmm1 supplies the per-lane index.  */
80 vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
81 vgetmantps $0, {sae}, %zmm8, %zmm15 /* zmm15 = mantissa of Y */
82 vgetexpps {sae}, %zmm8, %zmm11 /* zmm11 = exponent of Y, as a float */
83 vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1 /* zmm1 = coeff_1 (index is no longer needed) */
84
85 /* set table value to Pi/2 for large X; zmm9 = k1 ? Pi2 : Tbl_H[idx] */
86 vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
87 vrcp14ps %zmm15, %zmm13 /* zmm13 = approximate 1 / mant(Y) */
88 vsubps {rn-sae}, %zmm11, %zmm10, %zmm2 /* zmm2 = exp(DiffX) - exp(Y) */
89 vmulps {rn-sae}, %zmm13, %zmm12, %zmm14 /* zmm14 = q0 = mant(DiffX) * rcp */
90 vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15 /* zmm15 = residual = mant(DiffX) - q0 * mant(Y) */
91 vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15 /* zmm15 = q0 + rcp * residual (one refinement step) */
92 vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7 /* zmm7 = R = refined quotient * 2^(exponent difference) */
93
94 /* polynomial evaluation: R + R^3 * (coeff_3 + R^2 * (coeff_2 + R^2 * coeff_1)) */
95 vmulps {rn-sae}, %zmm7, %zmm7, %zmm8 /* zmm8 = R^2 */
96 vmulps {rn-sae}, %zmm7, %zmm8, %zmm6 /* zmm6 = R^3 */
97 vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4 /* zmm4 = coeff_1 * R^2 + coeff_2 */
98 vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8 /* zmm8 = zmm4 * R^2 + coeff_3 */
99 vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8 /* zmm8 = R^3 * zmm8 + R */
100 vaddps {rn-sae}, %zmm9, %zmm8, %zmm10 /* zmm10 = table value (or Pi2) + polynomial */
101 vxorps %zmm0, %zmm10, %zmm0 /* zmm0 = result with the sign bit of x XORed in */
102 ret
103
104END(_ZGVeN16v_atanf_skx)
105
 /* Read-only constant table used by _ZGVeN16v_atanf_skx.  The field
    order and sizes below must stay in step with the offset macros
    (AbsMask ... coeff_3) defined near the top of this file.  */
106 .section .rodata, "a"
107 .align 64
108
 /* Layout description only: the guarding macro is not defined anywhere
    in this file, so the typedef is normally preprocessed away.  */
109#ifdef __svml_satan_data_internal_avx512_typedef
110typedef unsigned int VUINT32;
111typedef struct {
112 __declspec(align(64)) VUINT32 AbsMask[16][1];
113 __declspec(align(64)) VUINT32 Shifter[16][1];
114 __declspec(align(64)) VUINT32 MaxThreshold[16][1];
115 __declspec(align(64)) VUINT32 MOne[16][1];
116 __declspec(align(64)) VUINT32 One[16][1];
117 __declspec(align(64)) VUINT32 LargeX[16][1];
118 __declspec(align(64)) VUINT32 Zero[16][1];
119 __declspec(align(64)) VUINT32 Tbl_H[32][1];
120 __declspec(align(64)) VUINT32 Pi2[16][1];
121 __declspec(align(64)) VUINT32 coeff[3][16][1];
122} __svml_satan_data_internal_avx512;
123#endif
124__svml_satan_data_internal_avx512:
125 /* AbsMask: all bits except the sign bit */
126 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
127 /* Shifter: 2^21 */
128 .align 64
129 .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
130 /* MaxThreshold: 7.75 */
131 .align 64
132 .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
133 /* MOne: -1.0 */
134 .align 64
135 .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
136 /* One: 1.0 */
137 .align 64
138 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
139 /* LargeX: 2^32, upper bound applied to |x| in the large-argument lanes */
140 .align 64
141 .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
142 /* Zero: 0.0 (not loaded by the function in this file) */
143 .align 64
144 .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
145 /* Tbl_H: 32 single-precision entries; entry k appears to be
    atan (k * 0.25) (entry 0 is 0.0, entry 4 is Pi/4) */
146 .align 64
147 .long 0x00000000, 0x3e7adbb0
148 .long 0x3eed6338, 0x3f24bc7d
149 .long 0x3f490fdb, 0x3f6563e3
150 .long 0x3f7b985f, 0x3f869c79
151 .long 0x3f8db70d, 0x3f93877b
152 .long 0x3f985b6c, 0x3f9c6b53
153 .long 0x3f9fe0bb, 0x3fa2daa4
154 .long 0x3fa57088, 0x3fa7b46f
155 .long 0x3fa9b465, 0x3fab7b7a
156 .long 0x3fad1283, 0x3fae809e
157 .long 0x3fafcb99, 0x3fb0f836
158 .long 0x3fb20a6a, 0x3fb30581
159 .long 0x3fb3ec43, 0x3fb4c10a
160 .long 0x3fb585d7, 0x3fb63c64
161 .long 0x3fb6e62c, 0x3fb78478
162 .long 0x3fb81868, 0x3fb8a2f5
163 /* Pi2: Pi/2 rounded to single precision */
164 .align 64
165 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
166 /* coeff[3]: three consecutive 64-byte rows addressed as coeff_1,
    coeff_2 and coeff_3, in that order (approximately -0.1403,
    0.2000 and -0.3333) */
167 .align 64
168 .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
169 .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
170 .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
171 .align 64
172 .type __svml_satan_data_internal_avx512, @object
173 .size __svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512
174

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S