1/* Function atanf vectorized with SSE4.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
18
19/*
20 * ALGORITHM DESCRIPTION:
21 *
22 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
28 *
29 */
30
31/* Offsets for data table __svml_satan_data_internal
 *
 * Each value is the byte offset of one table entry from the
 * __svml_satan_data_internal label.  Entries are 16 bytes apart (one
 * 4 x 32-bit vector each), so the offsets advance in steps of 16 and must
 * stay in the same order as the .long rows of the table.
 * The code forms addresses as  _sNAME+__svml_satan_data_internal(%rip).
32 */
33#define _sSIGN_MASK 0	/* sign-bit mask */
34#define _sABS_MASK 16	/* all-but-sign-bit mask */
35#define _sONE 32	/* 1.0f */
36#define _sPIO2 48	/* Pi/2 */
37#define _sPC8 64	/* polynomial coefficients, highest order first */
38#define _sPC7 80
39#define _sPC6 96
40#define _sPC5 112
41#define _sPC4 128
42#define _sPC3 144
43#define _sPC2 160
44#define _sPC1 176
45#define _sPC0 192	/* constant term of the polynomial */
46
47#include <sysdep.h>
48
49 .section .text.sse4, "ax", @progbits
/*
 * _ZGVbN4v_atanf_sse4: arctangent of four packed single-precision values.
 *
 * In:    %xmm0 = x (four lanes)
 * Out:   %xmm0 = atan(x), lane by lane
 * Uses:  %xmm1-%xmm10 as scratch and the constants in
 *        __svml_satan_data_internal.  No branches, no stack accesses and
 *        no general-purpose registers are used.
 *
 * Method implemented by the instructions below (same path for every lane):
 *   m    = all-ones where |x| <= 1.0, else zero
 *   |r|  = min(|x|, 1.0) / max(|x|, 1.0)
 *   r    = |r| with sign bit = sign(x) XOR (m ? 0 : 1)
 *   base = (m ? 0 : Pi/2) with the sign bit of x
 *   res  = base + r * P(t),  t = r^2,
 *          P(t) = PC0 + PC1*t + PC2*t^2 + ... + PC8*t^8
 * P is evaluated as two independent Horner chains in u = t^2 (one over
 * PC8/PC6/PC4/PC2, one over PC7/PC5/PC3/PC1) that are merged at the end.
 *
 * NOTE(review): the ALGORITHM DESCRIPTION comment near the top of this
 * file lists a five-interval reduction; the code here only performs the
 * two-case reduction described above.
 */
50ENTRY(_ZGVbN4v_atanf_sse4)
51 /*
52 * To use minps\maxps operations for argument reduction
53 * uncomment _AT_USEMINMAX_ definition
54 * Declarations
55 * Variables
56 * Constants
57 */
58 movups _sABS_MASK+__svml_satan_data_internal(%rip), %xmm2	/* xmm2 = abs mask */
59
60 /*
61 * 1) If x>1, then r=-1/x, PIO2=Pi/2
62 * 2) If -1<=x<=1, then r=x, PIO2=0
63 * 3) If x<-1, then r=-1/x, PIO2=-Pi/2
64 */
65 movups _sONE+__svml_satan_data_internal(%rip), %xmm1	/* xmm1 = 1.0 */
66 andps %xmm0, %xmm2	/* xmm2 = |x| */
67 movaps %xmm2, %xmm9	/* xmm9 = |x| (compare operand) */
68 movaps %xmm1, %xmm3	/* xmm3 = 1.0 */
69 cmpleps %xmm1, %xmm9	/* xmm9 = m: all-ones where |x| <= 1.0 */
70 maxps %xmm2, %xmm3	/* xmm3 = max(1.0, |x|)  (denominator) */
71 minps %xmm2, %xmm1	/* xmm1 = min(1.0, |x|)  (numerator) */
72 divps %xmm3, %xmm1	/* xmm1 = |r| = min / max */
73 movups __svml_satan_data_internal(%rip), %xmm4	/* xmm4 = _sSIGN_MASK (offset 0) */
74 movaps %xmm9, %xmm10	/* xmm10 = m, kept for the Pi/2 select below */
75 andps %xmm4, %xmm0	/* xmm0 = sign bit of x only */
76 andnps %xmm4, %xmm9	/* xmm9 = ~m & sign mask: sign bit set where m is clear */
77 pxor %xmm0, %xmm9	/* xmm9 ^= sign(x) */
78 pxor %xmm1, %xmm9	/* xmm9 = r: the sign bits xor'ed into |r| */
79
80 /* Polynomial. */
81 movaps %xmm9, %xmm8
82 mulps %xmm9, %xmm8	/* xmm8 = t = r^2 */
83 movaps %xmm8, %xmm7
84 mulps %xmm8, %xmm7	/* xmm7 = u = t^2 = r^4 */
85 movups _sPC8+__svml_satan_data_internal(%rip), %xmm6	/* E = PC8 */
86 mulps %xmm7, %xmm6	/* E = PC8*u */
87 movups _sPC7+__svml_satan_data_internal(%rip), %xmm5	/* O = PC7 */
88 mulps %xmm7, %xmm5	/* O = PC7*u */
89 addps _sPC6+__svml_satan_data_internal(%rip), %xmm6	/* E = PC8*u + PC6 */
90 mulps %xmm7, %xmm6	/* E *= u */
91 addps _sPC5+__svml_satan_data_internal(%rip), %xmm5	/* O = PC7*u + PC5 */
92 mulps %xmm7, %xmm5	/* O *= u */
93 addps _sPC4+__svml_satan_data_internal(%rip), %xmm6	/* E += PC4 */
94 mulps %xmm7, %xmm6	/* E *= u */
95 addps _sPC3+__svml_satan_data_internal(%rip), %xmm5	/* O += PC3 */
96 mulps %xmm5, %xmm7	/* xmm7 = O*u  (u itself is no longer needed) */
97 addps _sPC2+__svml_satan_data_internal(%rip), %xmm6	/* E = ((PC8*u+PC6)*u+PC4)*u + PC2 */
98 mulps %xmm8, %xmm6	/* xmm6 = t*E */
99 addps _sPC1+__svml_satan_data_internal(%rip), %xmm7	/* xmm7 = ((PC7*u+PC5)*u+PC3)*u + PC1 */
100 andnps _sPIO2+__svml_satan_data_internal(%rip), %xmm10	/* xmm10 = ~m & Pi/2: Pi/2 where m is clear, else 0 */
101 addps %xmm6, %xmm7	/* xmm7 = PC1 + PC2*t + ... + PC8*t^7 */
102 mulps %xmm7, %xmm8	/* xmm8 = t * xmm7 */
103 pxor %xmm0, %xmm10	/* xmm10 = base: give it the sign of x */
104 addps _sPC0+__svml_satan_data_internal(%rip), %xmm8	/* xmm8 = P(t) = PC0 + PC1*t + ... + PC8*t^8 */
105
106 /* Reconstruction. */
107 mulps %xmm8, %xmm9	/* xmm9 = r * P(t) */
108 addps %xmm9, %xmm10	/* xmm10 = base + r * P(t) */
109 movaps %xmm10, %xmm0	/* return value in xmm0 */
110 ret
111
112END(_ZGVbN4v_atanf_sse4)
113
114 .section .rodata, "a"
115 .align 16
116
/*
 * C-style description of the table layout.  It is only seen by the
 * preprocessor when __svml_satan_data_internal_typedef is defined, which
 * nothing in this file does; it mirrors the field order of the data below.
 */
117#ifdef __svml_satan_data_internal_typedef
118typedef unsigned int VUINT32;
119typedef struct {
120 __declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
121 __declspec(align(16)) VUINT32 _sABS_MASK[4][1];
122 __declspec(align(16)) VUINT32 _sONE[4][1];
123 __declspec(align(16)) VUINT32 _sPIO2[4][1];
124 __declspec(align(16)) VUINT32 _sPC8[4][1];
125 __declspec(align(16)) VUINT32 _sPC7[4][1];
126 __declspec(align(16)) VUINT32 _sPC6[4][1];
127 __declspec(align(16)) VUINT32 _sPC5[4][1];
128 __declspec(align(16)) VUINT32 _sPC4[4][1];
129 __declspec(align(16)) VUINT32 _sPC3[4][1];
130 __declspec(align(16)) VUINT32 _sPC2[4][1];
131 __declspec(align(16)) VUINT32 _sPC1[4][1];
132 __declspec(align(16)) VUINT32 _sPC0[4][1];
133} __svml_satan_data_internal;
134#endif
/*
 * Constant table for the atanf kernel.  Every entry is one value repeated
 * in four 32-bit lanes (16 bytes), so entry k starts at byte offset 16*k;
 * the _sXXX offset macros at the top of the file rely on this order.
 * The table is 16-byte aligned: the kernel uses several entries directly
 * as memory operands of addps/andnps, and non-VEX SSE memory operands
 * (other than for movups) must be 16-byte aligned.
 * The values are IEEE-754 single-precision bit patterns.  The polynomial
 * coefficients _sPC8 ... _sPC0 alternate in sign (bit 31), with _sPC0
 * positive.
 */
135__svml_satan_data_internal:
136 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK: sign bit only
137 .align 16
138 .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK: everything but the sign bit
139 .align 16
140 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE: 1.0f
141 .align 16
142 .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2: Pi/2 rounded to single precision
143 .align 16
144 .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8
145 .align 16
146 .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7
147 .align 16
148 .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6
149 .align 16
150 .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5
151 .align 16
152 .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4
153 .align 16
154 .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3
155 .align 16
156 .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2
157 .align 16
158 .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1: approximately -1/3
159 .align 16
160 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0: 1.0f (same bits as _sONE)
161 .align 16
/* Mark the symbol as a data object and record its size (label to here).  */
162 .type __svml_satan_data_internal, @object
163 .size __svml_satan_data_internal, .-__svml_satan_data_internal
164

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S