1 | /* Function erff vectorized with AVX-512. |
2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | https://www.gnu.org/licenses/. */ |
18 | |
19 | /* |
20 | * ALGORITHM DESCRIPTION: |
21 | * |
22 | * erf(x) is computed as higher precision simple polynomial |
23 | * with no lookup table: |
24 | * |
25 | * R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12)); |
26 | * erf(x) = R * R * x; |
27 | * |
28 | * Special cases: |
29 | * |
30 | * erf(0) = 0 |
31 | * erf(+INF) = +1 |
32 | * erf(-INF) = -1 |
33 | * erf(QNaN) = QNaN |
34 | * erf(SNaN) = QNaN |
35 | * |
36 | */ |
37 | |
/* Offsets for data table __svml_serf_data_internal.
   Every field below is one 64-byte zmm-register-wide vector, so
   consecutive offsets differ by 64.  These must stay in sync with the
   layout of __svml_serf_data_internal in the .rodata section.  */
#define _AbsMask 0
#define _One 64
#define _gf_MaxThreshold_LA 128
#define _gf_la_poly_0 192
#define _gf_la_poly_1 256
#define _gf_la_poly_2 320
#define _gf_la_poly_3 384
#define _gf_la_poly_4 448
#define _gf_la_poly_5 512
#define _gf_la_poly_6 576
#define _gf_la_poly_7 640
#define _gf_la_poly_8 704
#define _gf_la_poly_9 768
#define _gf_la_poly_10 832
#define _gf_la_poly_11 896
#define _gf_la_poly_12 960
56 | |
57 | #include <sysdep.h> |
58 | |
	.section .text.evex512, "ax" , @progbits
/*
   _ZGVeN16v_erff_skx — 16-lane single-precision erf(x), AVX-512.

   In:   zmm0 = x (16 floats)
   Out:  zmm0 = erf(x) (16 floats)

   Strategy (see algorithm comment above): widen x to double, evaluate
   R = P0 + x^2*(P1 + ... + x^2*P12) with two interleaved Horner chains
   (one per 8-lane half), then erf(x) = R*R*x narrowed back to float.
   Lanes with x^2 >= MaxThreshold keep the default copysign(1.0f, x);
   NaN lanes take the polynomial path so the NaN propagates.  */
ENTRY(_ZGVeN16v_erff_skx)
	/* Preserve the input; x itself is needed for the final R*R*x
	   multiply and for the copysign(1, x) default result.  */
	vmovaps %zmm0, %zmm8
	/* zmm11 = x^2 in single precision — used only for the
	   MaxThreshold lane-select compare below.  */
	vmulps {rn-sae}, %zmm8, %zmm8, %zmm11
	/* Load the double-precision polynomial coefficients
	   (P12 down to P2; P1/P0 are loaded later once registers free up).  */
	vmovups _gf_la_poly_11+__svml_serf_data_internal(%rip), %zmm15
	vmovups _gf_la_poly_12+__svml_serf_data_internal(%rip), %zmm10
	vmovups _gf_la_poly_10+__svml_serf_data_internal(%rip), %zmm9
	vmovups _gf_la_poly_9+__svml_serf_data_internal(%rip), %zmm7
	vmovups _gf_la_poly_8+__svml_serf_data_internal(%rip), %zmm0
	vmovups _gf_la_poly_7+__svml_serf_data_internal(%rip), %zmm1
	vmovups _gf_la_poly_6+__svml_serf_data_internal(%rip), %zmm2
	vmovups _gf_la_poly_5+__svml_serf_data_internal(%rip), %zmm3
	vmovups _gf_la_poly_4+__svml_serf_data_internal(%rip), %zmm4
	vmovups _gf_la_poly_3+__svml_serf_data_internal(%rip), %zmm5
	vmovups _gf_la_poly_2+__svml_serf_data_internal(%rip), %zmm6
	/* Split x into halves and widen each to double:
	   zmm12 = (double) x[0..7], zmm14 = (double) x[8..15].  */
	vextractf32x8 $1, %zmm8, %ymm13
	vcvtps2pd {sae}, %ymm8, %zmm12
	vcvtps2pd {sae}, %ymm13, %zmm14
	/* zmm12 = xlo^2, zmm13 = xhi^2 (double precision).  */
	vmulpd {rn-sae}, %zmm12, %zmm12, %zmm12
	vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13

/* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
   Two Horner chains run interleaved:
     low half  — accumulator zmm14, fmadd213 form (acc = acc*x^2 + Pk);
     high half — fmadd231 into the coefficient register itself
                 (Pk += xhi^2 * acc), so each coefficient register is
                 read by the low chain one instruction BEFORE the high
                 chain overwrites it.  Keep this ordering intact.  */
	vmovaps %zmm15, %zmm14			/* zmm14 = P11 (low-half acc) */
	vfmadd231pd {rn-sae}, %zmm12, %zmm10, %zmm14	/* lo: P11 + xlo^2*P12 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm10, %zmm15	/* hi: P11 + xhi^2*P12 */
	vmovups _gf_la_poly_1+__svml_serf_data_internal(%rip), %zmm10
	vfmadd213pd {rn-sae}, %zmm9, %zmm12, %zmm14	/* lo: acc*xlo^2 + P10 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm15, %zmm9	/* hi: P10 + xhi^2*acc */
	vfmadd213pd {rn-sae}, %zmm7, %zmm12, %zmm14	/* lo: acc*xlo^2 + P9 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm9, %zmm7	/* hi: P9 + xhi^2*acc */
	vfmadd213pd {rn-sae}, %zmm0, %zmm12, %zmm14	/* lo: acc*xlo^2 + P8 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm7, %zmm0	/* hi: P8 + xhi^2*acc */
	vmovups _gf_MaxThreshold_LA+__svml_serf_data_internal(%rip), %zmm7
	vfmadd213pd {rn-sae}, %zmm1, %zmm12, %zmm14	/* lo: acc*xlo^2 + P7 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm0, %zmm1	/* hi: P7 + xhi^2*acc */
	vmovups _gf_la_poly_0+__svml_serf_data_internal(%rip), %zmm0
	/* k1 := MaxThreshold NLE x^2 (predicate 22 = NLE_UQ): set where
	   x^2 < MaxThreshold, and also for NaN lanes (unordered), so the
	   polynomial result — which propagates the NaN — is selected.  */
	vcmpps $22, {sae}, %zmm11, %zmm7, %k1
	vfmadd213pd {rn-sae}, %zmm2, %zmm12, %zmm14	/* lo: acc*xlo^2 + P6 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm1, %zmm2	/* hi: P6 + xhi^2*acc */
	vfmadd213pd {rn-sae}, %zmm3, %zmm12, %zmm14	/* lo: acc*xlo^2 + P5 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm2, %zmm3	/* hi: P5 + xhi^2*acc */
	vfmadd213pd {rn-sae}, %zmm4, %zmm12, %zmm14	/* lo: acc*xlo^2 + P4 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm3, %zmm4	/* hi: P4 + xhi^2*acc */
	vfmadd213pd {rn-sae}, %zmm5, %zmm12, %zmm14	/* lo: acc*xlo^2 + P3 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm4, %zmm5	/* hi: P3 + xhi^2*acc */
	vfmadd213pd {rn-sae}, %zmm6, %zmm12, %zmm14	/* lo: acc*xlo^2 + P2 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm5, %zmm6	/* hi: P2 + xhi^2*acc */
	vmovups _AbsMask+__svml_serf_data_internal(%rip), %zmm5
	vfmadd213pd {rn-sae}, %zmm10, %zmm12, %zmm14	/* lo: acc*xlo^2 + P1 */
	vfmadd231pd {rn-sae}, %zmm13, %zmm6, %zmm10	/* hi: P1 + xhi^2*acc */
	/* zmm6 = ~AbsMask & x = the sign bits of x.  */
	vandnps %zmm8, %zmm5, %zmm6
	/* Finish both chains with P0: zmm12 = R_lo, zmm13 = R_hi.  */
	vfmadd213pd {rn-sae}, %zmm0, %zmm14, %zmm12
	vfmadd213pd {rn-sae}, %zmm0, %zmm10, %zmm13
	/* Default result zmm0 = copysign(1.0f, x), kept for lanes with
	   |x| past the threshold, where erf rounds to +-1.  */
	vorps _One+__svml_serf_data_internal(%rip), %zmm6, %zmm0
	/* R*R in double, narrowed back to single and re-merged.  */
	vmulpd {rn-sae}, %zmm12, %zmm12, %zmm1
	vmulpd {rn-sae}, %zmm13, %zmm13, %zmm3
	vcvtpd2ps {rn-sae}, %zmm1, %ymm2
	vcvtpd2ps {rn-sae}, %zmm3, %ymm4
	vinsertf32x8 $1, %ymm4, %zmm2, %zmm9

/* erf(x) = R * R * x; masked write: only the in-threshold (and NaN)
   lanes selected by k1 replace the +-1 default in zmm0.  */
	vmulps {rn-sae}, %zmm8, %zmm9, %zmm0{%k1}
	ret

END(_ZGVeN16v_erff_skx)
124 | |
	.section .rodata, "a"
	.align 64

/* Constant table for the AVX-512 erff kernel.  Layout must match the
   offset #defines at the top of this file: each field is a 64-byte
   vector (16 x .long or 8 x .quad), 64-byte aligned.  The _gf_la_poly_*
   fields are the double-precision polynomial coefficients P0..P12.  */
#ifdef __svml_serf_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 _AbsMask[16][1];
	__declspec(align(64)) VUINT32 _One[16][1];
	__declspec(align(64)) VUINT32 _gf_MaxThreshold_LA[16][1];
	__declspec(align(64)) VUINT32 _gf_la_poly_0[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_1[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_2[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_3[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_4[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_5[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_6[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_7[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_8[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_9[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_10[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_11[8][2];
	__declspec(align(64)) VUINT32 _gf_la_poly_12[8][2];
} __svml_serf_data_internal;
#endif
__svml_serf_data_internal:
	/* 0x7fffffff clears the sign bit of a float.  */
	.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _AbsMask */
	.align 64
	/* 0x3f800000 = 1.0f; ORed with the sign of x for the large-|x| result.  */
	.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _One */
	.align 64
	/* Single-precision cutoff compared against x^2; lanes at or past it
	   return copysign(1, x).  */
	.long 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a /* _gf_MaxThreshold_LA */
	.align 64
	.quad 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903 /* _gf_la_poly_0 */
	.align 64
	.quad 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367 /* _gf_la_poly_1 */
	.align 64
	.quad 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b /* _gf_la_poly_2 */
	.align 64
	.quad 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc /* _gf_la_poly_3 */
	.align 64
	.quad 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392 /* _gf_la_poly_4 */
	.align 64
	.quad 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede /* _gf_la_poly_5 */
	.align 64
	.quad 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0 /* _gf_la_poly_6 */
	.align 64
	.quad 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f /* _gf_la_poly_7 */
	.align 64
	.quad 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523 /* _gf_la_poly_8 */
	.align 64
	.quad 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47 /* _gf_la_poly_9 */
	.align 64
	.quad 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03 /* _gf_la_poly_10 */
	.align 64
	.quad 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb /* _gf_la_poly_11 */
	.align 64
	.quad 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1 /* _gf_la_poly_12 */
	.align 64
	.type __svml_serf_data_internal, @object
	.size __svml_serf_data_internal, .-__svml_serf_data_internal
184 | |