/* Function erff vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *   erf(x) is computed with a single polynomial in x^2, evaluated in
 *   double precision, with no lookup table:
 *
 *     R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12));
 *     erf(x) = R * R * x;
 *
 *   The polynomial result is used only while x^2 is below
 *   _gf_MaxThreshold_LA (or x is NaN); otherwise the result is 1.0
 *   with the sign of x.
 *
 *   Special cases:
 *
 *   erf(0)    = 0
 *   erf(+INF) = +1
 *   erf(-INF) = -1
 *   erf(QNaN) = QNaN
 *   erf(SNaN) = QNaN
 *
 */
37
/* Offsets for data table __svml_serf_data_internal.

   Each value is a byte offset from the start of the table.  Every
   entry occupies one 64-byte slot (one full zmm register), so the
   offsets advance in steps of 64 and must stay in the same order as
   the entries emitted in the table itself:

     _AbsMask, _One, _gf_MaxThreshold_LA   16 x 32-bit words each
     _gf_la_poly_0 .. _gf_la_poly_12       8 x 64-bit words each

   The macros are pasted textually in front of
   "+__svml_serf_data_internal(%rip)", so they are kept as plain
   integer literals.  */
#define _AbsMask			0
#define _One				64
#define _gf_MaxThreshold_LA		128
#define _gf_la_poly_0			192
#define _gf_la_poly_1			256
#define _gf_la_poly_2			320
#define _gf_la_poly_3			384
#define _gf_la_poly_4			448
#define _gf_la_poly_5			512
#define _gf_la_poly_6			576
#define _gf_la_poly_7			640
#define _gf_la_poly_8			704
#define _gf_la_poly_9			768
#define _gf_la_poly_10			832
#define _gf_la_poly_11			896
#define _gf_la_poly_12			960

#include <sysdep.h>
58
59 .section .text.evex512, "ax", @progbits
60ENTRY(_ZGVeN16v_erff_skx)
61 vmovaps %zmm0, %zmm8
62 vmulps {rn-sae}, %zmm8, %zmm8, %zmm11
63 vmovups _gf_la_poly_11+__svml_serf_data_internal(%rip), %zmm15
64 vmovups _gf_la_poly_12+__svml_serf_data_internal(%rip), %zmm10
65 vmovups _gf_la_poly_10+__svml_serf_data_internal(%rip), %zmm9
66 vmovups _gf_la_poly_9+__svml_serf_data_internal(%rip), %zmm7
67 vmovups _gf_la_poly_8+__svml_serf_data_internal(%rip), %zmm0
68 vmovups _gf_la_poly_7+__svml_serf_data_internal(%rip), %zmm1
69 vmovups _gf_la_poly_6+__svml_serf_data_internal(%rip), %zmm2
70 vmovups _gf_la_poly_5+__svml_serf_data_internal(%rip), %zmm3
71 vmovups _gf_la_poly_4+__svml_serf_data_internal(%rip), %zmm4
72 vmovups _gf_la_poly_3+__svml_serf_data_internal(%rip), %zmm5
73 vmovups _gf_la_poly_2+__svml_serf_data_internal(%rip), %zmm6
74 vextractf32x8 $1, %zmm8, %ymm13
75 vcvtps2pd {sae}, %ymm8, %zmm12
76 vcvtps2pd {sae}, %ymm13, %zmm14
77 vmulpd {rn-sae}, %zmm12, %zmm12, %zmm12
78 vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13
79
80 /* R = P0 + x^2*(P1 + x^2*(P2 + .... x^2*P12)); */
81 vmovaps %zmm15, %zmm14
82 vfmadd231pd {rn-sae}, %zmm12, %zmm10, %zmm14
83 vfmadd231pd {rn-sae}, %zmm13, %zmm10, %zmm15
84 vmovups _gf_la_poly_1+__svml_serf_data_internal(%rip), %zmm10
85 vfmadd213pd {rn-sae}, %zmm9, %zmm12, %zmm14
86 vfmadd231pd {rn-sae}, %zmm13, %zmm15, %zmm9
87 vfmadd213pd {rn-sae}, %zmm7, %zmm12, %zmm14
88 vfmadd231pd {rn-sae}, %zmm13, %zmm9, %zmm7
89 vfmadd213pd {rn-sae}, %zmm0, %zmm12, %zmm14
90 vfmadd231pd {rn-sae}, %zmm13, %zmm7, %zmm0
91 vmovups _gf_MaxThreshold_LA+__svml_serf_data_internal(%rip), %zmm7
92 vfmadd213pd {rn-sae}, %zmm1, %zmm12, %zmm14
93 vfmadd231pd {rn-sae}, %zmm13, %zmm0, %zmm1
94 vmovups _gf_la_poly_0+__svml_serf_data_internal(%rip), %zmm0
95 vcmpps $22, {sae}, %zmm11, %zmm7, %k1
96 vfmadd213pd {rn-sae}, %zmm2, %zmm12, %zmm14
97 vfmadd231pd {rn-sae}, %zmm13, %zmm1, %zmm2
98 vfmadd213pd {rn-sae}, %zmm3, %zmm12, %zmm14
99 vfmadd231pd {rn-sae}, %zmm13, %zmm2, %zmm3
100 vfmadd213pd {rn-sae}, %zmm4, %zmm12, %zmm14
101 vfmadd231pd {rn-sae}, %zmm13, %zmm3, %zmm4
102 vfmadd213pd {rn-sae}, %zmm5, %zmm12, %zmm14
103 vfmadd231pd {rn-sae}, %zmm13, %zmm4, %zmm5
104 vfmadd213pd {rn-sae}, %zmm6, %zmm12, %zmm14
105 vfmadd231pd {rn-sae}, %zmm13, %zmm5, %zmm6
106 vmovups _AbsMask+__svml_serf_data_internal(%rip), %zmm5
107 vfmadd213pd {rn-sae}, %zmm10, %zmm12, %zmm14
108 vfmadd231pd {rn-sae}, %zmm13, %zmm6, %zmm10
109 vandnps %zmm8, %zmm5, %zmm6
110 vfmadd213pd {rn-sae}, %zmm0, %zmm14, %zmm12
111 vfmadd213pd {rn-sae}, %zmm0, %zmm10, %zmm13
112 vorps _One+__svml_serf_data_internal(%rip), %zmm6, %zmm0
113 vmulpd {rn-sae}, %zmm12, %zmm12, %zmm1
114 vmulpd {rn-sae}, %zmm13, %zmm13, %zmm3
115 vcvtpd2ps {rn-sae}, %zmm1, %ymm2
116 vcvtpd2ps {rn-sae}, %zmm3, %ymm4
117 vinsertf32x8 $1, %ymm4, %zmm2, %zmm9
118
119 /* erf(x) = R * R * x; */
120 vmulps {rn-sae}, %zmm8, %zmm9, %zmm0{%k1}
121 ret
122
123END(_ZGVeN16v_erff_skx)
124
125 .section .rodata, "a"
126 .align 64
127
128#ifdef __svml_serf_data_internal_typedef
129typedef unsigned int VUINT32;
130typedef struct {
131 __declspec(align(64)) VUINT32 _AbsMask[16][1];
132 __declspec(align(64)) VUINT32 _One[16][1];
133 __declspec(align(64)) VUINT32 _gf_MaxThreshold_LA[16][1];
134 __declspec(align(64)) VUINT32 _gf_la_poly_0[8][2];
135 __declspec(align(64)) VUINT32 _gf_la_poly_1[8][2];
136 __declspec(align(64)) VUINT32 _gf_la_poly_2[8][2];
137 __declspec(align(64)) VUINT32 _gf_la_poly_3[8][2];
138 __declspec(align(64)) VUINT32 _gf_la_poly_4[8][2];
139 __declspec(align(64)) VUINT32 _gf_la_poly_5[8][2];
140 __declspec(align(64)) VUINT32 _gf_la_poly_6[8][2];
141 __declspec(align(64)) VUINT32 _gf_la_poly_7[8][2];
142 __declspec(align(64)) VUINT32 _gf_la_poly_8[8][2];
143 __declspec(align(64)) VUINT32 _gf_la_poly_9[8][2];
144 __declspec(align(64)) VUINT32 _gf_la_poly_10[8][2];
145 __declspec(align(64)) VUINT32 _gf_la_poly_11[8][2];
146 __declspec(align(64)) VUINT32 _gf_la_poly_12[8][2];
147} __svml_serf_data_internal;
148#endif
149__svml_serf_data_internal:
150 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _AbsMask */
151 .align 64
152 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 /* _One */
153 .align 64
154 .long 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a, 0x41558c5a /* _gf_MaxThreshold_LA */
155 .align 64
156 .quad 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903, 0x3ff0fefbd933b903 /* _gf_la_poly_0 */
157 .align 64
158 .quad 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367, 0xbfc6a948101e6367 /* _gf_la_poly_1 */
159 .align 64
160 .quad 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b, 0x3fa3a334ce602c6b /* _gf_la_poly_2 */
161 .align 64
162 .quad 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc, 0xbf799309ea0c81dc /* _gf_la_poly_3 */
163 .align 64
164 .quad 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392, 0x3f476df64a40e392 /* _gf_la_poly_4 */
165 .align 64
166 .quad 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede, 0xbf0a5216b9508ede /* _gf_la_poly_5 */
167 .align 64
168 .quad 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0, 0x3ea5794b95c8e8a0 /* _gf_la_poly_6 */
169 .align 64
170 .quad 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f, 0x3e94b6c0b485f30f /* _gf_la_poly_7 */
171 .align 64
172 .quad 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523, 0xbe65806ce17f0523 /* _gf_la_poly_8 */
173 .align 64
174 .quad 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47, 0x3e2715640470db47 /* _gf_la_poly_9 */
175 .align 64
176 .quad 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03, 0xbdddcb2653d80f03 /* _gf_la_poly_10 */
177 .align 64
178 .quad 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb, 0x3d85eadfc762d3eb /* _gf_la_poly_11 */
179 .align 64
180 .quad 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1, 0xbd1c668a2871f0f1 /* _gf_la_poly_12 */
181 .align 64
182 .type __svml_serf_data_internal, @object
183 .size __svml_serf_data_internal, .-__svml_serf_data_internal
184

/* Source: glibc/sysdeps/x86_64/fpu/multiarch/svml_s_erff16_core_avx512.S */