/* Function asinf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    SelMask = (|x| >= 0.5) ? 1 : 0;
 *    R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
 *    asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R))*(-1)^sign(x)
 *
 */
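
/* A rough scalar C model of the vector path below (illustrative only;
   the hex-float constants are the single-precision values stored in
   __svml_sasin_data_internal, and lanes with |x| > 1 are instead sent
   to the scalar fallback at the end of the function):

     #include <math.h>

     static float
     asinf_sketch (float x)
     {
       float ax = fabsf (x);                    // AbsMask
       int sel = !(ax < 0.5f);                  // SelMask
       float y = 0.5f - 0.5f * ax;              // Y
       float t = sel ? y : ax * ax;
       float r = sel ? -2.0f * sqrtf (y) : ax;  // R (SQ, see below)
       float p = (((0x1.5db80ep-5f * t         // poly_coeff_1..5:
                    + 0x1.8654d6p-6f) * t      // a degree-4 minimax
                   + 0x1.753568p-5f) * t       // fit; the trailing
                  + 0x1.32f824p-4f) * t        // term is ~1/6, the
                 + 0x1.5555fep-3f;             // x^3/6 Taylor term
       float res = r + r * t * p;
       if (sel)
         res += 0x1.921fb6p+0f;                // Pi2H ~ pi/2
       return copysignf (res, x);              // (-1)^sign(x)
     }
*/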

/* Offsets for data table __svml_sasin_data_internal.  Each field is a
   64-byte row holding sixteen copies of one 32-bit constant, i.e. a
   full zmm register of broadcast lanes.  */
#define AbsMask			0
#define OneHalf			64
#define SmallNorm		128
#define One			192
#define Two			256
#define sqrt_coeff_1		320
#define sqrt_coeff_2		384
#define poly_coeff_1		448
#define poly_coeff_2		512
#define poly_coeff_3		576
#define poly_coeff_4		640
#define poly_coeff_5		704
#define Pi2H			768

#include <sysdep.h>

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_asinf_skx)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	__svml_sasin_data_internal(%rip), %zmm4
	vmovups	OneHalf+__svml_sasin_data_internal(%rip), %zmm6

	/* SQ ~ -2*sqrt(Y) */
	vmovups	SmallNorm+__svml_sasin_data_internal(%rip), %zmm8
	vmovups	Two+__svml_sasin_data_internal(%rip), %zmm12
	vmovups	sqrt_coeff_1+__svml_sasin_data_internal(%rip), %zmm13
	vmovups	One+__svml_sasin_data_internal(%rip), %zmm7
	vmovaps	%zmm0, %zmm3
	/* x = |arg|; the vandnps leaves just the sign bit of arg in zmm1.  */
	vandps	%zmm3, %zmm4, %zmm2
	vandnps	%zmm3, %zmm4, %zmm1

	/* x^2 */
	vmulps	{rn-sae}, %zmm2, %zmm2, %zmm5
	vcmpps	$17, {sae}, %zmm2, %zmm7, %k0
	vcmpps	$21, {sae}, %zmm6, %zmm2, %k2
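
	/* k0: lanes with 1.0 < |x| (predicate 17 = LT_OQ), i.e. inputs
	   outside asin's domain; k2: lanes with |x| >= 0.5 or NaN
	   (predicate 21 = NLT_UQ), the SelMask of the sqrt path.  */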
	vmovups	poly_coeff_2+__svml_sasin_data_internal(%rip), %zmm7
	kmovw	%k0, %edx

	/* Y = 0.5 - 0.5*x */
	vmovaps	%zmm6, %zmm9
	vfnmadd231ps {rn-sae}, %zmm2, %zmm6, %zmm9
	vmovups	poly_coeff_5+__svml_sasin_data_internal(%rip), %zmm6
	vrsqrt14ps %zmm9, %zmm10
	vcmpps	$17, {sae}, %zmm8, %zmm9, %k1
	vminps	{sae}, %zmm9, %zmm5, %zmm0
	vmovups	sqrt_coeff_2+__svml_sasin_data_internal(%rip), %zmm8
	vmovups	poly_coeff_4+__svml_sasin_data_internal(%rip), %zmm5
	vxorps	%zmm10, %zmm10, %zmm10{%k1}
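
	/* k1: lanes with Y < SmallNorm (2^-32); zeroing the rsqrt14
	   estimate there makes SQ collapse to 0, so those lanes end up
	   returning Pi2H ~ pi/2.  */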
	vaddps	{rn-sae}, %zmm9, %zmm9, %zmm14
	vmulps	{rn-sae}, %zmm10, %zmm10, %zmm11
	vmulps	{rn-sae}, %zmm10, %zmm14, %zmm4
	vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
	vmulps	{rn-sae}, %zmm14, %zmm4, %zmm15
	vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm8
	vmovups	poly_coeff_3+__svml_sasin_data_internal(%rip), %zmm14

	/* polynomial */
	vmovups	poly_coeff_1+__svml_sasin_data_internal(%rip), %zmm13
	vfmsub213ps {rn-sae}, %zmm4, %zmm15, %zmm8
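
	/* zmm8 now holds SQ.  A scalar sketch of its computation above
	   (illustrative only):
	     r  = rsqrt14 (y);          // ~2^-14 relative-error estimate
	     sq = (2*y) * r;            // ~ 2*sqrt(y)
	     e  = (2*y) * r*r - 2;      // residual of the estimate
	     SQ = (c1*e + c2)*(e*sq) - sq;
	   which expands to -2*sqrt(y)*(1 - e/4 + 3*e^2/32), a
	   second-order correction, with c1 = -3/32 and c2 = 1/4 stored
	   at sqrt_coeff_1 and sqrt_coeff_2.  */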
	vfmadd231ps {rn-sae}, %zmm0, %zmm14, %zmm5
	vfmadd231ps {rn-sae}, %zmm0, %zmm13, %zmm7
	vmulps	{rn-sae}, %zmm0, %zmm0, %zmm15
	vblendmps %zmm8, %zmm2, %zmm2{%k2}
	vfmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm7
	vfmadd213ps {rn-sae}, %zmm6, %zmm0, %zmm7
	vmulps	{rn-sae}, %zmm0, %zmm7, %zmm9
	vmovups	Pi2H+__svml_sasin_data_internal(%rip), %zmm0
	vfmadd213ps {rn-sae}, %zmm2, %zmm2, %zmm9
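
	/* zmm9 = R + R*t*P(t), with t = min(x^2, Y) and R the blend of
	   |x| (small lanes) and SQ (sqrt-path lanes): ~asin(|x|) on the
	   former, ~-2*asin(sqrt(Y)) on the latter.  */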
	vaddps	{rn-sae}, %zmm0, %zmm9, %zmm9{%k2}
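
	/* Sqrt-path lanes get Pi2H added: pi/2 - 2*asin(sqrt(Y)).  The
	   xor below restores the argument's sign from zmm1.  */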
	vxorps	%zmm1, %zmm9, %zmm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

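/* In C terms, the lane fix-up below behaves roughly like
   (illustrative sketch only):

     float in[16], out[16];        // spilled to 64(%rsp) / 128(%rsp)
     for (int i = 0; i < 16; i++)  // index kept in %r12d
       if (mask & (1 << i))        // mask = %edx from the |x| > 1 test
         out[i] = asinf (in[i]);   // scalar call repairs the lane
*/
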
L(SPECIAL_VALUES_BRANCH):
	vmovups	%zmm3, 64(%rsp)
	vmovups	%zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

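	/* Clear the upper AVX state before calling into the scalar
	   libm, avoiding SSE/AVX transition penalties.  */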
	vzeroupper
	movq	%r12, 16(%rsp)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	128(%rsp), %zmm0

	/* Go to exit */
	jmp	L(EXIT)
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
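	/* %r12d holds the index of the offending lane; fetch that lane
	   of the saved input and call the scalar asinf on it.  */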
	movl	%r12d, %r14d
	vmovss	64(%rsp, %r14, 4), %xmm0
	call	asinf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 128(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16v_asinf_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_sasin_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 AbsMask[16][1];
	__declspec(align(64)) VUINT32 OneHalf[16][1];
	__declspec(align(64)) VUINT32 SmallNorm[16][1];
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 Two[16][1];
	__declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
	__declspec(align(64)) VUINT32 poly_coeff[5][16][1];
	__declspec(align(64)) VUINT32 Pi2H[16][1];
} __svml_sasin_data_internal;
#endif
__svml_sasin_data_internal:
	/* AbsMask (clears the sign bit) */
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
	/* OneHalf */
	.align	64
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
	/* SmallNorm (2^-32) */
	.align	64
	.long	0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
	/* One */
	.align	64
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* Two */
	.align	64
	.long	0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
	/* sqrt_coeff[2], stored highest-degree term first */
	.align	64
	.long	0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
	.long	0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
	/* poly_coeff[5], stored highest-degree term first */
	.align	64
	.long	0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
	.long	0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
	.long	0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
	.long	0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
	.long	0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 (~1/6) */
	/* Pi2H (pi/2 rounded to single precision) */
	.align	64
	.long	0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
	.align	64
	.type	__svml_sasin_data_internal, @object
	.size	__svml_sasin_data_internal, .-__svml_sasin_data_internal