svml_s_hypotf16_core_avx512.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S]

/* Function hypotf vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *      HIGH LEVEL OVERVIEW
 *
 *      Calculate z = (x*x + y*y)
 *      Calculate reciprocal sqrt (z)
 *      Make two NR iterations
 *
 *      ALGORITHM DETAILS
 *
 *    Multiprecision branch for _HA_ only
 *      Remove sign from both arguments
 *      Find maximum (_x) and minimum (_y) (by abs value) between arguments
 *      Split _x into _a and _b for multiprecision
 *      If _x >> _y we will not split _y for multiprecision
 *      all _y will be put into lower part (_d) and higher part (_c = 0)
 *      Fixing _hilo_mask for the case _x >> _y
 *      Split _y into _c and _d for multiprecision with fixed mask
 *
 *      compute Hi and Lo parts of _z = _x*_x + _y*_y
 *
 *      _zHi = _a*_a + _c*_c
 *      _zLo = (_x + _a)*_b + _d*_y + _d*_c
 *      _z = _zHi + _zLo
 *
 *    No multiprecision branch for _LA_ and _EP_
 *      _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
 *
 *    Check _z exponent to be within borders [1E3 ; 60A] else goto Callout
 *
 *    Compute reciprocal sqrt s0 ~ 1.0/sqrt(_z),
 *      that multiplied by _z, is final result for _EP_ version.
 *
 *    First iteration (or zero iteration):
 *       s =  z * s0
 *       h = .5 * s0
 *       d =  s *  h - .5
 *
 *    Second iteration:
 *       h = d * h + h
 *       s = s * d + s
 *       d = s * s - z (in multiprecision for _HA_)
 *
 *    result = s - h * d
 *
 *    EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2)
 *    with all intermediate operations done in target precision for i=1, .., n.
 *    It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target
 *    precision (for some i). It can return result y[i]=NAN in case
 *    a[i]^2+b[i]^2 overflow in target precision, for some i. It can return
 *    result y[i]=NAN in case a[i] or b[i] is infinite, for some i.
 *
 *
 */
74
/* Offsets for data table __svml_shypot_data_internal.
 *
 * Byte offsets of the three rows of the constant table.  The code adds
 * them to the table symbol to form RIP-relative operands such as
 * _sHalf+__svml_shypot_data_internal(%rip).
 */
#define _sAbsMask			0	/* row 0: sign-clearing mask */
#define _sHalf				64	/* row 1: constant multiplied by the rsqrt estimate */
#define _iExpBound			128	/* row 2: bound used to flag special lanes */

#include <sysdep.h>
82
/*
 * _ZGVeN16vv_hypotf_skx -- 16-lane single-precision hypot.
 *
 * In:     zmm0 = x (16 floats), zmm1 = y (16 floats)
 * Out:    zmm0 = per-lane result
 * Uses:   zmm2-zmm9, k0, eax, edx, flags on the fast path.  The slow
 *         path additionally uses r12-r14 (saved and restored here) and
 *         calls hypotf through the PLT.
 * Stack:  256-byte frame, 64-byte aligned, addressed from %rsp:
 *           0(%rsp)    saved r14       (slow path only)
 *           8(%rsp)    saved r13       (slow path only)
 *           16(%rsp)   saved r12       (slow path only)
 *           64(%rsp)   copy of x       (slow path only)
 *           128(%rsp)  copy of y       (slow path only)
 *           192(%rsp)  result vector   (slow path only)
 *
 * Fast path: z = x*x + y*y, s0 = rsqrt14(z), s = z*s0, h = _sHalf*s0,
 * result = s + h*(z - s*s).  Lanes flagged in the range mask are
 * afterwards recomputed one at a time with scalar hypotf.
 */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16vv_hypotf_skx)
	/* Frame setup: rbp-based frame with a 64-byte aligned scratch area.  */
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-64, %rsp
	subq	$256, %rsp

	/* zmm2 = getexp(x), zmm3 = getexp(y).  */
	vgetexpps {sae}, %zmm0, %zmm2
	vgetexpps {sae}, %zmm1, %zmm3
	vmovups	_sHalf+__svml_shypot_data_internal(%rip), %zmm6

	/* zmm4 = max of the two exponents.  */
	vmaxps	{sae}, %zmm3, %zmm2, %zmm4

	/* zmm2 = z = x*x + y*y.  */
	vmulps	{rn-sae}, %zmm0, %zmm0, %zmm2

	/* zmm5 = max exponent with the sign bit cleared.  */
	vandps	_sAbsMask+__svml_shypot_data_internal(%rip), %zmm4, %zmm5
	vfmadd231ps {rn-sae}, %zmm1, %zmm1, %zmm2

	/* k0 = lanes where zmm5 is not less than _iExpBound (signed dword
	   compare, predicate 5); those lanes take the scalar fallback.  */
	vpcmpd	$5, _iExpBound+__svml_shypot_data_internal(%rip), %zmm5, %k0

	/* zmm7 = s0 ~ 1/sqrt(z).  */
	vrsqrt14ps %zmm2, %zmm7
	kmovw	%k0, %edx

	/* zmm9 = s = z * s0, zmm8 = h = _sHalf * s0.  */
	vmulps	{rn-sae}, %zmm7, %zmm2, %zmm9
	vmulps	{rn-sae}, %zmm7, %zmm6, %zmm8

	/* zmm2 = z - s*s, then zmm2 = h * (z - s*s) + s.  */
	vfnmadd231ps {rn-sae}, %zmm9, %zmm9, %zmm2
	vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm2

	/*
	 * VSCALEF( S, _VRES1, _VRES1, sExp );
	 * The end of implementation
	 */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	vmovaps	%zmm2, %zmm0
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the rbp-based frame description for the code that
	   follows the ret.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill both inputs and the vector result so that single lanes
	   can be read and patched through memory.  */
	vmovups	%zmm0, 64(%rsp)
	vmovups	%zmm1, 128(%rsp)
	vmovups	%zmm2, 192(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm2

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
	/* r12d = lane index, starting at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
	/* r13d = range mask, one bit per lane.  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	/* CF = bit r12d of the range mask.  */
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$16, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All 16 lanes visited: restore callee-saved registers and reload
	   the patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	vmovups	192(%rsp), %zmm2

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm2

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* r14 = lane index (zero-extended); it indexes the spilled inputs
	   before the call and the result slot after it.  */
	movl	%r12d, %r14d
	vmovss	64(%rsp, %r14, 4), %xmm0
	vmovss	128(%rsp, %r14, 4), %xmm1
	call	hypotf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Overwrite this lane of the spilled result with the scalar value.  */
	vmovss	%xmm0, 192(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN16vv_hypotf_skx)
216
/*
 * Constant table for the vector hypotf kernel.
 *
 * Three rows of sixteen 32-bit words, each row starting on a 64-byte
 * boundary so that it can be used as a full 512-bit memory operand:
 *   _sAbsMask   0x7fffffff  all bits except the sign bit
 *   _sHalf      0x3f000000  bit pattern of 0.5f
 *   _iExpBound  0x427C0000  bit pattern of 63.0f; the kernel compares it
 *                           as a signed integer against the masked
 *                           maximum exponent to select special lanes
 */
	.section .rodata, "a"
	.align	64

#ifdef __svml_shypot_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 _sAbsMask[16][1];
	__declspec(align(64)) VUINT32 _sHalf[16][1];
	__declspec(align(64)) VUINT32 _iExpBound[16][1];
} __svml_shypot_data_internal;
#endif
__svml_shypot_data_internal:
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask */
	.align	64
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf */
	/* fma based algorithm */
	.align	64
	.long	0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000 /* _iExpBound */
	.align	64
	.type	__svml_shypot_data_internal, @object
	.size	__svml_shypot_data_internal, .-__svml_shypot_data_internal
238

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S