svml_s_hypotf4_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core_sse4.S]

/* Function hypotf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *      HIGH LEVEL OVERVIEW
 *
 *      Calculate z = (x*x + y*y)
 *      Calculate reciprocal sqrt (z)
 *      Make two Newton-Raphson (NR) iterations
 *
 *      ALGORITHM DETAILS
 *
 *    Multiprecision branch for _HA_ only
 *      Remove sign from both arguments
 *      Find maximum (_x) and minimum (_y) (by abs value) between arguments
 *      Split _x into _a and _b for multiprecision
 *      If _x >> _y we will not split _y for multiprecision:
 *      all of _y will be put into the lower part (_d) and the higher
 *      part is zero (_c = 0)
 *      Fixing _hilo_mask for the case _x >> _y
 *      Split _y into _c and _d for multiprecision with fixed mask
 *
 *      Compute Hi and Lo parts of _z = _x*_x + _y*_y
 *
 *      _zHi = _a*_a + _c*_c
 *      _zLo = (_x + _a)*_b + _d*_y + _d*_c
 *      _z = _zHi + _zLo
 *
 *    No multiprecision branch for _LA_ and _EP_
 *      _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
 *
 *    Check _z exponent to be within borders [1E3 ; 60A] else goto Callout
 *
 *    Compute reciprocal sqrt s0 ~ 1.0/sqrt(_z),
 *      that multiplied by _z, is final result for _EP_ version.
 *
 *    First iteration (or zero iteration):
 *       s =  z * s0
 *       h = .5 * s0
 *       d = .5 - s * h
 *
 *    Second iteration:
 *       h = d * h + h
 *       s = s * d + s
 *       d = s * s - z (in multiprecision for _HA_)
 *
 *    result = s - h * d
 *
 *    EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2)
 *    with all intermediate operations done in target precision for i=1, .., n.
 *    It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target
 *    precision (for some i). It can return result y[i]=NAN in case
 *    a[i]^2+b[i]^2 overflow in target precision, for some i. It can return
 *    result y[i]=NAN in case a[i] or b[i] is infinite, for some i.
 *
 *
 */
74
/* Byte offsets of the rows of the __svml_shypot_data_internal table.
   Every row is four 32-bit words (16 bytes), so consecutive rows are
   16 bytes apart.  The offsets are added to the table symbol to form
   RIP-relative operands, e.g.
   _sHalf+__svml_shypot_data_internal(%rip).  */
#define _sHiLoMask	0
#define _sAbsMask	16
#define _sHalf		32
#define _LowBoundary	48
#define _HighBoundary	64

#include <sysdep.h>
84
/*
 * _ZGVbN4vv_hypotf_sse4 -- hypotf on four packed single-precision lanes.
 *
 * In:   %xmm0 = x[0..3], %xmm1 = y[0..3]
 * Out:  %xmm0 = result[0..3]
 *
 * Fast path: z = x*x + y*y, s0 = rsqrtps (z), then two refinement steps
 * (see the step comments below).  Lanes whose z bit pattern, compared as
 * a signed 32-bit integer, is below _LowBoundary or above _HighBoundary
 * are recomputed one at a time by calling scalar hypotf.
 *
 * Stack frame (88 bytes; with the return address the CFA offset is 96):
 *    0(%rsp)  saved %r14
 *    8(%rsp)  saved %r13
 *   16(%rsp)  saved %r12
 *   32(%rsp)  copy of x      (4 floats)
 *   48(%rsp)  copy of y      (4 floats)
 *   64(%rsp)  result vector  (4 floats, patched lane by lane)
 * The frame is allocated unconditionally but only written on the
 * special-values path.
 */
	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4vv_hypotf_sse4)
	subq	$88, %rsp
	cfi_def_cfa_offset(96)

	/*
	 * Implementation
	 * Multiprecision branch for _HA_ only
	 * No multiprecision branch for _LA_
	 * _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
	 */
	movaps	%xmm0, %xmm8
	movaps	%xmm1, %xmm2
	mulps	%xmm0, %xmm8			/* xmm8 = x*x */
	mulps	%xmm1, %xmm2			/* xmm2 = y*y */

	/* Load the constant 0.5 and finish z = x*x + y*y.  */
	movups	_sHalf+__svml_shypot_data_internal(%rip), %xmm5
	addps	%xmm2, %xmm8			/* xmm8 = z */

	/* _s0 ~ 1.0/sqrt(_z) */
	rsqrtps	%xmm8, %xmm10			/* xmm10 = s0 */

	/* First iteration: s = z*s0, h = .5*s0, d = .5 - s*h.  */
	movaps	%xmm10, %xmm2
	movaps	%xmm8, %xmm3			/* xmm3 = z, kept for the upper bound check */
	mulps	%xmm8, %xmm2			/* xmm2 = s */
	mulps	%xmm5, %xmm10			/* xmm10 = h */
	movaps	%xmm2, %xmm6
	mulps	%xmm10, %xmm6			/* xmm6 = s*h */

	/* Check _z exponent to be within borders [1E3 ; 60A] else goto Callout */
	movdqu	_LowBoundary+__svml_shypot_data_internal(%rip), %xmm4
	subps	%xmm6, %xmm5			/* xmm5 = d = .5 - s*h */

	/* Second iteration: h = d*h + h, s = s*d + s.  */
	movaps	%xmm5, %xmm7
	pcmpgtd	%xmm8, %xmm4			/* xmm4 = (_LowBoundary > z), signed dwords */
	mulps	%xmm2, %xmm5			/* xmm5 = d*s */
	mulps	%xmm10, %xmm7			/* xmm7 = d*h */
	addps	%xmm5, %xmm2			/* xmm2 = s */
	addps	%xmm7, %xmm10			/* xmm10 = h */

	/* Finish second iteration in native precision for _LA_:
	   result = s - h * (s*s - z).  */
	movaps	%xmm2, %xmm9
	mulps	%xmm2, %xmm9			/* xmm9 = s*s */
	pcmpgtd	_HighBoundary+__svml_shypot_data_internal(%rip), %xmm3	/* xmm3 = (z > _HighBoundary) */
	subps	%xmm8, %xmm9			/* xmm9 = s*s - z */
	mulps	%xmm9, %xmm10			/* xmm10 = h * (s*s - z) */
	por	%xmm3, %xmm4			/* xmm4 = per-lane out-of-range mask */
	movmskps %xmm4, %edx			/* edx bit i set <=> lane i is out of range */
	subps	%xmm10, %xmm2			/* xmm2 = result */

	/* The end of implementation */
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1 xmm2

	/* Restore registers and exit the function: the result is moved
	   from xmm2 to xmm0 and the frame is released.  */

L(EXIT):
	movaps	%xmm2, %xmm0
	addq	$88, %rsp
	cfi_def_cfa_offset(8)
	ret
	/* Code after the ret still runs with the 88-byte frame in place.  */
	cfi_def_cfa_offset(96)

	/* Branch to process special inputs: spill both arguments and the
	   fast-path result so that single lanes can be read and patched.  */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm0, 32(%rsp)
	movups	%xmm1, 48(%rsp)
	movups	%xmm2, 64(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 edx

	/* Save the callee-saved registers used by the loop:
	   r12d = lane index (starts at 0), r13d = out-of-range lane mask,
	   r14 = lane index used for addressing around the scalar call.  */
	xorl	%eax, %eax
	movq	%r12, 16(%rsp)
	cfi_offset(12, -80)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -88)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -96)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask bits check: CF = bit r12d of the lane mask.  */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs processing loop: advance to the next of the
	   four lanes.  */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$4, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	/* All lanes done: restore the saved registers and reload the
	   patched result vector.  */
	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	64(%rsp), %xmm2

	/* Go to exit */
	jmp	L(EXIT)
	/* The code below is reached only from inside the loop, where
	   r12-r14 are still saved on the stack.  */
	cfi_offset(12, -80)
	cfi_offset(13, -88)
	cfi_offset(14, -96)
	# LOE rbx rbp r12 r13 r14 r15 xmm2

	/* Scalar math function call to process special input: recompute
	   lane r12d with hypotf and overwrite that lane of the result.  */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movss	32(%rsp, %r14, 4), %xmm0	/* x[lane] */
	movss	48(%rsp, %r14, 4), %xmm1	/* y[lane] */
	call	hypotf@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movss	%xmm0, 64(%rsp, %r14, 4)	/* result[lane] */

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN4vv_hypotf_sse4)
236
/* Constant table for the vector hypotf code.  Every row holds one
   32-bit value replicated four times and starts on a 16-byte boundary,
   which gives the row offsets 0, 16, 32, 48 and 64.  */
	.section .rodata, "a"
	.align	16

#ifdef __svml_shypot_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _sHiLoMask[4][1];
	__declspec(align(16)) VUINT32 _sAbsMask[4][1];
	__declspec(align(16)) VUINT32 _sHalf[4][1];
	__declspec(align(16)) VUINT32 _LowBoundary[4][1];
	__declspec(align(16)) VUINT32 _HighBoundary[4][1];
} __svml_shypot_data_internal;
#endif
__svml_shypot_data_internal:
	/* legacy algorithm */
	.long	0xFFF80000, 0xFFF80000, 0xFFF80000, 0xFFF80000	/* _sHiLoMask */
	.align	16
	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff	/* _sAbsMask: all bits except the sign bit */
	.align	16
	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000	/* _sHalf: 0.5f */
	.align	16
	.long	0x1E300000, 0x1E300000, 0x1E300000, 0x1E300000	/* _LowBoundary: lower bound for the bits of z */
	.align	16
	.long	0x60A00000, 0x60A00000, 0x60A00000, 0x60A00000	/* _HighBoundary: upper bound for the bits of z */
	.align	16
	.type	__svml_shypot_data_internal, @object
	.size	__svml_shypot_data_internal, .-__svml_shypot_data_internal
264

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf4_core_sse4.S