svml_d_hypot2_core_sse4.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core_sse4.S]

/* Function hypot vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *      HIGH LEVEL OVERVIEW
 *
 *      Calculate z = (x*x + y*y)
 *      Calculate reciprocal sqrt (z)
 *      Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
 *      Calculate fixing part p with a polynomial
 *      Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
 *
 *      ALGORITHM DETAILS
 *
 *      Multiprecision branch for _HA_ only
 *        Remove sign from both arguments
 *        Find maximum (_x) and minimum (_y) (by abs value) between arguments
 *        Split _x into _a and _b for multiprecision
 *        If _x >> _y we will not split _y for multiprecision:
 *           all of _y will be put into the lower part (_d) and the
 *           higher part is zero (_c = 0)
 *        Fixing _hilo_mask for the case _x >> _y
 *        Split _y into _c and _d for multiprecision with fixed mask
 *
 *        compute Hi and Lo parts of _z = _x*_x + _y*_y
 *
 *        _zHi = _a*_a + _c*_c
 *        _zLo = (_x + _a)*_b + _d*_y + _d*_c
 *        _z = _zHi + _zLo
 *
 *      No multiprecision branch for _LA_ and _EP_
 *        _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
 *
 *      Check _z exponent to be within borders [3BC ; 441] else goto Callout
 *
 *      _s  ~ 1.0/sqrt(_z)
 *      _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
 *      _e[rror]  =  (1.0/_z + O) * _z - 1.0
 *      calculate fixing part _p
 *      _p = (((_POLY_C5*_e + _POLY_C4)*_e + _POLY_C3)*_e + _POLY_C2)*_e + _POLY_C1
 *      some terms of the polynomial are skipped for the lower-accuracy flavors
 *
 *      result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
 *
 *
 */
63
/*
 * Byte offsets of the fields of the data table
 * __svml_dhypot_data_internal.
 *
 * Consecutive fields are 16 bytes apart, i.e. each field is one
 * XMM-sized slot.  The offsets are added to the table symbol to form
 * RIP-relative memory operands, for example
 *
 *	subpd	_dOne+__svml_dhypot_data_internal(%rip), %xmm2
 */
#define _dHiLoMask			0
#define _dAbsMask			16
#define _dOne				32
#define _POLY_C5			48
#define _POLY_C4			64
#define _POLY_C3			80
#define _POLY_C2			96
#define _POLY_C1			112
#define _LowBoundary			128
#define _HighBoundary			144
76
77	#include <sysdep.h>
78
79	.section .text.sse4, "ax", @progbits
80	ENTRY(_ZGVbN2vv_hypot_sse4)
81	subq $`88`, %rsp
82	cfi_def_cfa_offset(`96`)
83
84	/*
85	* Defines
86	* Implementation
87	* Multiprecision branch for _HA_ only
88	* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
89	*/
90	movaps %xmm0, %xmm10
91	movaps %xmm1, %xmm2
92	mulpd %xmm0, %xmm10
93	mulpd %xmm1, %xmm2
94	addpd %xmm2, %xmm10
95
96	/*
97	* _s ~ 1.0/sqrt(_z)
98	* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z
99	*/
100	cvtpd2ps %xmm10, %xmm7
101	movlhps %xmm7, %xmm7
102	rsqrtps %xmm7, %xmm8
103	cvtps2pd %xmm8, %xmm11
104	movaps %xmm11, %xmm2
105	mulpd %xmm11, %xmm2
106
107	/ _e[rror] ~ (1.0/_z + O) * _z - 1.0 /
108	mulpd %xmm10, %xmm2
109	subpd _dOne+__svml_dhypot_data_internal(%rip), %xmm2
110
111	/*
112	* calculate fixing part _p
113	* _p = (((_POLY_C5_e + _POLY_C4)_e +_POLY_C3)_e +_POLY_C2)_e + _POLY_C1
114	* some parts of polynom are skipped for lower flav
115	*/
116	movups _POLY_C4+__svml_dhypot_data_internal(%rip), %xmm9
117	mulpd %xmm2, %xmm9
118	addpd _POLY_C3+__svml_dhypot_data_internal(%rip), %xmm9
119	mulpd %xmm2, %xmm9
120	addpd _POLY_C2+__svml_dhypot_data_internal(%rip), %xmm9
121	mulpd %xmm2, %xmm9
122	addpd _POLY_C1+__svml_dhypot_data_internal(%rip), %xmm9
123
124	/ result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z /
125	mulpd %xmm9, %xmm2
126	mulpd %xmm11, %xmm2
127	mulpd %xmm10, %xmm11
128	mulpd %xmm10, %xmm2
129
130	/ Check _z exponent to be within borders [3BC ; 441] else goto Callout /
131	movq _LowBoundary+__svml_dhypot_data_internal(%rip), %xmm5
132	movq _HighBoundary+__svml_dhypot_data_internal(%rip), %xmm3
133	pshufd $`221`, %xmm10, %xmm4
134	pcmpgtd %xmm4, %xmm5
135	pcmpgtd %xmm3, %xmm4
136	por %xmm4, %xmm5
137	pshufd $`80`, %xmm5, %xmm6
138	movmskpd %xmm6, %edx
139	addpd %xmm11, %xmm2
140
141	/ The end of implementation /
142	testl %edx, %edx
143
144	/ Go to special inputs processing branch /
145	jne L(SPECIAL_VALUES_BRANCH)
146	# LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1 xmm2
147
148	/ Restore registers*
149	* and exit the function
150	*/
151
152	L(EXIT):
153	movaps %xmm2, %xmm0
154	addq $`88`, %rsp
155	cfi_def_cfa_offset(`8`)
156	ret
157	cfi_def_cfa_offset(`96`)
158
159	/ Branch to process*
160	* special inputs
161	*/
162
163	L(SPECIAL_VALUES_BRANCH):
164	movups %xmm0, `32`(%rsp)
165	movups %xmm1, `48`(%rsp)
166	movups %xmm2, `64`(%rsp)
167	# LOE rbx rbp r12 r13 r14 r15 edx
168
169	xorl %eax, %eax
170	movq %r12, `16`(%rsp)
171	cfi_offset(`12`, -`80`)
172	movl %eax, %r12d
173	movq %r13, `8`(%rsp)
174	cfi_offset(`13`, -`88`)
175	movl %edx, %r13d
176	movq %r14, (%rsp)
177	cfi_offset(`14`, -`96`)
178	# LOE rbx rbp r15 r12d r13d
179
180	/ Range mask*
181	* bits check
182	*/
183
184	L(RANGEMASK_CHECK):
185	btl %r12d, %r13d
186
187	/ Call scalar math function /
188	jc L(SCALAR_MATH_CALL)
189	# LOE rbx rbp r15 r12d r13d
190
191	/ Special inputs*
192	* processing loop
193	*/
194
195	L(SPECIAL_VALUES_LOOP):
196	incl %r12d
197	cmpl $`2`, %r12d
198
199	/ Check bits in range mask /
200	jl L(RANGEMASK_CHECK)
201	# LOE rbx rbp r15 r12d r13d
202
203	movq `16`(%rsp), %r12
204	cfi_restore(`12`)
205	movq `8`(%rsp), %r13
206	cfi_restore(`13`)
207	movq (%rsp), %r14
208	cfi_restore(`14`)
209	movups `64`(%rsp), %xmm2
210
211	/ Go to exit /
212	jmp L(EXIT)
213	cfi_offset(`12`, -`80`)
214	cfi_offset(`13`, -`88`)
215	cfi_offset(`14`, -`96`)
216	# LOE rbx rbp r12 r13 r14 r15 xmm2
217
218	/ Scalar math function call*
219	* to process special input
220	*/
221
222	L(SCALAR_MATH_CALL):
223	movl %r12d, %r14d
224	movsd `32`(%rsp, %r14, `8`), %xmm0
225	movsd `48`(%rsp, %r14, `8`), %xmm1
226	call hypot@PLT
227	# LOE rbx rbp r14 r15 r12d r13d xmm0
228
229	movsd %xmm0, `64`(%rsp, %r14, `8`)
230
231	/ Process special inputs in loop /
232	jmp L(SPECIAL_VALUES_LOOP)
233	# LOE rbx rbp r15 r12d r13d
234	END(_ZGVbN2vv_hypot_sse4)
235
236	.section .rodata, "a"
237	.align `16`
238
239	#ifdef __svml_dhypot_data_internal_typedef
240	typedef unsigned int VUINT32;
241	typedef struct {
242	__declspec(align(`16`)) VUINT32 _dHiLoMask[`2`][`2`];
243	__declspec(align(`16`)) VUINT32 _dAbsMask[`2`][`2`];
244	__declspec(align(`16`)) VUINT32 _dOne[`2`][`2`];
245	__declspec(align(`16`)) VUINT32 _POLY_C5[`2`][`2`];
246	__declspec(align(`16`)) VUINT32 _POLY_C4[`2`][`2`];
247	__declspec(align(`16`)) VUINT32 _POLY_C3[`2`][`2`];
248	__declspec(align(`16`)) VUINT32 _POLY_C2[`2`][`2`];
249	__declspec(align(`16`)) VUINT32 _POLY_C1[`2`][`2`];
250	__declspec(align(`16`)) VUINT32 _LowBoundary[`4`][`1`];
251	__declspec(align(`16`)) VUINT32 _HighBoundary[`4`][`1`];
252	} __svml_dhypot_data_internal;
253	#endif
254	__svml_dhypot_data_internal:
255	/ legacy algorithm /
256	.quad `0xffffc00000000000`, `0xffffc00000000000` / _dHiLoMask /
257	.align `16`
258	.quad `0x7fffffffffffffff`, `0x7fffffffffffffff` / _dAbsMask /
259	.align `16`
260	.quad `0x3FF0000000000000`, `0x3FF0000000000000` / _dOne /
261	.align `16`
262	.quad `0xBFCF800000000000`, `0xBFCF800000000000` / _POLY_C5 /
263	.align `16`
264	.quad `0x3FD1800000000000`, `0x3FD1800000000000` / _POLY_C4 /
265	.align `16`
266	.quad `0xBFD4000000000000`, `0xBFD4000000000000` / _POLY_C3 /
267	.align `16`
268	.quad `0x3FD8000000000000`, `0x3FD8000000000000` / _POLY_C2 /
269	.align `16`
270	.quad `0xBFE0000000000000`, `0xBFE0000000000000` / _POLY_C1 /
271	.align `16`
272	.long `0x3BC00000`, `0x3BC00000`, `0x3BC00000`, `0x3BC00000` / _LowBoundary /
273	.align `16`
274	.long `0x44100000`, `0x44100000`, `0x44100000`, `0x44100000` / _HighBoundary /
275	.align `16`
276	.type __svml_dhypot_data_internal, @object
277	.size __svml_dhypot_data_internal, .-__svml_dhypot_data_internal
278

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_hypot2_core_sse4.S