svml_d_hypot8_core_avx512.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S]

/* Function hypot vectorized with AVX-512.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/*
 * ALGORITHM DESCRIPTION:
 *
 *      HIGH LEVEL OVERVIEW
 *
 *      Calculate z = (x*x+y*y)
 *      Calculate reciprocal sqrt (z)
 *      Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
 *      Calculate fixing part p with polynomial
 *      Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
 *
 *      ALGORITHM DETAILS
 *
 *      Multiprecision branch for _HA_ only
 *        Remove sign from both arguments
 *        Find maximum (_x) and minimum (_y) (by abs value) between arguments
 *        Split _x into _a and _b for multiprecision
 *        If _x >> _y we will not split _y for multiprecision
 *           all _y will be put into lower part (_d) and higher part (_c = 0)
 *           Fixing _hilo_mask for the case _x >> _y
 *        Split _y into _c and _d for multiprecision with fixed mask
 *
 *        compute Hi and Lo parts of _z = _x*_x + _y*_y
 *
 *        _zHi = _a*_a + _c*_c
 *        _zLo = (_x + _a)*_b + _d*_y + _d*_c
 *        _z = _zHi + _zLo
 *
 *      No multiprecision branch for _LA_ and _EP_
 *        _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
 *
 *      Check _z exponent to be within borders [3BC ; 441] else goto Callout
 *
 *      _s  ~ 1.0/sqrt(_z)
 *      _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
 *      _e[rror]  =  (1.0/_z + O) * _z - 1.0
 *      calculate fixing part _p
 *      _p = (((_POLY_C5*_e + _POLY_C4)*_e + _POLY_C3)*_e + _POLY_C2)*_e + _POLY_C1
 *      some parts of the polynomial are skipped for lower flavors
 *
 *      result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
 *
 *
 */
63
/* Offsets for data table __svml_dhypot_data_internal.
   Each value is a byte offset from the table's label; every entry in
   the table is one 64-byte row of eight identical quadwords, so the
   offsets advance in steps of 64.  They are used as
   `<offset>+__svml_dhypot_data_internal(%rip)` memory operands.  */
#define _dAbsMask	0
#define _lExpBound_uisa	64
#define _lExpBound	128
#define _dHalf	192
70
71	#include <sysdep.h>
72
73	.section .text.evex512, "ax", @progbits
74	ENTRY(_ZGVeN8vv_hypot_skx)
75	pushq %rbp
76	cfi_def_cfa_offset(`16`)
77	movq %rsp, %rbp
78	cfi_def_cfa(`6`, `16`)
79	cfi_offset(`6`, -`16`)
80	andq $-`64`, %rsp
81	subq $`256`, %rsp
82	vgetexppd {sae}, %zmm0, %zmm2
83	vgetexppd {sae}, %zmm1, %zmm3
84	vmovups _dHalf+__svml_dhypot_data_internal(%rip), %zmm9
85	vmaxpd {sae}, %zmm3, %zmm2, %zmm4
86	vmulpd {rn-sae}, %zmm0, %zmm0, %zmm2
87	vandpd _dAbsMask+__svml_dhypot_data_internal(%rip), %zmm4, %zmm5
88	vfmadd231pd {rn-sae}, %zmm1, %zmm1, %zmm2
89
90	/ Select exponent bound so that no scaling is needed /
91	vpcmpq $`5`, _lExpBound_uisa+__svml_dhypot_data_internal(%rip), %zmm5, %k0
92	vrsqrt14pd %zmm2, %zmm6
93	kmovw %k0, %edx
94	vmulpd {rn-sae}, %zmm6, %zmm2, %zmm7
95	vmulpd {rn-sae}, %zmm6, %zmm9, %zmm8
96	vfnmadd231pd {rn-sae}, %zmm7, %zmm8, %zmm9
97	vfmadd231pd {rn-sae}, %zmm9, %zmm8, %zmm8
98	vfmadd213pd {rn-sae}, %zmm7, %zmm7, %zmm9
99	vfnmadd231pd {rn-sae}, %zmm9, %zmm9, %zmm2
100	vfmadd213pd {rn-sae}, %zmm9, %zmm8, %zmm2
101
102	/ The end of implementation /
103	testl %edx, %edx
104
105	/ Go to special inputs processing branch /
106	jne L(SPECIAL_VALUES_BRANCH)
107	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2
108
109	/ Restore registers*
110	* and exit the function
111	*/
112
113	L(EXIT):
114	vmovaps %zmm2, %zmm0
115	movq %rbp, %rsp
116	popq %rbp
117	cfi_def_cfa(`7`, `8`)
118	cfi_restore(`6`)
119	ret
120	cfi_def_cfa(`6`, `16`)
121	cfi_offset(`6`, -`16`)
122
123	/ Branch to process*
124	* special inputs
125	*/
126
127	L(SPECIAL_VALUES_BRANCH):
128	vmovups %zmm0, `64`(%rsp)
129	vmovups %zmm1, `128`(%rsp)
130	vmovups %zmm2, `192`(%rsp)
131	# LOE rbx r12 r13 r14 r15 edx zmm2
132
133	xorl %eax, %eax
134	# LOE rbx r12 r13 r14 r15 eax edx
135
136	vzeroupper
137	movq %r12, `16`(%rsp)
138	/ DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) /
139	.cfi_escape `0x10`, `0x0c`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xc0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x10`, `0xff`, `0xff`, `0xff`, `0x22`
140	movl %eax, %r12d
141	movq %r13, `8`(%rsp)
142	/ DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) /
143	.cfi_escape `0x10`, `0x0d`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xc0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x08`, `0xff`, `0xff`, `0xff`, `0x22`
144	movl %edx, %r13d
145	movq %r14, (%rsp)
146	/ DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) /
147	.cfi_escape `0x10`, `0x0e`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xc0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x00`, `0xff`, `0xff`, `0xff`, `0x22`
148	# LOE rbx r15 r12d r13d
149
150	/ Range mask*
151	* bits check
152	*/
153
154	L(RANGEMASK_CHECK):
155	btl %r12d, %r13d
156
157	/ Call scalar math function /
158	jc L(SCALAR_MATH_CALL)
159	# LOE rbx r15 r12d r13d
160
161	/ Special inputs*
162	* processing loop
163	*/
164
165	L(SPECIAL_VALUES_LOOP):
166	incl %r12d
167	cmpl $`8`, %r12d
168
169	/ Check bits in range mask /
170	jl L(RANGEMASK_CHECK)
171	# LOE rbx r15 r12d r13d
172
173	movq `16`(%rsp), %r12
174	cfi_restore(`12`)
175	movq `8`(%rsp), %r13
176	cfi_restore(`13`)
177	movq (%rsp), %r14
178	cfi_restore(`14`)
179	vmovups `192`(%rsp), %zmm2
180
181	/ Go to exit /
182	jmp L(EXIT)
183	/ DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus) /
184	.cfi_escape `0x10`, `0x0c`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xc0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x10`, `0xff`, `0xff`, `0xff`, `0x22`
185	/ DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus) /
186	.cfi_escape `0x10`, `0x0d`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xc0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x08`, `0xff`, `0xff`, `0xff`, `0x22`
187	/ DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus) /
188	.cfi_escape `0x10`, `0x0e`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xc0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x00`, `0xff`, `0xff`, `0xff`, `0x22`
189	# LOE rbx r12 r13 r14 r15 zmm2
190
191	/ Scalar math function call*
192	* to process special input
193	*/
194
195	L(SCALAR_MATH_CALL):
196	movl %r12d, %r14d
197	vmovsd `64`(%rsp, %r14, `8`), %xmm0
198	vmovsd `128`(%rsp, %r14, `8`), %xmm1
199	call hypot@PLT
200	# LOE rbx r14 r15 r12d r13d xmm0
201
202	vmovsd %xmm0, `192`(%rsp, %r14, `8`)
203
204	/ Process special inputs in loop /
205	jmp L(SPECIAL_VALUES_LOOP)
206	# LOE rbx r15 r12d r13d
207	END(_ZGVeN8vv_hypot_skx)
208
209	.section .rodata, "a"
210	.align `64`
211
212	#ifdef __svml_dhypot_data_internal_typedef
213	typedef unsigned int VUINT32;
214	typedef struct {
215	__declspec(align(`64`)) VUINT32 _dAbsMask[`8`][`2`];
216	__declspec(align(`64`)) VUINT32 _lExpBound_uisa[`8`][`2`];
217	__declspec(align(`64`)) VUINT32 _lExpBound[`8`][`2`];
218	__declspec(align(`64`)) VUINT32 _dHalf[`8`][`2`];
219	} __svml_dhypot_data_internal;
220	#endif
221	__svml_dhypot_data_internal:
222	/ legacy algorithm /
223	.quad `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff` / _dAbsMask /
224	/ fma based algorithm/
225	.align `64`
226	.quad `0x407ff00000000000`, `0x407ff00000000000`, `0x407ff00000000000`, `0x407ff00000000000`, `0x407ff00000000000`, `0x407ff00000000000`, `0x407ff00000000000`, `0x407ff00000000000` / _lExpBound_uisa /
227	.align `64`
228	.quad `0x404f800000000000`, `0x404f800000000000`, `0x404f800000000000`, `0x404f800000000000`, `0x404f800000000000`, `0x404f800000000000`, `0x404f800000000000`, `0x404f800000000000` / _lExpBound /
229	.align `64`
230	.quad `0x3FE0000000000000`, `0x3FE0000000000000`, `0x3FE0000000000000`, `0x3FE0000000000000`, `0x3FE0000000000000`, `0x3FE0000000000000`, `0x3FE0000000000000`, `0x3FE0000000000000` / _dHalf /
231	.align `64`
232	.type __svml_dhypot_data_internal, @object
233	.size __svml_dhypot_data_internal, .-__svml_dhypot_data_internal
234

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S