/* svml_s_acoshf8_core_avx2.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S] */

/* Function acoshf vectorized with AVX2.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute acosh(x) as log(x + sqrt(x*x - 1))
 *
 *   Special cases:
 *
 *   acosh(NaN)  = quiet NaN, and raise invalid exception
 *   acosh(-INF) = NaN
 *   acosh(+INF) = +INF
 *   acosh(x)    = NaN if x < 1
 *   acosh(1)    = +0
 *
 */

/* Offsets for data table __svml_sacosh_data_internal.

   Each name is the byte offset of one constant inside the table.  Every
   constant occupies one 32-byte row (eight 32-bit words, one per lane),
   except sPoly, which spans eight consecutive rows (offsets 32..287).
   These values must stay in step with the order of the rows in the
   table itself.  */
#define sOne				0
#define sPoly				32
#define iBrkValue			288
#define iOffExpoMask			320
#define sBigThreshold			352
#define sC2				384
#define sC3				416
#define sHalf				448
#define sLargestFinite			480
#define sThirtyOne			512
#define sTopMask8			544
#define XScale				576
#define sLn2				608

#include <sysdep.h>

/* _ZGVdN8v_acoshf_avx2 -- acoshf on eight packed single-precision lanes.
 *
 * In:   %ymm0  = eight floats X.
 * Out:  %ymm0  = acoshf of every lane.
 *
 * Fast path (all lanes): evaluates log(X + sqrt(X*X - 1)) by feeding
 * (X - 1) + sqrt(X*X - 1) into a log1p-style reduction.  Lanes that are
 * not below sBigThreshold use X * XScale as the log argument instead and
 * get 31 added to the exponent term.
 *
 * Slow path: every lane that fails 1 < X <= sLargestFinite (NaN lanes
 * included, both compares are unordered-true) sets its bit in %edx; for
 * each such lane the result is replaced by a call to scalar acoshf.
 *
 * Frame: %rsp is realigned to 32 bytes and 96 bytes are reserved:
 *      0(%rsp)  saved %r14   (slow path only)
 *      8(%rsp)  saved %r13   (slow path only)
 *     16(%rsp)  saved %r12   (slow path only)
 *     32(%rsp)  copy of the input vector  (slow path only)
 *     64(%rsp)  result vector, patched lane by lane (slow path only)
 *
 * Scratch: %ymm1-%ymm15 and %edx; the slow path also uses %eax and
 * %r12-%r14 (the latter saved and restored).
 */
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_acoshf_avx2)
	pushq	%rbp
	cfi_def_cfa_offset(16)
	movq	%rsp, %rbp
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)
	andq	$-32, %rsp
	subq	$96, %rsp

	/* Load constants, always including One = 1 */
	vmovups	sOne+__svml_sacosh_data_internal(%rip), %ymm2

	/* Finally, express Y + W = U * V accurately where Y has <= 8 bits */
	vmovups	sTopMask8+__svml_sacosh_data_internal(%rip), %ymm9

	/*
	 * Now       1 / (1 + d)
	 * = 1 / (1 + (sqrt(1 - e) - 1))
	 * = 1 / sqrt(1 - e)
	 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
	 * So compute the first three nonconstant terms of that, so that
	 * we have a relative correction (1 + Corr) to apply to S etc.
	 * C1 = 1/2
	 * C2 = 3/8
	 * C3 = 5/16
	 */
	vmovups	sC3+__svml_sacosh_data_internal(%rip), %ymm14
	/* Keep the input in ymm3; ymm0 is reused as scratch below.  */
	vmovaps	%ymm0, %ymm3
	/* ymm7 = X*X - 1  */
	vmovaps	%ymm2, %ymm7
	vfmsub231ps %ymm3, %ymm3, %ymm7

	/*
	 * Check that 1 < X < +inf; otherwise go to the callout function.
	 * We need the callout for X = 1 to avoid division by zero below.
	 * This test ensures that callout handles NaN and either infinity.
	 */
	vcmpnle_uqps sLargestFinite+__svml_sacosh_data_internal(%rip), %ymm3, %ymm4
	vcmpngt_uqps %ymm2, %ymm3, %ymm5

	/*
	 * The following computation can go wrong for very large X, e.g.
	 * the X^2 - 1 = U * V can overflow. But for large X we have
	 * acosh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
	 * we can just later stick X back into the log and tweak up the exponent.
	 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
	 * to stay in the safe range for the later log computation.
	 * Compute a flag now telling us when to do this.
	 */
	/* ymm1 = all-ones in lanes with X < sBigThreshold (normal case).  */
	vcmplt_oqps sBigThreshold+__svml_sacosh_data_internal(%rip), %ymm3, %ymm1
	/* ymm10 = Y, the top bits of X*X - 1.  */
	vandps	%ymm9, %ymm7, %ymm10

	/*
	 * Compute R = 1/sqrt(Y + W) * (1 + d)
	 * Force R to <= 8 significant bits.
	 * This means that R * Y and R^2 * Y are exactly representable.
	 */
	vrsqrtps %ymm10, %ymm8
	/* ymm11 = W = (X*X - 1) - Y  */
	vsubps	%ymm10, %ymm7, %ymm11
	vandps	%ymm9, %ymm8, %ymm12

	/*
	 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
	 * and T = (W/sqrt(Y + W)) * (1 + d)
	 * so that S + T = sqrt(Y + W) * (1 + d)
	 * S is exact, and the rounding error in T is OK.
	 */
	vmulps	%ymm12, %ymm10, %ymm15
	vmulps	%ymm11, %ymm12, %ymm0

	/* Now multiplex to the case X = 2^-30 * input, Xl = 0 in the "big" case. */
	/* W in ymm11 has been consumed above, so the register is reused.  */
	vmulps	XScale+__svml_sacosh_data_internal(%rip), %ymm3, %ymm11

	/*
	 * Compute e = -(2 * d + d^2)
	 * The first FMR is exact, and the rounding error in the other is acceptable
	 * since d and e are ~ 2^-8
	 */
	vmovaps	%ymm2, %ymm13
	vfnmadd231ps %ymm15, %ymm12, %ymm13
	vfnmadd231ps %ymm0, %ymm12, %ymm13
	/* ymm14 = (C3 * e + C2) * e + 1/2;  ymm7 = Corr = e * ymm14  */
	vfmadd213ps sC2+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
	vfmadd213ps sHalf+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
	vmulps	%ymm14, %ymm13, %ymm7
	/* ymm6 = combined special-lane mask.  */
	vorps	%ymm5, %ymm4, %ymm6

	/*
	 * For low-accuracy versions, the computation can be done
	 * just as U + ((S + T) + (S + T) * Corr)
	 */
	vaddps	%ymm0, %ymm15, %ymm5

	/* sU is needed later on */
	vsubps	%ymm2, %ymm3, %ymm4
	vfmadd213ps %ymm5, %ymm7, %ymm5
	/* edx bit i is set when lane i needs the scalar callout.  */
	vmovmskps %ymm6, %edx
	vaddps	%ymm5, %ymm4, %ymm6

	/*
	 * Now resume the main code.
	 * reduction: compute r, n
	 */
	vmovups	iBrkValue+__svml_sacosh_data_internal(%rip), %ymm4

	/*
	 * Now we feed into the log1p code, using H in place of _VARG1 and
	 * also adding L into Xl.
	 * compute 1+x as high, low parts
	 */
	vmaxps	%ymm6, %ymm2, %ymm8
	vminps	%ymm6, %ymm2, %ymm9
	vaddps	%ymm9, %ymm8, %ymm12
	/* Log argument: high part in normal lanes, scaled X in big lanes.  */
	vblendvps %ymm1, %ymm12, %ymm11, %ymm14
	vsubps	%ymm12, %ymm8, %ymm10
	vpsubd	%ymm4, %ymm14, %ymm15
	/* ymm13 = low part of 1+x  */
	vaddps	%ymm10, %ymm9, %ymm13
	vpand	iOffExpoMask+__svml_sacosh_data_internal(%rip), %ymm15, %ymm14
	/* ymm15 = n, the integer exponent after the iBrkValue bias.  */
	vpsrad	$23, %ymm15, %ymm15
	vpaddd	%ymm4, %ymm14, %ymm8
	vpslld	$23, %ymm15, %ymm5
	vmovups	sPoly+224+__svml_sacosh_data_internal(%rip), %ymm4
	vcvtdq2ps %ymm15, %ymm0
	/* ymm7 = bit pattern of 1.0 minus (n << 23), i.e. the scale 2^-n.  */
	vpsubd	%ymm5, %ymm2, %ymm7

	/* polynomial evaluation */
	vsubps	%ymm2, %ymm8, %ymm2

	/* Add 31 to the exponent in the "large" case to get log(2 * input) */
	vaddps	sThirtyOne+__svml_sacosh_data_internal(%rip), %ymm0, %ymm5
	/* The low part is dropped (zeroed) in big lanes.  */
	vandps	%ymm1, %ymm13, %ymm6
	vmulps	%ymm7, %ymm6, %ymm9
	vblendvps %ymm1, %ymm0, %ymm5, %ymm0
	vaddps	%ymm2, %ymm9, %ymm2
	/* Horner evaluation of sPoly, highest coefficient (P7) first.  */
	vfmadd213ps sPoly+192+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+160+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+128+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+96+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+64+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+32+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	vfmadd213ps sPoly+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
	/* ymm6 = r + r^2 * P(r)  */
	vmulps	%ymm4, %ymm2, %ymm6
	vfmadd213ps %ymm2, %ymm2, %ymm6

	/* final reconstruction */
	/* ymm0 = n * ln2 + ymm6  */
	vfmadd132ps sLn2+__svml_sacosh_data_internal(%rip), %ymm6, %ymm0
	testl	%edx, %edx

	/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm3

	/* Restore registers
	 * and exit the function
	 */

L(EXIT):
	movq	%rbp, %rsp
	popq	%rbp
	cfi_def_cfa(7, 8)
	cfi_restore(6)
	ret
	/* Re-establish the in-frame CFI for the code that follows the ret.  */
	cfi_def_cfa(6, 16)
	cfi_offset(6, -16)

	/* Branch to process
	 * special inputs
	 */

L(SPECIAL_VALUES_BRANCH):
	/* Spill input and fast-path result so lanes can be patched in memory.  */
	vmovups	%ymm3, 32(%rsp)
	vmovups	%ymm0, 64(%rsp)
	# LOE rbx r12 r13 r14 r15 edx ymm0

	xorl	%eax, %eax
	# LOE rbx r12 r13 r14 r15 eax edx

	vzeroupper
	movq	%r12, 16(%rsp)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* r12d = lane index, starting at 0.  */
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* r13d = special-lane bitmask; it must survive the scalar calls.  */
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Range mask
	 * bits check
	 */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Special inputs
	 * processing loop
	 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$8, %r12d

	/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	/* Reload the result vector with the patched lanes.  */
	vmovups	64(%rsp), %ymm0

	/* Go to exit */
	jmp	L(EXIT)
	/*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call
	 * to process special input
	 */

L(SCALAR_MATH_CALL):
	/* r14 = zero-extended lane index, used to address both spill slots.  */
	movl	%r12d, %r14d
	vmovss	32(%rsp, %r14, 4), %xmm0
	call	acoshf@PLT
	# LOE rbx r14 r15 r12d r13d xmm0

	vmovss	%xmm0, 64(%rsp, %r14, 4)

	/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN8v_acoshf_avx2)

/* Constant table for the AVX2 acoshf kernel.

   Each constant is replicated across eight 32-bit lanes and starts on a
   32-byte boundary, so the rows sit at the byte offsets 0, 32, 64, ...
   that the code adds to the symbol __svml_sacosh_data_internal.  Do not
   reorder or resize rows without updating those offsets.  */
	.section .rodata, "a"
	.align	32

#ifdef __svml_sacosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(32)) VUINT32 sOne[8][1];
	__declspec(align(32)) VUINT32 sPoly[8][8][1];
	__declspec(align(32)) VUINT32 iBrkValue[8][1];
	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
	__declspec(align(32)) VUINT32 sBigThreshold[8][1];
	__declspec(align(32)) VUINT32 sC2[8][1];
	__declspec(align(32)) VUINT32 sC3[8][1];
	__declspec(align(32)) VUINT32 sHalf[8][1];
	__declspec(align(32)) VUINT32 sLargestFinite[8][1];
	__declspec(align(32)) VUINT32 sThirtyOne[8][1];
	__declspec(align(32)) VUINT32 sTopMask8[8][1];
	__declspec(align(32)) VUINT32 XScale[8][1];
	__declspec(align(32)) VUINT32 sLn2[8][1];
} __svml_sacosh_data_internal;
#endif
__svml_sacosh_data_internal:
	/* sOne = SP 1.0 */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* sPoly[] = SP polynomial */
	.align	32
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
	/* iBrkValue = SP 2/3 */
	.align	32
	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
	/* iOffExpoMask = SP significand mask */
	.align	32
	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
	/* sBigThreshold */
	.align	32
	.long	0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
	/* sC2 */
	.align	32
	.long	0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
	/* sC3 */
	.align	32
	.long	0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
	/* sHalf */
	.align	32
	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
	/* sLargestFinite */
	.align	32
	.long	0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
	/* sThirtyOne */
	.align	32
	.long	0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
	/* sTopMask8 */
	.align	32
	.long	0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
	/* XScale */
	.align	32
	.long	0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000
	/* sLn2 = SP ln(2) */
	.align	32
	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
	.align	32
	.type	__svml_sacosh_data_internal, @object
	.size	__svml_sacosh_data_internal, .-__svml_sacosh_data_internal


/* source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S */