/* Function asinhf vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute asinh(x) as log(x + sqrt(x*x + 1))
 *
 *   Special cases:
 *
 *   asinh(NaN) = quiet NaN, and raise invalid exception
 *   asinh(INF) = that INF
 *   asinh(0)   = that 0
 *
 */
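/*
 * For orientation, a rough scalar C sketch of the formula above.  This only
 * illustrates the mathematics; the vector code below evaluates the same
 * expression in carefully split high/low pieces:
 *
 *   float asinhf_ref (float x)
 *   {
 *     float ax = fabsf (x);
 *     return copysignf (logf (ax + sqrtf (ax * ax + 1.0f)), x);
 *   }
 */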

/* Offsets for data table __svml_sasinh_data_internal
 */
#define SgnMask 0
#define sOne 16
#define sPoly 32
#define iBrkValue 160
#define iOffExpoMask 176
#define sBigThreshold 192
#define sC2 208
#define sC3 224
#define sHalf 240
#define sLargestFinite 256
#define sLittleThreshold 272
#define sSign 288
#define sThirtyOne 304
#define sTopMask11 320
#define sTopMask8 336
#define XScale 352
#define sLn2 368

#include <sysdep.h>

        .section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_asinhf_sse4)
        subq    $72, %rsp
        cfi_def_cfa_offset(80)
        movaps  %xmm0, %xmm8

        /*
         * Split X into high and low parts, XHi (<= 11 bits) and XLo (<= 13 bits)
         * We could use either X or |X| here, but it doesn't seem to matter
         */
        movups  sTopMask11+__svml_sasinh_data_internal(%rip), %xmm10
        movaps  %xmm8, %xmm2
        andps   %xmm8, %xmm10

        /*
         * Compute X^2 = (XHi + XLo)^2 = XHi^2 + XLo * (X + XHi)
         * The two parts are shifted off by around 11 bits.  So even though
         * the low bit will not in general be exact, it's near enough
         */
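        /*
         * In scalar terms, the split and the two-part squaring are roughly
         * (a sketch; bits()/as_float() are illustrative helpers, not code
         * in this file):
         *
         *   xhi  = as_float (bits (x) & 0xFFFFE000); // sTopMask11: top 11 bits
         *   xlo  = x - xhi;                          // exact low remainder
         *   x2hi = xhi * xhi;                        // exact: 11 bits squared
         *   x2lo = xlo * (x + xhi);                  // == 2*xhi*xlo + xlo^2,
         *                                            //    up to one rounding
         */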
        movaps  %xmm10, %xmm3
        subps   %xmm10, %xmm2
        mulps   %xmm10, %xmm3
        addps   %xmm8, %xmm10

        /* Load the constant 1 and a sign mask */
        movups  sOne+__svml_sasinh_data_internal(%rip), %xmm7

        /*
         * Finally, express Y + W = X^2 + 1 accurately where Y has <= 8 bits.
         * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1
         * as the dominant component in the compensated summation.  Otherwise,
         * if |X| >= 1, then since X2Hi only has 22 significant bits, the basic
         * addition will be exact anyway until we get to |X| >= 2^24.  But by
         * that time the log function is well-conditioned enough that the
         * rounding error doesn't matter.  Hence we can treat 1 as dominant even
         * if it literally isn't.
         */
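        /*
         * A scalar sketch of that compensated summation, treating 1.0 as the
         * dominant term as argued above (names illustrative):
         *
         *   y = 1.0f + x2hi;               // leading part of X^2 + 1
         *   w = (1.0f - y + x2hi) + x2lo;  // residual the addition lost
         *   // afterwards y is truncated to its top 8 bits (sTopMask8) and
         *   // the shaved-off bits are folded into w as well
         */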
        movaps  %xmm7, %xmm11
        movaps  %xmm7, %xmm4
        movups  sTopMask8+__svml_sasinh_data_internal(%rip), %xmm12
        addps   %xmm3, %xmm11
        mulps   %xmm10, %xmm2
        subps   %xmm11, %xmm4
        movaps  %xmm12, %xmm0
        addps   %xmm3, %xmm4

        /*
         * Unfortunately, we can still be in trouble if |X| <= 2^-5, since
         * the absolute error 2^-(7+24)-ish in sqrt(1 + X^2) gets scaled up
         * by 1/X and comes close to our threshold.  Hence if |X| <= 2^-4,
         * perform an alternative computation
         *   sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16
         * X2 = X^2
         */
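        /*
         * That alternative branch is the Taylor expansion of sqrt(1 + u) - 1
         * with u = X^2, evaluated in two pieces.  Roughly (sketch):
         *
         *   u2 = 0.5f * x2;              // X^2/2      (sHalf)
         *   u4 = u2 * u2;                // X^4/4
         *   t  = 0.5f * (u2 * u4 - u4);  // -X^4/8 + X^6/16
         *   // result =~= u2 + t, carried as a hi/lo pair
         */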
        addps   %xmm2, %xmm3
        addps   %xmm2, %xmm4
        andps   %xmm11, %xmm0

        /*
         * Compute R = 1/sqrt(Y + W) * (1 + d)
         * Force R to <= 8 significant bits.
         * This means that R * Y and R^2 * Y are exactly representable.
         */
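        /*
         * In effect (scalar sketch, helper name illustrative):
         *
         *   r = rsqrt_approx (y);              // rsqrtps, small relative error d
         *   r = as_float (bits (r) & 0xFFFF0000);  // sTopMask8: <= 8 bits kept
         *   // with so few bits in r, products like r*y and r*r*y round exactly
         */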
        rsqrtps %xmm0, %xmm14
        subps   %xmm0, %xmm11
        andps   %xmm12, %xmm14
        addps   %xmm11, %xmm4

        /*
         * Compute S = (Y/sqrt(Y + W)) * (1 + d)
         * and T = (W/sqrt(Y + W)) * (1 + d)
         * so that S + T = sqrt(Y + W) * (1 + d)
         * S is exact, and the rounding error in T is OK.
         */
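        /*
         * I.e. (sketch):
         *
         *   s = r * y;   // exact: r and the truncated y each have <= 8 bits
         *   t = r * w;   // small, so its rounding error is benign
         *   // s + t = sqrt (y + w) * (1 + d), still carrying the rsqrt error d
         */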
        mulps   %xmm14, %xmm0
        mulps   %xmm14, %xmm4

        /*
         * Get the absolute value of the input, since we will exploit antisymmetry
         * and mostly assume X >= 0 in the core computation
         */
        movups  SgnMask+__svml_sasinh_data_internal(%rip), %xmm6

        /*
         * Compute e = -(2 * d + d^2)
         * The first FMR is exact, and the rounding error in the other is acceptable
         * since d and e are ~ 2^-8
         */
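        /*
         * Sketch of the derivation: since r * (s + t) = (1 + d)^2,
         *
         *   e = 1.0f - (r * s + r * t);   // == -(2*d + d^2)
         *
         * r*s is exact (both factors carry few significand bits); the rounding
         * error in r*t is acceptable because d and e are only ~ 2^-8.
         */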
        movaps  %xmm14, %xmm13
        andps   %xmm8, %xmm6

        /*
         * Obtain sqrt(1 + X^2) - 1 in two pieces
         *   sqrt(1 + X^2) - 1
         * = sqrt(Y + W) - 1
         * = (S + T) * (1 + Corr) - 1
         * = [S - 1] + [T + (S + T) * Corr]
         * We need a compensated summation for the last part.  We treat S - 1
         * as the larger part; it certainly is until about X < 2^-4, and in that
         * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1
         * Final sum is dTmp5 (hi) + dTmp7 (lo)
         */
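        /*
         * Putting the pieces together (sketch; Corr comes from the 1/(1 + d)
         * series computed below):
         *
         *   hi = s - 1.0f;            // exact while s is near 1
         *   lo = t + (s + t) * corr;  // low part of the compensated sum
         *   // hi + lo =~= sqrt(1 + X^2) - 1, carried as a hi/lo pair
         */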
        movaps  %xmm0, %xmm1

        /*
         * Check whether the input is finite, by checking |X| <= MaxFloat
         * Otherwise set the rangemask so that the callout will get used.
         * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat)
         */
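        /*
         * Sketch of that lane predicate: special = !(fabsf (x) <= FLT_MAX),
         * which is true for +/-Inf, and for NaN as well because every
         * comparison against NaN is false.
         */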
        movaps  %xmm6, %xmm9

        /*
         * The following computation can go wrong for very large X, basically
         * because X^2 overflows.  But for large X we have
         * asinh(X) =~= log(2 X) + 1/(4 * X^2), so for X >= 2^30
         * we can just later stick X back into the log and tweak up the exponent.
         * Actually we scale X by 2^-30 and tweak the exponent up by 31,
         * to stay in the safe range for the later log computation.
         * Compute a flag now telling us when to do this.
         */
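        /*
         * In other words (sketch): once the 1/(4 * X^2) term is far below the
         * final rounding error, asinh(X) can be computed as
         *
         *   asinh(X) =~= log (2*X) = log (X * 2^-30) + 31 * log (2)
         *
         * The lane mask computed here (|x| < sBigThreshold = 2^30) selects
         * between the ordinary path and this rescaled path further down.
         */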
        movaps  %xmm6, %xmm5
        cmpnleps sLargestFinite+__svml_sasinh_data_internal(%rip), %xmm9
        cmpltps sBigThreshold+__svml_sasinh_data_internal(%rip), %xmm5
        mulps   %xmm0, %xmm13
        addps   %xmm4, %xmm1
        subps   %xmm7, %xmm0
        mulps   %xmm4, %xmm14
        movmskps %xmm9, %edx
        movaps  %xmm7, %xmm9

        /*
         * Now 1 / (1 + d)
         * = 1 / (1 + (sqrt(1 - e) - 1))
         * = 1 / sqrt(1 - e)
         * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
         * So compute the first three nonconstant terms of that, so that
         * we have a relative correction (1 + Corr) to apply to S etc.
         * C1 = 1/2
         * C2 = 3/8
         * C3 = 5/16
         */
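        /*
         * Horner form of that correction, as evaluated below (sketch):
         *
         *   corr = e * (C1 + e * (C2 + e * C3));   // C1 = sHalf
         *   // (1 + corr) =~= 1/(1 + d), applied multiplicatively to S + T
         */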
        movups  sC3+__svml_sasinh_data_internal(%rip), %xmm15
        subps   %xmm13, %xmm9
        movups  sHalf+__svml_sasinh_data_internal(%rip), %xmm10
        subps   %xmm14, %xmm9

        /* sX2over2 = X^2/2 */
        mulps   %xmm10, %xmm3
        mulps   %xmm9, %xmm15

        /* sX46 = -X^4/4 + X^6/8 */
        movaps  %xmm3, %xmm2
        movaps  %xmm3, %xmm12

        /*
         * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1].
         * It's always safe to assume |X| is larger.
         * This is the final 2-part argument to the log1p function
         */
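        /*
         * Standard Fast2Sum with the larger operand known (sketch):
         *
         *   h = fabsf (x) + hi;            // leading part of the log1p argument
         *   l = (fabsf (x) - h + hi) + lo; // recovered rounding error + low part
         */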
        movaps  %xmm6, %xmm14
        addps   sC2+__svml_sasinh_data_internal(%rip), %xmm15
        mulps   %xmm9, %xmm15
        addps   %xmm10, %xmm15
        mulps   %xmm15, %xmm9
        mulps   %xmm1, %xmm9

        /* Now multiplex to the case X = 2^-30 * input, Xl = sL = 0 in the "big" case. */
        movups  XScale+__svml_sasinh_data_internal(%rip), %xmm15
        addps   %xmm9, %xmm4
        movaps  %xmm4, %xmm11
        addps   %xmm0, %xmm11
        subps   %xmm11, %xmm0
        addps   %xmm0, %xmm4

        /* sX4over4 = X^4/4 */
        movaps  %xmm3, %xmm0
        mulps   %xmm3, %xmm0
        mulps   %xmm0, %xmm2
        subps   %xmm0, %xmm2

        /*
         * Now we feed into the log1p code, using H in place of _VARG1 and
         * also adding L into Xl.
         * compute 1+x as high, low parts
         */
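        /*
         * The log1p stage wants its argument as 1 + arg split into high and
         * low parts; with the 2-part input (h, l) built above this is roughly:
         *
         *   yhi = max (1.0f, h);  ylo = min (1.0f, h); // order the addends
         *   s   = yhi + ylo;                           // high part of 1 + h
         *   c   = (yhi - s + ylo) + l;                 // low part, L folded in
         */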
        movaps  %xmm7, %xmm0

        /* sX46over2 = -X^4/8 + X^6/16 */
        mulps   %xmm2, %xmm10
        movaps  %xmm7, %xmm2
        addps   %xmm10, %xmm12
        subps   %xmm12, %xmm3
        addps   %xmm3, %xmm10

        /* Now multiplex the two possible computations */
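        /*
         * Each blend below is the classic and/andnot/or select idiom (sketch):
         *
         *   result = (mask & small_case) | (~mask & big_case);
         *
         * with mask = (|x| <= sLittleThreshold = 2^-4) choosing the series
         * branch over the sqrt-based branch.
         */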
        movaps  %xmm6, %xmm3
        cmpleps sLittleThreshold+__svml_sasinh_data_internal(%rip), %xmm3
        movaps  %xmm3, %xmm13
        andps   %xmm3, %xmm12
        andnps  %xmm11, %xmm13
        movaps  %xmm3, %xmm1
        orps    %xmm12, %xmm13
        andnps  %xmm4, %xmm1
        andps   %xmm3, %xmm10
        movaps  %xmm6, %xmm4
        orps    %xmm10, %xmm1
        addps   %xmm13, %xmm14
        mulps   %xmm15, %xmm6
        maxps   %xmm14, %xmm0
        minps   %xmm14, %xmm2
        subps   %xmm14, %xmm4
        movaps  %xmm0, %xmm3
        addps   %xmm4, %xmm13
        addps   %xmm2, %xmm3
        addps   %xmm13, %xmm1
        subps   %xmm3, %xmm0
        movaps  %xmm5, %xmm4
        andps   %xmm5, %xmm3
        andnps  %xmm6, %xmm4
        addps   %xmm0, %xmm2

        /*
         * Now resume the main code.
         * reduction: compute r, n
         */
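        /*
         * The log1p reduction below works directly on the bit patterns
         * (sketch; bits()/as_float() are illustrative helpers):
         *
         *   i = bits (s) - iBrkValue;             // iBrkValue = bits of 2/3
         *   n = i >> 23;                          // extracted scaling exponent
         *   m = (i & iOffExpoMask) + iBrkValue;   // mantissa in [2/3, 4/3)
         *   r = (as_float (m) - 1.0f)
         *     + c * as_float (bits (1.0f) - (n << 23));  // low part times 2^-n
         *   // log (s + c) = n * ln2 + log1p (r), log1p done by polynomial
         */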
        movdqu  iBrkValue+__svml_sasinh_data_internal(%rip), %xmm6
        orps    %xmm3, %xmm4
        psubd   %xmm6, %xmm4
        movaps  %xmm7, %xmm0
        addps   %xmm2, %xmm1
        movdqu  iOffExpoMask+__svml_sasinh_data_internal(%rip), %xmm2
        pand    %xmm4, %xmm2
        psrad   $23, %xmm4
        cvtdq2ps %xmm4, %xmm3
        pslld   $23, %xmm4
        andps   %xmm5, %xmm1
        paddd   %xmm6, %xmm2
        psubd   %xmm4, %xmm0
        mulps   %xmm0, %xmm1

        /* polynomial evaluation */
        subps   %xmm7, %xmm2
        movups  sPoly+112+__svml_sasinh_data_internal(%rip), %xmm7
        addps   %xmm2, %xmm1
        mulps   %xmm1, %xmm7
        movaps  %xmm5, %xmm2

        /* Add 31 to the exponent in the "large" case to get log(2 * input) */
        movups  sThirtyOne+__svml_sasinh_data_internal(%rip), %xmm0
        addps   sPoly+96+__svml_sasinh_data_internal(%rip), %xmm7
        addps   %xmm3, %xmm0
        mulps   %xmm1, %xmm7
        andnps  %xmm0, %xmm2
        andps   %xmm5, %xmm3
        orps    %xmm3, %xmm2
        addps   sPoly+80+__svml_sasinh_data_internal(%rip), %xmm7

        /* final reconstruction */
        mulps   sLn2+__svml_sasinh_data_internal(%rip), %xmm2
        mulps   %xmm1, %xmm7

        /* Finally, reincorporate the original sign. */
        movups  sSign+__svml_sasinh_data_internal(%rip), %xmm0
        andps   %xmm8, %xmm0
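        /*
         * Tail of the computation (sketch): Horner over the 8 sPoly
         * coefficients, then scale by ln2 and restore the sign via the
         * top bit:
         *
         *   p   = r * r * (P0 + r * (P1 + ... + r * P7));
         *   res = (r + p) + n * ln2;   // n includes the +31 for the big case
         *   return as_float (bits (res) ^ (bits (x) & 0x80000000));
         */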
        addps   sPoly+64+__svml_sasinh_data_internal(%rip), %xmm7
        mulps   %xmm1, %xmm7
        addps   sPoly+48+__svml_sasinh_data_internal(%rip), %xmm7
        mulps   %xmm1, %xmm7
        addps   sPoly+32+__svml_sasinh_data_internal(%rip), %xmm7
        mulps   %xmm1, %xmm7
        addps   sPoly+16+__svml_sasinh_data_internal(%rip), %xmm7
        mulps   %xmm1, %xmm7
        addps   sPoly+__svml_sasinh_data_internal(%rip), %xmm7
        mulps   %xmm1, %xmm7
        mulps   %xmm1, %xmm7
        addps   %xmm7, %xmm1
        addps   %xmm2, %xmm1
        pxor    %xmm1, %xmm0
        testl   %edx, %edx

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm8

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        addq    $72, %rsp
        cfi_def_cfa_offset(8)
        ret
        cfi_def_cfa_offset(80)

        /* Branch to process
         * special inputs
         */

L(SPECIAL_VALUES_BRANCH):
        movups  %xmm8, 32(%rsp)
        movups  %xmm0, 48(%rsp)
        # LOE rbx rbp r12 r13 r14 r15 edx

        xorl    %eax, %eax
        movq    %r12, 16(%rsp)
        cfi_offset(12, -64)
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        cfi_offset(13, -72)
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        cfi_offset(14, -80)
        # LOE rbx rbp r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx rbp r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $4, %r12d

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx rbp r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        movups  48(%rsp), %xmm0

        /* Go to exit */
        jmp     L(EXIT)
        cfi_offset(12, -64)
        cfi_offset(13, -72)
        cfi_offset(14, -80)
        # LOE rbx rbp r12 r13 r14 r15 xmm0

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        movss   32(%rsp, %r14, 4), %xmm0
        call    asinhf@PLT
        # LOE rbx rbp r14 r15 r12d r13d xmm0

        movss   %xmm0, 48(%rsp, %r14, 4)

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_asinhf_sse4)

        .section .rodata, "a"
        .align  16

#ifdef __svml_sasinh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(16)) VUINT32 SgnMask[4][1];
        __declspec(align(16)) VUINT32 sOne[4][1];
        __declspec(align(16)) VUINT32 sPoly[8][4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 sBigThreshold[4][1];
        __declspec(align(16)) VUINT32 sC2[4][1];
        __declspec(align(16)) VUINT32 sC3[4][1];
        __declspec(align(16)) VUINT32 sHalf[4][1];
        __declspec(align(16)) VUINT32 sLargestFinite[4][1];
        __declspec(align(16)) VUINT32 sLittleThreshold[4][1];
        __declspec(align(16)) VUINT32 sSign[4][1];
        __declspec(align(16)) VUINT32 sThirtyOne[4][1];
        __declspec(align(16)) VUINT32 sTopMask11[4][1];
        __declspec(align(16)) VUINT32 sTopMask8[4][1];
        __declspec(align(16)) VUINT32 XScale[4][1];
        __declspec(align(16)) VUINT32 sLn2[4][1];
} __svml_sasinh_data_internal;
#endif
__svml_sasinh_data_internal:
        /* SgnMask */
        .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
        /* sOne = SP 1.0 */
        .align  16
        .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /* sPoly[] = SP polynomial */
        .align  16
        .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
        .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
        .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
        .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
        .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
        .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
        .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
        .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
        /* iBrkValue = SP 2/3 */
        .align  16
        .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /* iOffExpoMask = SP significand mask */
        .align  16
        .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /* sBigThreshold = 2^30 */
        .align  16
        .long   0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
        /* sC2 = 3/8 */
        .align  16
        .long   0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
        /* sC3 = 5/16 */
        .align  16
        .long   0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
        /* sHalf = 1/2 */
        .align  16
        .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
        /* sLargestFinite = FLT_MAX */
        .align  16
        .long   0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
        /* sLittleThreshold = 2^-4 */
        .align  16
        .long   0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000
        /* sSign = SP sign bit */
        .align  16
        .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
        /* sThirtyOne = SP 31.0 */
        .align  16
        .long   0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
        /* sTopMask11 */
        .align  16
        .long   0xFFFFE000, 0xFFFFE000, 0xFFFFE000, 0xFFFFE000
        /* sTopMask8 */
        .align  16
        .long   0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
        /* XScale = 2^-30 */
        .align  16
        .long   0x30800000, 0x30800000, 0x30800000, 0x30800000
        /* sLn2 = SP ln(2) */
        .align  16
        .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
        .align  16
        .type   __svml_sasinh_data_internal, @object
        .size   __svml_sasinh_data_internal, .-__svml_sasinh_data_internal
