s_cbrtf.S source code [glibc/sysdeps/i386/fpu/s_cbrtf.S]

1	/ Compute cubic root of float value.*
2	Copyright (C) 1997-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <machine/asm.h>
20	#include <libm-alias-float.h>
21
22	.section .rodata
23
24	.align ALIGNARG(`4`)
25	.type f3,@object
26	f3: .double `0.191502161678719066`
27	ASM_SIZE_DIRECTIVE(f3)
28	.type f2,@object
29	f2: .double `0.697570460207922770`
30	ASM_SIZE_DIRECTIVE(f2)
31	.type f1,@object
32	f1: .double `0.492659620528969547`
33	ASM_SIZE_DIRECTIVE(f1)
34
35	#define CBRT2 1.2599210498948731648
36	#define ONE_CBRT2 0.793700525984099737355196796584
37	#define SQR_CBRT2 1.5874010519681994748
38	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
39
40	.type factor,@object
41	.align ALIGNARG(`4`)
42	factor: .double ONE_SQR_CBRT2
43	.double ONE_CBRT2
44	.double `1.0`
45	.double CBRT2
46	.double SQR_CBRT2
47	ASM_SIZE_DIRECTIVE(factor)
48
49	.type two25,@object
50	two25: .byte `0`, `0`, `0`, `0x4c`
51	ASM_SIZE_DIRECTIVE(two25)
52
53	#ifdef PIC
54	#define MO(op) op##@GOTOFF(%ebx)
55	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
56	#else
57	#define MO(op) op
58	#define MOX(op,x) op(x)
59	#endif
60
61	.text
62	ENTRY(__cbrtf)
63	movl `4`(%esp), %eax
64	xorl %ecx, %ecx
65	movl %eax, %edx
66	andl $`0x7fffffff`, %eax
67	jz `1f`
68	cmpl $`0x7f800000`, %eax
69	jae `1f`
70
71	#ifdef PIC
72	pushl %ebx
73	cfi_adjust_cfa_offset (`4`)
74	cfi_rel_offset (ebx, `0`)
75	LOAD_PIC_REG (bx)
76	#endif
77
78	cmpl $`0x00800000`, %eax
79	jae `2f`
80
81	#ifdef PIC
82	flds `8`(%esp)
83	#else
84	flds `4`(%esp)
85	#endif
86	fmuls MO(two25)
87	movl $-`25`, %ecx
88	#ifdef PIC
89	fstps `8`(%esp)
90	movl `8`(%esp), %eax
91	#else
92	fstps `4`(%esp)
93	movl `4`(%esp), %eax
94	#endif
95	movl %eax, %edx
96	andl $`0x7fffffff`, %eax
97
98	`2`: shrl $`23`, %eax
99	andl $`0x807fffff`, %edx
100	subl $`126`, %eax
101	orl $`0x3f000000`, %edx
102	addl %eax, %ecx
103	#ifdef PIC
104	movl %edx, `8`(%esp)
105
106	flds `8`(%esp) / xm /
107	#else
108	movl %edx, `4`(%esp)
109
110	flds `4`(%esp) / xm /
111	#endif
112	fabs
113
114	/ The following code has two tracks:*
115	a) compute the normalized cbrt value
116	b) compute xe/3 and xe%3
117	The right track computes the value for b) and this is done
118	in an optimized way by avoiding division.
119
120	But why two tracks at all? Very easy: efficiency. Some FP
121	instruction can overlap with a certain amount of integer (and
122	FP) instructions. So we get (except for the imull) all
123	instructions for free. /*
124
125	fld %st(`0`) / xm : xm /
126	fmull MO(f3) / f3xm : xm /*
127	movl $`1431655766`, %eax
128	fsubrl MO(f2) / f2-f3xm : xm /*
129	imull %ecx
130	fmul %st(`1`) / (f2-f3xm)xm : xm /
131	movl %ecx, %eax
132	faddl MO(f1) / u:=f1+(f2-f3xm)xm : xm /
133	sarl $`31`, %eax
134	fld %st / u : u : xm /
135	subl %eax, %edx
136	fmul %st(`1`) / uu : u : xm /*
137	fld %st(`2`) / xm : uu : u : xm /*
138	fadd %st / 2xm : uu : u : xm /
139	fxch %st(`1`) / uu : 2xm : u : xm /
140	fmul %st(`2`) / t2:=uuu : 2xm : u : xm /*
141	movl %edx, %eax
142	fadd %st, %st(`1`) / t2 : t2+2xm : u : xm /*
143	leal (%edx,%edx,`2`),%edx
144	fadd %st(`0`) / 2t2 : t2+2xm : u : xm /
145	subl %edx, %ecx
146	faddp %st, %st(`3`) / t2+2xm : u : 2t2+xm /
147	shll $`3`, %ecx
148	fmulp / u(t2+2xm) : 2t2+xm /*
149	fdivp %st, %st(`1`) / u(t2+2xm)/(2t2+xm) /*
150	fmull MOX(`16`+factor,%ecx) / u(t2+2xm)/(2t2+xm)FACT /
151	pushl %eax
152	cfi_adjust_cfa_offset (`4`)
153	fildl (%esp) / xe/3 : u(t2+2xm)/(2t2+xm)FACT /
154	fxch / u(t2+2xm)/(2t2+xm)FACT : xe/3 /
155	fscale / u(t2+2xm)/(2t2+xm)FACT2^xe/3 /*
156	popl %edx
157	cfi_adjust_cfa_offset (-`4`)
158	#ifdef PIC
159	movl `8`(%esp), %eax
160	popl %ebx
161	cfi_adjust_cfa_offset (-`4`)
162	cfi_restore (ebx)
163	#else
164	movl `4`(%esp), %eax
165	#endif
166	testl %eax, %eax
167	fstp %st(`1`)
168	jns `4f`
169	fchs
170	`4`: ret
171
172	/ Return the argument. /
173	`1`: flds `4`(%esp)
174	ret
175	END(__cbrtf)
176	libm_alias_float (__cbrt, cbrt)
177

source code of glibc/sysdeps/i386/fpu/s_cbrtf.S