s_cbrt.S source code [glibc/sysdeps/i386/fpu/s_cbrt.S]

1	/ Compute cubic root of double value.*
2	Copyright (C) 1997-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <machine/asm.h>
20	#include <libm-alias-double.h>
21
22	.section .rodata
23
24	.align ALIGNARG(`4`)
25	.type f7,@object
26	f7: .double -`0.145263899385486377`
27	ASM_SIZE_DIRECTIVE(f7)
28	.type f6,@object
29	f6: .double `0.784932344976639262`
30	ASM_SIZE_DIRECTIVE(f6)
31	.type f5,@object
32	f5: .double -`1.83469277483613086`
33	ASM_SIZE_DIRECTIVE(f5)
34	.type f4,@object
35	f4: .double `2.44693122563534430`
36	ASM_SIZE_DIRECTIVE(f4)
37	.type f3,@object
38	f3: .double -`2.11499494167371287`
39	ASM_SIZE_DIRECTIVE(f3)
40	.type f2,@object
41	f2: .double `1.50819193781584896`
42	ASM_SIZE_DIRECTIVE(f2)
43	.type f1,@object
44	f1: .double `0.354895765043919860`
45	ASM_SIZE_DIRECTIVE(f1)
46
47	#define CBRT2 1.2599210498948731648
48	#define ONE_CBRT2 0.793700525984099737355196796584
49	#define SQR_CBRT2 1.5874010519681994748
50	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
51
52	.type factor,@object
53	factor: .double ONE_SQR_CBRT2
54	.double ONE_CBRT2
55	.double `1.0`
56	.double CBRT2
57	.double SQR_CBRT2
58	ASM_SIZE_DIRECTIVE(factor)
59
60	.type two54,@object
61	two54: .byte `0`, `0`, `0`, `0`, `0`, `0`, `0x50`, `0x43`
62	ASM_SIZE_DIRECTIVE(two54)
63
64	#ifdef PIC
65	#define MO(op) op##@GOTOFF(%ebx)
66	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
67	#else
68	#define MO(op) op
69	#define MOX(op,x) op(x)
70	#endif
71
72	.text
73	ENTRY(__cbrt)
74	movl `4`(%esp), %ecx
75	movl `8`(%esp), %eax
76	movl %eax, %edx
77	andl $`0x7fffffff`, %eax
78	orl %eax, %ecx
79	jz `1f`
80	xorl %ecx, %ecx
81	cmpl $`0x7ff00000`, %eax
82	jae `1f`
83
84	#ifdef PIC
85	pushl %ebx
86	cfi_adjust_cfa_offset (`4`)
87	cfi_rel_offset (ebx, `0`)
88	LOAD_PIC_REG (bx)
89	#endif
90
91	cmpl $`0x00100000`, %eax
92	jae `2f`
93
94	#ifdef PIC
95	fldl `8`(%esp)
96	#else
97	fldl `4`(%esp)
98	#endif
99	fmull MO(two54)
100	movl $-`54`, %ecx
101	#ifdef PIC
102	fstpl `8`(%esp)
103	movl `12`(%esp), %eax
104	#else
105	fstpl `4`(%esp)
106	movl `8`(%esp), %eax
107	#endif
108	movl %eax, %edx
109	andl $`0x7fffffff`, %eax
110
111	`2`: shrl $`20`, %eax
112	andl $`0x800fffff`, %edx
113	subl $`1022`, %eax
114	orl $`0x3fe00000`, %edx
115	addl %eax, %ecx
116	#ifdef PIC
117	movl %edx, `12`(%esp)
118
119	fldl `8`(%esp) / xm /
120	#else
121	movl %edx, `8`(%esp)
122
123	fldl `4`(%esp) / xm /
124	#endif
125	fabs
126
127	/ The following code has two tracks:*
128	a) compute the normalized cbrt value
129	b) compute xe/3 and xe%3
130	The right track computes the value for b) and this is done
131	in an optimized way by avoiding division.
132
133	But why two tracks at all? Very easy: efficiency. Some FP
134	instruction can overlap with a certain amount of integer (and
135	FP) instructions. So we get (except for the imull) all
136	instructions for free. /*
137
138	fld %st(`0`) / xm : xm /
139
140	fmull MO(f7) / f7xm : xm /*
141	movl $`1431655766`, %eax
142	faddl MO(f6) / f6+f7xm : xm /*
143	imull %ecx
144	fmul %st(`1`) / (f6+f7xm)xm : xm /
145	movl %ecx, %eax
146	faddl MO(f5) / f5+(f6+f7xm)xm : xm /
147	sarl $`31`, %eax
148	fmul %st(`1`) / (f5+(f6+f7xm)xm)xm : xm /*
149	subl %eax, %edx
150	faddl MO(f4) / f4+(f5+(f6+f7xm)xm)xm : xm /*
151	fmul %st(`1`) / (f4+(f5+(f6+f7xm)xm)xm)xm : xm /
152	faddl MO(f3) / f3+(f4+(f5+(f6+f7xm)xm)xm)xm : xm /
153	fmul %st(`1`) / (f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm : xm /*
154	faddl MO(f2) / f2+(f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm : xm /*
155	fmul %st(`1`) / (f2+(f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm)xm : xm /
156	faddl MO(f1) / u:=f1+(f2+(f3+(f4+(f5+(f6+f7xm)xm)xm)xm)xm)xm : xm /
157
158	fld %st / u : u : xm /
159	fmul %st(`1`) / uu : u : xm /*
160	fld %st(`2`) / xm : uu : u : xm /*
161	fadd %st / 2xm : uu : u : xm /
162	fxch %st(`1`) / uu : 2xm : u : xm /
163	fmul %st(`2`) / t2:=uuu : 2xm : u : xm /*
164	movl %edx, %eax
165	fadd %st, %st(`1`) / t2 : t2+2xm : u : xm /*
166	leal (%edx,%edx,`2`),%edx
167	fadd %st(`0`) / 2t2 : t2+2xm : u : xm /
168	subl %edx, %ecx
169	faddp %st, %st(`3`) / t2+2xm : u : 2t2+xm /
170	shll $`3`, %ecx
171	fmulp / u(t2+2xm) : 2t2+xm /*
172	fdivp %st, %st(`1`) / u(t2+2xm)/(2t2+xm) /*
173	fmull MOX(`16`+factor,%ecx) / u(t2+2xm)/(2t2+xm)FACT /
174	pushl %eax
175	cfi_adjust_cfa_offset (`4`)
176	fildl (%esp) / xe/3 : u(t2+2xm)/(2t2+xm)FACT /
177	fxch / u(t2+2xm)/(2t2+xm)FACT : xe/3 /
178	fscale / u(t2+2xm)/(2t2+xm)FACT2^xe/3 /*
179	popl %edx
180	cfi_adjust_cfa_offset (-`4`)
181	#ifdef PIC
182	movl `12`(%esp), %eax
183	popl %ebx
184	cfi_adjust_cfa_offset (-`4`)
185	cfi_restore (ebx)
186	#else
187	movl `8`(%esp), %eax
188	#endif
189	testl %eax, %eax
190	fstp %st(`1`)
191	jns `4f`
192	fchs
193	`4`: ret
194
195	/ Return the argument. /
196	`1`: fldl `4`(%esp)
197	ret
198	END(__cbrt)
199	libm_alias_double (__cbrt, cbrt)
200

source code of glibc/sysdeps/i386/fpu/s_cbrt.S