s_cbrtl.S source code [glibc/sysdeps/i386/fpu/s_cbrtl.S]

1	/ Compute cubic root of long double value.*
2	Copyright (C) 1997-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<https://www.gnu.org/licenses/>. /*
18
19	#include <libm-alias-ldouble.h>
20	#include <machine/asm.h>
21
22	.section .rodata
23
24	.align ALIGNARG(`4`)
25	.type f8,@object
26	f8: .tfloat `0.161617097923756032`
27	ASM_SIZE_DIRECTIVE(f8)
28	.align ALIGNARG(`4`)
29	.type f7,@object
30	f7: .tfloat -`0.988553671195413709`
31	ASM_SIZE_DIRECTIVE(f7)
32	.align ALIGNARG(`4`)
33	.type f6,@object
34	f6: .tfloat `2.65298938441952296`
35	ASM_SIZE_DIRECTIVE(f6)
36	.align ALIGNARG(`4`)
37	.type f5,@object
38	f5: .tfloat -`4.11151425200350531`
39	ASM_SIZE_DIRECTIVE(f5)
40	.align ALIGNARG(`4`)
41	.type f4,@object
42	f4: .tfloat `4.09559907378707839`
43	ASM_SIZE_DIRECTIVE(f4)
44	.align ALIGNARG(`4`)
45	.type f3,@object
46	f3: .tfloat -`2.82414939754975962`
47	ASM_SIZE_DIRECTIVE(f3)
48	.align ALIGNARG(`4`)
49	.type f2,@object
50	f2: .tfloat `1.67595307700780102`
51	ASM_SIZE_DIRECTIVE(f2)
52	.align ALIGNARG(`4`)
53	.type f1,@object
54	f1: .tfloat `0.338058687610520237`
55	ASM_SIZE_DIRECTIVE(f1)
56
57	#define CBRT2 1.2599210498948731648
58	#define ONE_CBRT2 0.793700525984099737355196796584
59	#define SQR_CBRT2 1.5874010519681994748
60	#define ONE_SQR_CBRT2 0.629960524947436582364439673883
61
62	/ We make the entries in the following table all 16 bytes*
63	wide to avoid having to implement a multiplication by 10. /*
64	.type factor,@object
65	.align ALIGNARG(`4`)
66	factor: .tfloat ONE_SQR_CBRT2
67	.byte `0`, `0`, `0`, `0`, `0`, `0`
68	.tfloat ONE_CBRT2
69	.byte `0`, `0`, `0`, `0`, `0`, `0`
70	.tfloat `1.0`
71	.byte `0`, `0`, `0`, `0`, `0`, `0`
72	.tfloat CBRT2
73	.byte `0`, `0`, `0`, `0`, `0`, `0`
74	.tfloat SQR_CBRT2
75	ASM_SIZE_DIRECTIVE(factor)
76
77	.type two64,@object
78	.align ALIGNARG(`4`)
79	two64: .byte `0`, `0`, `0`, `0`, `0`, `0`, `0xf0`, `0x43`
80	ASM_SIZE_DIRECTIVE(two64)
81
82	#ifdef PIC
83	#define MO(op) op##@GOTOFF(%ebx)
84	#define MOX(op,x) op##@GOTOFF(%ebx,x,1)
85	#else
86	#define MO(op) op
87	#define MOX(op,x) op(x)
88	#endif
89
90	.text
91	ENTRY(__cbrtl)
92	movl `4`(%esp), %ecx
93	movl `12`(%esp), %eax
94	orl `8`(%esp), %ecx
95	movl %eax, %edx
96	andl $`0x7fff`, %eax
97	orl %eax, %ecx
98	jz `1f`
99	xorl %ecx, %ecx
100	cmpl $`0x7fff`, %eax
101	je `1f`
102
103	#ifdef PIC
104	pushl %ebx
105	cfi_adjust_cfa_offset (`4`)
106	cfi_rel_offset (ebx, `0`)
107	LOAD_PIC_REG (bx)
108	#endif
109
110	cmpl $`0`, %eax
111	jne `2f`
112
113	#ifdef PIC
114	fldt `8`(%esp)
115	#else
116	fldt `4`(%esp)
117	#endif
118	fmull MO(two64)
119	movl $-`64`, %ecx
120	#ifdef PIC
121	fstpt `8`(%esp)
122	movl `16`(%esp), %eax
123	#else
124	fstpt `4`(%esp)
125	movl `12`(%esp), %eax
126	#endif
127	movl %eax, %edx
128	andl $`0x7fff`, %eax
129
130	`2`: andl $`0x8000`, %edx
131	subl $`16382`, %eax
132	orl $`0x3ffe`, %edx
133	addl %eax, %ecx
134	#ifdef PIC
135	movl %edx, `16`(%esp)
136
137	fldt `8`(%esp) / xm /
138	#else
139	movl %edx, `12`(%esp)
140
141	fldt `4`(%esp) / xm /
142	#endif
143	fabs
144
145	/ The following code has two tracks:*
146	a) compute the normalized cbrt value
147	b) compute xe/3 and xe%3
148	The right track computes the value for b) and this is done
149	in an optimized way by avoiding division.
150
151	But why two tracks at all? Very easy: efficiency. Some FP
152	instruction can overlap with a certain amount of integer (and
153	FP) instructions. So we get (except for the imull) all
154	instructions for free. /*
155
156	fldt MO(f8) / f8 : xm /
157	fmul %st(`1`) / f8xm : xm /*
158
159	fldt MO(f7)
160	faddp / f7+f8xm : xm /*
161	fmul %st(`1`) / (f7+f8xm)xm : xm /
162	movl $`1431655766`, %eax
163	fldt MO(f6)
164	faddp / f6+(f7+f8xm)xm : xm /
165	imull %ecx
166	fmul %st(`1`) / (f6+(f7+f8xm)xm)xm : xm /*
167	movl %ecx, %eax
168	fldt MO(f5)
169	faddp / f5+(f6+(f7+f8xm)xm)xm : xm /*
170	sarl $`31`, %eax
171	fmul %st(`1`) / (f5+(f6+(f7+f8xm)xm)xm)xm : xm /
172	subl %eax, %edx
173	fldt MO(f4)
174	faddp / f4+(f5+(f6+(f7+f8xm)xm)xm)xm : xm /
175	fmul %st(`1`) / (f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm : xm /*
176	fldt MO(f3)
177	faddp / f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm : xm /*
178	fmul %st(`1`) / (f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm : xm /
179	fldt MO(f2)
180	faddp / f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm : xm /
181	fmul %st(`1`) / (f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm)xm : xm /*
182	fldt MO(f1)
183	faddp / u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8xm)xm)xm)xm)xm)xm)xm : xm /*
184
185	fld %st / u : u : xm /
186	fmul %st(`1`) / uu : u : xm /*
187	fld %st(`2`) / xm : uu : u : xm /*
188	fadd %st / 2xm : uu : u : xm /
189	fxch %st(`1`) / uu : 2xm : u : xm /
190	fmul %st(`2`) / t2:=uuu : 2xm : u : xm /*
191	movl %edx, %eax
192	fadd %st, %st(`1`) / t2 : t2+2xm : u : xm /*
193	leal (%edx,%edx,`2`),%edx
194	fadd %st(`0`) / 2t2 : t2+2xm : u : xm /
195	subl %edx, %ecx
196	faddp %st, %st(`3`) / t2+2xm : u : 2t2+xm /
197	shll $`4`, %ecx
198	fmulp / u(t2+2xm) : 2t2+xm /*
199	fdivp %st, %st(`1`) / u(t2+2xm)/(2t2+xm) /*
200	fldt MOX(`32`+factor,%ecx)
201	fmulp / u(t2+2xm)/(2t2+xm)FACT /
202	pushl %eax
203	cfi_adjust_cfa_offset (`4`)
204	fildl (%esp) / xe/3 : u(t2+2xm)/(2t2+xm)FACT /
205	fxch / u(t2+2xm)/(2t2+xm)FACT : xe/3 /
206	fscale / u(t2+2xm)/(2t2+xm)FACT2^xe/3 /*
207	popl %edx
208	cfi_adjust_cfa_offset (-`4`)
209	#ifdef PIC
210	movl `16`(%esp), %eax
211	popl %ebx
212	cfi_adjust_cfa_offset (-`4`)
213	cfi_restore (ebx)
214	#else
215	movl `12`(%esp), %eax
216	#endif
217	testl $`0x8000`, %eax
218	fstp %st(`1`)
219	jz `4f`
220	fchs
221	`4`: ret
222
223	/ Return the argument. /
224	`1`: fldt `4`(%esp)
225	fadd %st
226	ret
227	END(__cbrtl)
228	libm_alias_ldouble (__cbrt, cbrt)
229

source code of glibc/sysdeps/i386/fpu/s_cbrtl.S