1 | /* Compute cubic root of float value. |
2 | Copyright (C) 1997-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <machine/asm.h> |
20 | #include <libm-alias-float.h> |
21 | |
22 | .section .rodata |
23 | |
24 | .align ALIGNARG(4) |
25 | .type f3,@object |
26 | f3: .double 0.191502161678719066 |
27 | ASM_SIZE_DIRECTIVE(f3) |
28 | .type f2,@object |
29 | f2: .double 0.697570460207922770 |
30 | ASM_SIZE_DIRECTIVE(f2) |
31 | .type f1,@object |
32 | f1: .double 0.492659620528969547 |
33 | ASM_SIZE_DIRECTIVE(f1) |
34 | |
35 | #define CBRT2 1.2599210498948731648 |
36 | #define ONE_CBRT2 0.793700525984099737355196796584 |
37 | #define SQR_CBRT2 1.5874010519681994748 |
38 | #define ONE_SQR_CBRT2 0.629960524947436582364439673883 |
39 | |
40 | .type factor,@object |
41 | .align ALIGNARG(4) |
42 | factor: .double ONE_SQR_CBRT2 |
43 | .double ONE_CBRT2 |
44 | .double 1.0 |
45 | .double CBRT2 |
46 | .double SQR_CBRT2 |
47 | ASM_SIZE_DIRECTIVE(factor) |
48 | |
49 | .type two25,@object |
50 | two25: .byte 0, 0, 0, 0x4c |
51 | ASM_SIZE_DIRECTIVE(two25) |
52 | |
53 | #ifdef PIC |
54 | #define MO(op) op##@GOTOFF(%ebx) |
55 | #define MOX(op,x) op##@GOTOFF(%ebx,x,1) |
56 | #else |
57 | #define MO(op) op |
58 | #define MOX(op,x) op(x) |
59 | #endif |
60 | |
61 | .text |
62 | ENTRY(__cbrtf) |
63 | movl 4(%esp), %eax |
64 | xorl %ecx, %ecx |
65 | movl %eax, %edx |
66 | andl $0x7fffffff, %eax |
67 | jz 1f |
68 | cmpl $0x7f800000, %eax |
69 | jae 1f |
70 | |
71 | #ifdef PIC |
72 | pushl %ebx |
73 | cfi_adjust_cfa_offset (4) |
74 | cfi_rel_offset (ebx, 0) |
75 | LOAD_PIC_REG (bx) |
76 | #endif |
77 | |
78 | cmpl $0x00800000, %eax |
79 | jae 2f |
80 | |
81 | #ifdef PIC |
82 | flds 8(%esp) |
83 | #else |
84 | flds 4(%esp) |
85 | #endif |
86 | fmuls MO(two25) |
87 | movl $-25, %ecx |
88 | #ifdef PIC |
89 | fstps 8(%esp) |
90 | movl 8(%esp), %eax |
91 | #else |
92 | fstps 4(%esp) |
93 | movl 4(%esp), %eax |
94 | #endif |
95 | movl %eax, %edx |
96 | andl $0x7fffffff, %eax |
97 | |
98 | 2: shrl $23, %eax |
99 | andl $0x807fffff, %edx |
100 | subl $126, %eax |
101 | orl $0x3f000000, %edx |
102 | addl %eax, %ecx |
103 | #ifdef PIC |
104 | movl %edx, 8(%esp) |
105 | |
106 | flds 8(%esp) /* xm */ |
107 | #else |
108 | movl %edx, 4(%esp) |
109 | |
110 | flds 4(%esp) /* xm */ |
111 | #endif |
112 | fabs |
113 | |
114 | /* The following code has two tracks: |
115 | a) compute the normalized cbrt value |
116 | b) compute xe/3 and xe%3 |
117 | The right track computes the value for b) and this is done |
118 | in an optimized way by avoiding division. |
119 | |
120 | But why two tracks at all? Very easy: efficiency. Some FP |
121 | instruction can overlap with a certain amount of integer (and |
122 | FP) instructions. So we get (except for the imull) all |
123 | instructions for free. */ |
124 | |
125 | fld %st(0) /* xm : xm */ |
126 | fmull MO(f3) /* f3*xm : xm */ |
127 | movl $1431655766, %eax |
128 | fsubrl MO(f2) /* f2-f3*xm : xm */ |
129 | imull %ecx |
130 | fmul %st(1) /* (f2-f3*xm)*xm : xm */ |
131 | movl %ecx, %eax |
132 | faddl MO(f1) /* u:=f1+(f2-f3*xm)*xm : xm */ |
133 | sarl $31, %eax |
134 | fld %st /* u : u : xm */ |
135 | subl %eax, %edx |
136 | fmul %st(1) /* u*u : u : xm */ |
137 | fld %st(2) /* xm : u*u : u : xm */ |
138 | fadd %st /* 2*xm : u*u : u : xm */ |
139 | fxch %st(1) /* u*u : 2*xm : u : xm */ |
140 | fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */ |
141 | movl %edx, %eax |
142 | fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */ |
143 | leal (%edx,%edx,2),%edx |
144 | fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */ |
145 | subl %edx, %ecx |
146 | faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */ |
147 | shll $3, %ecx |
148 | fmulp /* u*(t2+2*xm) : 2*t2+xm */ |
149 | fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */ |
150 | fmull MOX(16+factor,%ecx) /* u*(t2+2*xm)/(2*t2+xm)*FACT */ |
151 | pushl %eax |
152 | cfi_adjust_cfa_offset (4) |
153 | fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */ |
154 | fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */ |
155 | fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */ |
156 | popl %edx |
157 | cfi_adjust_cfa_offset (-4) |
158 | #ifdef PIC |
159 | movl 8(%esp), %eax |
160 | popl %ebx |
161 | cfi_adjust_cfa_offset (-4) |
162 | cfi_restore (ebx) |
163 | #else |
164 | movl 4(%esp), %eax |
165 | #endif |
166 | testl %eax, %eax |
167 | fstp %st(1) |
168 | jns 4f |
169 | fchs |
170 | 4: ret |
171 | |
172 | /* Return the argument. */ |
173 | 1: flds 4(%esp) |
174 | ret |
175 | END(__cbrtf) |
176 | libm_alias_float (__cbrt, cbrt) |
177 | |