1 | /* Single-precision floating point square root. |
2 | Copyright (C) 1997-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <math.h> |
20 | #include <math_private.h> |
21 | #include <fenv_libc.h> |
22 | #include <libm-alias-finite.h> |
23 | #include <math-use-builtins.h> |
24 | |
25 | float |
26 | __ieee754_sqrtf (float x) |
27 | { |
28 | #if USE_SQRTF_BUILTIN |
29 | return __builtin_sqrtf (x); |
30 | #else |
31 | /* The method is based on a description in |
32 | Computation of elementary functions on the IBM RISC System/6000 processor, |
33 | P. W. Markstein, IBM J. Res. Develop, 34(1) 1990. |
34 | Basically, it consists of two interleaved Newton-Raphson approximations, |
35 | one to find the actual square root, and one to find its reciprocal |
36 | without the expense of a division operation. The tricky bit here |
37 | is the use of the POWER/PowerPC multiply-add operation to get the |
38 | required accuracy with high speed. |
39 | |
40 | The argument reduction works by a combination of table lookup to |
41 | obtain the initial guesses, and some careful modification of the |
42 | generated guesses (which mostly runs on the integer unit, while the |
43 | Newton-Raphson is running on the FPU). */ |
44 | |
45 | extern const float __t_sqrt[1024]; |
46 | |
47 | if (x > 0) |
48 | { |
49 | if (x != INFINITY) |
50 | { |
51 | /* Variables named starting with 's' exist in the |
52 | argument-reduced space, so that 2 > sx >= 0.5, |
53 | 1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... . |
54 | Variables named ending with 'i' are integer versions of |
55 | floating-point values. */ |
56 | float sx; /* The value of which we're trying to find the square |
57 | root. */ |
58 | float sg, g; /* Guess of the square root of x. */ |
59 | float sd, d; /* Difference between the square of the guess and x. */ |
60 | float sy; /* Estimate of 1/2g (overestimated by 1ulp). */ |
61 | float sy2; /* 2*sy */ |
62 | float e; /* Difference between y*g and 1/2 (note that e==se). */ |
63 | float shx; /* == sx * fsg */ |
64 | float fsg; /* sg*fsg == g. */ |
65 | fenv_t fe; /* Saved floating-point environment (stores rounding |
66 | mode and whether the inexact exception is |
67 | enabled). */ |
68 | uint32_t xi, sxi, fsgi; |
69 | const float *t_sqrt; |
70 | |
71 | GET_FLOAT_WORD (xi, x); |
72 | fe = fegetenv_register (); |
73 | relax_fenv_state (); |
74 | sxi = (xi & 0x3fffffff) | 0x3f000000; |
75 | SET_FLOAT_WORD (sx, sxi); |
76 | t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe); |
77 | sg = t_sqrt[0]; |
78 | sy = t_sqrt[1]; |
79 | |
80 | /* Here we have three Newton-Raphson iterations each of a |
81 | division and a square root and the remainder of the |
82 | argument reduction, all interleaved. */ |
83 | sd = -__builtin_fmaf (sg, sg, -sx); |
84 | fsgi = (xi + 0x40000000) >> 1 & 0x7f800000; |
85 | sy2 = sy + sy; |
86 | sg = __builtin_fmaf (sy, sd, sg); /* 16-bit approximation to |
87 | sqrt(sx). */ |
88 | e = -__builtin_fmaf (sy, sg, -0x1.0000020365653p-1); |
89 | SET_FLOAT_WORD (fsg, fsgi); |
90 | sd = -__builtin_fmaf (sg, sg, -sx); |
91 | sy = __builtin_fmaf (e, sy2, sy); |
92 | if ((xi & 0x7f800000) == 0) |
93 | goto denorm; |
94 | shx = sx * fsg; |
95 | sg = __builtin_fmaf (sy, sd, sg); /* 32-bit approximation to |
96 | sqrt(sx), but perhaps |
97 | rounded incorrectly. */ |
98 | sy2 = sy + sy; |
99 | g = sg * fsg; |
100 | e = -__builtin_fmaf (sy, sg, -0x1.0000020365653p-1); |
101 | d = -__builtin_fmaf (g, sg, -shx); |
102 | sy = __builtin_fmaf (e, sy2, sy); |
103 | fesetenv_register (fe); |
104 | return __builtin_fmaf (sy, d, g); |
105 | denorm: |
106 | /* For denormalised numbers, we normalise, calculate the |
107 | square root, and return an adjusted result. */ |
108 | fesetenv_register (fe); |
109 | return __ieee754_sqrtf (x * 0x1p+48) * 0x1p-24; |
110 | } |
111 | } |
112 | else if (x < 0) |
113 | { |
114 | /* For some reason, some PowerPC32 processors don't implement |
115 | FE_INVALID_SQRT. */ |
116 | # ifdef FE_INVALID_SQRT |
117 | feraiseexcept (FE_INVALID_SQRT); |
118 | |
119 | fenv_union_t u = { .fenv = fegetenv_register () }; |
120 | if ((u.l & FE_INVALID) == 0) |
121 | # endif |
122 | feraiseexcept (FE_INVALID); |
123 | x = NAN; |
124 | } |
125 | return f_washf (x); |
126 | #endif /* USE_SQRTF_BUILTIN */ |
127 | } |
128 | libm_alias_finite (__ieee754_sqrtf, __sqrtf) |
129 | |