1///////////////////////////////////////////////////////////////////////////
2//
3// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
4// Digital Ltd. LLC
5//
6// All rights reserved.
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11// * Redistributions of source code must retain the above copyright
12// notice, this list of conditions and the following disclaimer.
13// * Redistributions in binary form must reproduce the above
14// copyright notice, this list of conditions and the following disclaimer
15// in the documentation and/or other materials provided with the
16// distribution.
17// * Neither the name of Industrial Light & Magic nor the names of
18// its contributors may be used to endorse or promote products derived
19// from this software without specific prior written permission.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//
33///////////////////////////////////////////////////////////////////////////
34
35// Primary authors:
36// Florian Kainz <kainz@ilm.com>
37// Rod Bogart <rgb@ilm.com>
38
39//---------------------------------------------------------------------------
40//
41// half -- a 16-bit floating point number class:
42//
43// Type half can represent positive and negative numbers whose
44// magnitude is between roughly 6.1e-5 and 6.5e+4 with a relative
45// error of 9.8e-4; numbers smaller than 6.1e-5 can be represented
46// with an absolute error of 6.0e-8. All integers from -2048 to
47// +2048 can be represented exactly.
48//
49// Type half behaves (almost) like the built-in C++ floating point
50// types. In arithmetic expressions, half, float and double can be
51// mixed freely. Here are a few examples:
52//
53// half a (3.5);
54// float b (a + sqrt (a));
55// a += b;
56// b += a;
57// b = a + 7;
58//
59// Conversions from half to float are lossless; all half numbers
60// are exactly representable as floats.
61//
62// Conversions from float to half may not preserve a float's value
63// exactly. If a float is not representable as a half, then the
64// float value is rounded to the nearest representable half. If a
65// float value is exactly in the middle between the two closest
66// representable half values, then the float value is rounded to
67// the closest half whose least significant bit is zero.
68//
69// Overflows during float-to-half conversions cause arithmetic
70// exceptions. An overflow occurs when the float value to be
71// converted is too large to be represented as a half, or if the
72// float value is an infinity or a NAN.
73//
74// The implementation of type half makes the following assumptions
75// about the implementation of the built-in C++ types:
76//
77// float is an IEEE 754 single-precision number
78// sizeof (float) == 4
79// sizeof (unsigned int) == sizeof (float)
80// alignof (unsigned int) == alignof (float)
81// sizeof (unsigned short) == 2
82//
83//---------------------------------------------------------------------------
84
85#ifndef _HALF_H_
86#define _HALF_H_
87
88#include "halfExport.h" // for definition of HALF_EXPORT
89#include <iostream>
90
91class half
92{
93 public:
94
95 //-------------
96 // Constructors
97 //-------------
98
99 half (); // no initialization
100 half (float f);
101
102
103 //--------------------
104 // Conversion to float
105 //--------------------
106
107 operator float () const;
108
109
110 //------------
111 // Unary minus
112 //------------
113
114 half operator - () const;
115
116
117 //-----------
118 // Assignment
119 //-----------
120
121 half & operator = (half h);
122 half & operator = (float f);
123
124 half & operator += (half h);
125 half & operator += (float f);
126
127 half & operator -= (half h);
128 half & operator -= (float f);
129
130 half & operator *= (half h);
131 half & operator *= (float f);
132
133 half & operator /= (half h);
134 half & operator /= (float f);
135
136
137 //---------------------------------------------------------
138 // Round to n-bit precision (n should be between 0 and 10).
139 // After rounding, the significand's 10-n least significant
140 // bits will be zero.
141 //---------------------------------------------------------
142
143 half round (unsigned int n) const;
144
145
146 //--------------------------------------------------------------------
147 // Classification:
148 //
149 // h.isFinite() returns true if h is a normalized number,
150 // a denormalized number or zero
151 //
152 // h.isNormalized() returns true if h is a normalized number
153 //
154 // h.isDenormalized() returns true if h is a denormalized number
155 //
156 // h.isZero() returns true if h is zero
157 //
158 // h.isNan() returns true if h is a NAN
159 //
160 // h.isInfinity() returns true if h is a positive
161 // or a negative infinity
162 //
163 // h.isNegative() returns true if the sign bit of h
164 // is set (negative)
165 //--------------------------------------------------------------------
166
167 bool isFinite () const;
168 bool isNormalized () const;
169 bool isDenormalized () const;
170 bool isZero () const;
171 bool isNan () const;
172 bool isInfinity () const;
173 bool isNegative () const;
174
175
176 //--------------------------------------------
177 // Special values
178 //
179 // posInf() returns +infinity
180 //
181 // negInf() returns -infinity
182 //
183 // qNan() returns a NAN with the bit
184 // pattern 0111111111111111
185 //
186 // sNan() returns a NAN with the bit
187 // pattern 0111110111111111
188 //--------------------------------------------
189
190 static half posInf ();
191 static half negInf ();
192 static half qNan ();
193 static half sNan ();
194
195
196 //--------------------------------------
197 // Access to the internal representation
198 //--------------------------------------
199
200 HALF_EXPORT unsigned short bits () const;
201 HALF_EXPORT void setBits (unsigned short bits);
202
203
204 public:
205
206 union uif
207 {
208 unsigned int i;
209 float f;
210 };
211
212 private:
213
214 HALF_EXPORT static short convert (int i);
215 HALF_EXPORT static float overflow ();
216
217 unsigned short _h;
218
219 HALF_EXPORT static const uif _toFloat[1 << 16];
220 HALF_EXPORT static const unsigned short _eLut[1 << 9];
221};
222
223
224
225//-----------
226// Stream I/O
227//-----------
228
229HALF_EXPORT std::ostream & operator << (std::ostream &os, half h);
230HALF_EXPORT std::istream & operator >> (std::istream &is, half &h);
231
232
233//----------
234// Debugging
235//----------
236
237HALF_EXPORT void printBits (std::ostream &os, half h);
238HALF_EXPORT void printBits (std::ostream &os, float f);
239HALF_EXPORT void printBits (char c[19], half h);
240HALF_EXPORT void printBits (char c[35], float f);
241
242
243//-------------------------------------------------------------------------
244// Limits
245//
246// Visual C++ will complain if HALF_MIN, HALF_NRM_MIN etc. are not float
247// constants, but at least one other compiler (gcc 2.96) produces incorrect
248// results if they are.
249//-------------------------------------------------------------------------
250
#if defined (_MSC_VER) && (defined (_WIN32) || defined (_WIN64))

    //
    // Visual C++ on Windows: single-precision (float) literals.
    //

    #define HALF_MIN        5.96046448e-08f // Smallest positive half

    #define HALF_NRM_MIN    6.10351562e-05f // Smallest positive
                                            // normalized half

    #define HALF_MAX        65504.0f        // Largest positive half

    #define HALF_EPSILON    0.00097656f     // Smallest positive e for
                                            // which half (1.0 + e)
                                            // != half (1.0)

#else

    //
    // Every other compiler: the same values as unsuffixed
    // (double) literals.
    //

    #define HALF_MIN        5.96046448e-08  // Smallest positive half

    #define HALF_NRM_MIN    6.10351562e-05  // Smallest positive
                                            // normalized half

    #define HALF_MAX        65504.0         // Largest positive half

    #define HALF_EPSILON    0.00097656      // Smallest positive e for
                                            // which half (1.0 + e)
                                            // != half (1.0)

#endif
272
273
//
// Number of binary digits in the mantissa
// (10 stored significand bits + the hidden leading 1).
//

#define HALF_MANT_DIG 11

//
// Number of base 10 digits that can be
// represented without change.
//

#define HALF_DIG 2

//
// Base of the exponent.
//

#define HALF_RADIX 2

//
// Minimum negative integer such that HALF_RADIX raised to the
// power of one less than that integer is a normalized half.
//

#define HALF_MIN_EXP -13

//
// Maximum positive integer such that HALF_RADIX raised to the
// power of one less than that integer is a normalized half.
//

#define HALF_MAX_EXP 16

//
// Minimum negative integer such that 10 raised
// to that power is a normalized half.
//

#define HALF_MIN_10_EXP -4

//
// Maximum positive integer such that 10 raised
// to that power is a normalized half.
//

#define HALF_MAX_10_EXP 4
299
300
301//---------------------------------------------------------------------------
302//
303// Implementation --
304//
305// Representation of a float:
306//
307// We assume that a float, f, is an IEEE 754 single-precision
308// floating point number, whose bits are arranged as follows:
309//
//      31 (msb)
//      |
//      | 30     23
//      | |      |
//      | |      | 22                    0 (lsb)
//      | |      | |                     |
//      X XXXXXXXX XXXXXXXXXXXXXXXXXXXXXXX
//
//      s e        m
//
// s is the sign-bit, e is the exponent and m is the significand.
321//
322// If e is between 1 and 254, f is a normalized number:
323//
//      f = (-1)^s * 2^(e-127) * 1.m
//
// If e is 0, and m is not zero, f is a denormalized number:
//
//      f = (-1)^s * 2^(-126) * 0.m
331//
332// If e and m are both zero, f is zero:
333//
334// f = 0.0
335//
336// If e is 255, f is an "infinity" or "not a number" (NAN),
337// depending on whether m is zero or not.
338//
339// Examples:
340//
341// 0 00000000 00000000000000000000000 = 0.0
342// 0 01111110 00000000000000000000000 = 0.5
343// 0 01111111 00000000000000000000000 = 1.0
344// 0 10000000 00000000000000000000000 = 2.0
345// 0 10000000 10000000000000000000000 = 3.0
346// 1 10000101 11110000010000000000000 = -124.0625
347// 0 11111111 00000000000000000000000 = +infinity
348// 1 11111111 00000000000000000000000 = -infinity
349// 0 11111111 10000000000000000000000 = NAN
350// 1 11111111 11111111111111111111111 = NAN
351//
352// Representation of a half:
353//
354// Here is the bit-layout for a half number, h:
355//
//      15 (msb)
//      |
//      | 14  10
//      | |   |
//      | |   | 9        0 (lsb)
//      | |   | |        |
//      X XXXXX XXXXXXXXXX
//
//      s e     m
//
// s is the sign-bit, e is the exponent and m is the significand.
367//
368// If e is between 1 and 30, h is a normalized number:
369//
//      h = (-1)^s * 2^(e-15) * 1.m
//
// If e is 0, and m is not zero, h is a denormalized number:
//
//      h = (-1)^s * 2^(-14) * 0.m
377//
378// If e and m are both zero, h is zero:
379//
380// h = 0.0
381//
382// If e is 31, h is an "infinity" or "not a number" (NAN),
383// depending on whether m is zero or not.
384//
385// Examples:
386//
387// 0 00000 0000000000 = 0.0
388// 0 01110 0000000000 = 0.5
389// 0 01111 0000000000 = 1.0
390// 0 10000 0000000000 = 2.0
391// 0 10000 1000000000 = 3.0
392// 1 10101 1111000001 = -124.0625
393// 0 11111 0000000000 = +infinity
394// 1 11111 0000000000 = -infinity
395// 0 11111 1000000000 = NAN
396// 1 11111 1111111111 = NAN
397//
398// Conversion:
399//
400// Converting from a float to a half requires some non-trivial bit
401// manipulations. In some cases, this makes conversion relatively
402// slow, but the most common case is accelerated via table lookups.
403//
404// Converting back from a half to a float is easier because we don't
405// have to do any rounding. In addition, there are only 65536
406// different half numbers; we can convert each of those numbers once
407// and store the results in a table. Later, all conversions can be
408// done using only simple table lookups.
409//
410//---------------------------------------------------------------------------
411
412
413//--------------------
414// Simple constructors
415//--------------------
416
417inline
418half::half ()
419{
420 // no initialization
421}
422
423
424//----------------------------
425// Half-from-float constructor
426//----------------------------
427
428inline
429half::half (float f)
430{
431 uif x;
432
433 x.f = f;
434
435 if (f == 0)
436 {
437 //
438 // Common special case - zero.
439 // Preserve the zero's sign bit.
440 //
441
442 _h = (x.i >> 16);
443 }
444 else
445 {
446 //
447 // We extract the combined sign and exponent, e, from our
448 // floating-point number, f. Then we convert e to the sign
449 // and exponent of the half number via a table lookup.
450 //
451 // For the most common case, where a normalized half is produced,
452 // the table lookup returns a non-zero value; in this case, all
453 // we have to do is round f's significand to 10 bits and combine
454 // the result with e.
455 //
456 // For all other cases (overflow, zeroes, denormalized numbers
457 // resulting from underflow, infinities and NANs), the table
458 // lookup returns zero, and we call a longer, non-inline function
459 // to do the float-to-half conversion.
460 //
461
462 register int e = (x.i >> 23) & 0x000001ff;
463
464 e = _eLut[e];
465
466 if (e)
467 {
468 //
469 // Simple case - round the significand, m, to 10
470 // bits and combine it with the sign and exponent.
471 //
472
473 register int m = x.i & 0x007fffff;
474 _h = e + ((m + 0x00000fff + ((m >> 13) & 1)) >> 13);
475 }
476 else
477 {
478 //
479 // Difficult case - call a function.
480 //
481
482 _h = convert (x.i);
483 }
484 }
485}
486
487
488//------------------------------------------
489// Half-to-float conversion via table lookup
490//------------------------------------------
491
492inline
493half::operator float () const
494{
495 return _toFloat[_h].f;
496}
497
498
499//-------------------------
500// Round to n-bit precision
501//-------------------------
502
503inline half
504half::round (unsigned int n) const
505{
506 //
507 // Parameter check.
508 //
509
510 if (n >= 10)
511 return *this;
512
513 //
514 // Disassemble h into the sign, s,
515 // and the combined exponent and significand, e.
516 //
517
518 unsigned short s = _h & 0x8000;
519 unsigned short e = _h & 0x7fff;
520
521 //
522 // Round the exponent and significand to the nearest value
523 // where ones occur only in the (10-n) most significant bits.
524 // Note that the exponent adjusts automatically if rounding
525 // up causes the significand to overflow.
526 //
527
528 e >>= 9 - n;
529 e += e & 1;
530 e <<= 9 - n;
531
532 //
533 // Check for exponent overflow.
534 //
535
536 if (e >= 0x7c00)
537 {
538 //
539 // Overflow occurred -- truncate instead of rounding.
540 //
541
542 e = _h;
543 e >>= 10 - n;
544 e <<= 10 - n;
545 }
546
547 //
548 // Put the original sign bit back.
549 //
550
551 half h;
552 h._h = s | e;
553
554 return h;
555}
556
557
558//-----------------------
559// Other inline functions
560//-----------------------
561
562inline half
563half::operator - () const
564{
565 half h;
566 h._h = _h ^ 0x8000;
567 return h;
568}
569
570
571inline half &
572half::operator = (half h)
573{
574 _h = h._h;
575 return *this;
576}
577
578
579inline half &
580half::operator = (float f)
581{
582 *this = half (f);
583 return *this;
584}
585
586
587inline half &
588half::operator += (half h)
589{
590 *this = half (float (*this) + float (h));
591 return *this;
592}
593
594
595inline half &
596half::operator += (float f)
597{
598 *this = half (float (*this) + f);
599 return *this;
600}
601
602
603inline half &
604half::operator -= (half h)
605{
606 *this = half (float (*this) - float (h));
607 return *this;
608}
609
610
611inline half &
612half::operator -= (float f)
613{
614 *this = half (float (*this) - f);
615 return *this;
616}
617
618
619inline half &
620half::operator *= (half h)
621{
622 *this = half (float (*this) * float (h));
623 return *this;
624}
625
626
627inline half &
628half::operator *= (float f)
629{
630 *this = half (float (*this) * f);
631 return *this;
632}
633
634
635inline half &
636half::operator /= (half h)
637{
638 *this = half (float (*this) / float (h));
639 return *this;
640}
641
642
643inline half &
644half::operator /= (float f)
645{
646 *this = half (float (*this) / f);
647 return *this;
648}
649
650
651inline bool
652half::isFinite () const
653{
654 unsigned short e = (_h >> 10) & 0x001f;
655 return e < 31;
656}
657
658
659inline bool
660half::isNormalized () const
661{
662 unsigned short e = (_h >> 10) & 0x001f;
663 return e > 0 && e < 31;
664}
665
666
667inline bool
668half::isDenormalized () const
669{
670 unsigned short e = (_h >> 10) & 0x001f;
671 unsigned short m = _h & 0x3ff;
672 return e == 0 && m != 0;
673}
674
675
676inline bool
677half::isZero () const
678{
679 return (_h & 0x7fff) == 0;
680}
681
682
683inline bool
684half::isNan () const
685{
686 unsigned short e = (_h >> 10) & 0x001f;
687 unsigned short m = _h & 0x3ff;
688 return e == 31 && m != 0;
689}
690
691
692inline bool
693half::isInfinity () const
694{
695 unsigned short e = (_h >> 10) & 0x001f;
696 unsigned short m = _h & 0x3ff;
697 return e == 31 && m == 0;
698}
699
700
701inline bool
702half::isNegative () const
703{
704 return (_h & 0x8000) != 0;
705}
706
707
708inline half
709half::posInf ()
710{
711 half h;
712 h._h = 0x7c00;
713 return h;
714}
715
716
717inline half
718half::negInf ()
719{
720 half h;
721 h._h = 0xfc00;
722 return h;
723}
724
725
726inline half
727half::qNan ()
728{
729 half h;
730 h._h = 0x7fff;
731 return h;
732}
733
734
735inline half
736half::sNan ()
737{
738 half h;
739 h._h = 0x7dff;
740 return h;
741}
742
743
744inline unsigned short
745half::bits () const
746{
747 return _h;
748}
749
750
751inline void
752half::setBits (unsigned short bits)
753{
754 _h = bits;
755}
756
757#endif
758