From 2c4cf2f6f2ccfb43869b9e6b881f29699bc29bfd Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Mon, 23 Mar 2015 17:31:25 -0700 Subject: [PATCH] math/big: fix known bug in Float.Float64 - handle exponent over- and underflow - handle denormalized numbers - added test cases Change-Id: I1bbb9904b0c104f54696944e1f57559881f6eeeb Reviewed-on: https://go-review.googlesource.com/7982 Reviewed-by: Alan Donovan --- src/math/big/float.go | 71 +++++++++++++++++++++++++++++++------- src/math/big/float_test.go | 66 +++++++++++++++++++++++++++++++---- 2 files changed, 118 insertions(+), 19 deletions(-) diff --git a/src/math/big/float.go b/src/math/big/float.go index a86471e2a51..fa3751d0c7e 100644 --- a/src/math/big/float.go +++ b/src/math/big/float.go @@ -872,9 +872,14 @@ func (x *Float) Int64() (int64, Accuracy) { panic("unreachable") } -// Float64 returns the closest float64 value of x -// by rounding to nearest with 53 bits precision. -// BUG(gri) Float.Float64 doesn't handle exponent overflow. +// Float64 returns the float64 value nearest to x by rounding ToNearestEven +// with 53 bits of precision. +// If x is too small to be represented by a float64 +// (|x| < math.SmallestNonzeroFloat64), the result is (0, Below) or +// (-0, Above), respectively, depending on the sign of x. +// If x is too large to be represented by a float64 (|x| > math.MaxFloat64), +// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. +// The result is (NaN, Undef) for NaNs. func (x *Float) Float64() (float64, Accuracy) { if debugFloat { x.validate() @@ -886,27 +891,67 @@ func (x *Float) Float64() (float64, Accuracy) { var r Float r.prec = 53 r.Set(x) - var s uint64 + + // Rounding via Set may have caused r to overflow + // to ±Inf (rounding never causes underflows to 0). + if r.form == inf { + r.exp = 10000 // cause overflow below + } + + // see also implementation of math.Ldexp + + e := int64(r.exp) + 1022 + if e <= -52 { + // underflow + if x.neg { + z := 0.0 + return -z, Above + } + return 0.0, Below + } + // e > -52 + + if e >= 2047 { + // overflow + if x.neg { + return math.Inf(-1), Below + } + return math.Inf(+1), Above + } + // -52 < e < 2047 + + denormal := false + if e < 0 { + denormal = true + e += 52 + } + // 0 < e < 2047 + + s := uint64(0) if r.neg { s = 1 << 63 } - e := uint64(1022+r.exp) & 0x7ff // TODO(gri) check for overflow - m := high64(r.mant) >> 11 & (1<<52 - 1) - return math.Float64frombits(s | e<<52 | m), r.acc + m := high64(r.mant) >> 11 & (1<<52 - 1) // cut off msb (implicit 1 bit) + z := math.Float64frombits(s | uint64(e)<<52 | m) + if denormal { + // adjust for denormal + // TODO(gri) does this change accuracy? + z /= 1 << 52 + } + return z, r.acc case zero: - z := 0.0 if x.neg { - z = -z + z := 0.0 + return -z, Exact } - return z, Exact + return 0.0, Exact case inf: - sign := +1 if x.neg { - sign = -1 + return math.Inf(-1), Exact } - return math.Inf(sign), Exact + return math.Inf(+1), Exact case nan: return math.NaN(), Undef diff --git a/src/math/big/float_test.go b/src/math/big/float_test.go index 379352c886d..7bfac5d66b0 100644 --- a/src/math/big/float_test.go +++ b/src/math/big/float_test.go @@ -627,6 +627,10 @@ func TestFloatSetFloat64(t *testing.T) { 3.14159265e10, 2.718281828e-123, 1.0 / 3, + math.MaxFloat32, + math.MaxFloat64, + math.SmallestNonzeroFloat32, + math.SmallestNonzeroFloat64, math.Inf(-1), math.Inf(0), -math.Inf(1), @@ -637,8 +641,8 @@ func TestFloatSetFloat64(t *testing.T) { } var f Float f.SetFloat64(want) - if got, _ := f.Float64(); got != want { - t.Errorf("got %g (%s); want %g", got, f.Format('p', 0), want) + if got, acc := f.Float64(); got != want || acc != Exact { + t.Errorf("got %g (%s, %s); want %g (Exact)", got, f.Format('p', 0), acc, want) } } } @@ -833,6 +837,56 @@ func TestFloatInt64(t *testing.T) { } } +func TestFloatFloat64(t *testing.T) { + for _, test := range []struct { + x string + out float64 + acc Accuracy + }{ + {"-Inf", math.Inf(-1), Exact}, + {"-0x1.fffffffffffff8p2147483646", -math.Inf(+1), Below}, // overflow in rounding + {"-1e10000", math.Inf(-1), Below}, // overflow + {"-0x1p1024", math.Inf(-1), Below}, // overflow + {"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below}, // overflow + {"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above}, + {"-0x1.fffffffffffffp1023", -math.MaxFloat64, Exact}, + {"-12345.000000000000000000001", -12345, Above}, + {"-12345.0", -12345, Exact}, + {"-1.000000000000000000001", -1, Above}, + {"-1", -1, Exact}, + {"-0x0.0000000000001p-1022", -math.SmallestNonzeroFloat64, Exact}, + {"-0x0.0000000000001p-1023", -0, Above}, // underflow + {"-1e-1000", -0, Above}, // underflow + {"0", 0, Exact}, + {"1e-1000", 0, Below}, // underflow + {"0x0.0000000000001p-1023", 0, Below}, // underflow + {"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, + {"1", 1, Exact}, + {"1.000000000000000000001", 1, Below}, + {"12345.0", 12345, Exact}, + {"12345.000000000000000000001", 12345, Below}, + {"0x1.fffffffffffffp1023", math.MaxFloat64, Exact}, + {"0x1.fffffffffffff4p1023", math.MaxFloat64, Below}, + {"0x1.fffffffffffff8p1023", math.Inf(+1), Above}, // overflow + {"0x1p1024", math.Inf(+1), Above}, // overflow + {"1e10000", math.Inf(+1), Above}, // overflow + {"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow in rounding + {"+Inf", math.Inf(+1), Exact}, + } { + x := makeFloat(test.x) + out, acc := x.Float64() + if out != test.out || acc != test.acc { + t.Errorf("%s: got %g (%s); want %g (%s)", test.x, out, acc, test.out, test.acc) + } + } + + // test NaN + x := makeFloat("NaN") + if out, acc := x.Float64(); out == out || acc != Undef { + t.Errorf("NaN: got %g (%s); want NaN (Undef)", out, acc) + } +} + func TestFloatInt(t *testing.T) { for _, test := range []struct { x string @@ -1073,14 +1127,14 @@ func TestFloatAdd32(t *testing.T) { got, acc := z.Float64() want := float64(float32(y0) + float32(x0)) if got != want || acc != Exact { - t.Errorf("d = %d: %g + %g = %g (%s); want %g exactly", d, x0, y0, got, acc, want) + t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want) } z.Sub(z, y) got, acc = z.Float64() want = float64(float32(want) - float32(y0)) if got != want || acc != Exact { - t.Errorf("d = %d: %g - %g = %g (%s); want %g exactly", d, x0+y0, y0, got, acc, want) + t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want) } } } @@ -1106,14 +1160,14 @@ func TestFloatAdd64(t *testing.T) { got, acc := z.Float64() want := x0 + y0 if got != want || acc != Exact { - t.Errorf("d = %d: %g + %g = %g (%s); want %g exactly", d, x0, y0, got, acc, want) + t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want) } z.Sub(z, y) got, acc = z.Float64() want -= y0 if got != want || acc != Exact { - t.Errorf("d = %d: %g - %g = %g (%s); want %g exactly", d, x0+y0, y0, got, acc, want) + t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want) } } }