math/big: fix known bug in Float.Float64
- handle exponent over- and underflow
- handle denormalized numbers
- add test cases
Change-Id: I1bbb9904b0c104f54696944e1f57559881f6eeeb
Reviewed-on: https://go-review.googlesource.com/7982
Reviewed-by: Alan Donovan <adonovan@google.com>
diff --git a/src/math/big/float.go b/src/math/big/float.go
index a86471e..fa3751d 100644
--- a/src/math/big/float.go
+++ b/src/math/big/float.go
@@ -872,9 +872,14 @@
panic("unreachable")
}
-// Float64 returns the closest float64 value of x
-// by rounding to nearest with 53 bits precision.
-// BUG(gri) Float.Float64 doesn't handle exponent overflow.
+// Float64 returns the float64 value nearest to x by rounding ToNearestEven
+// with 53 bits of precision.
+// If x is too small to be represented by a float64
+// (|x| < math.SmallestNonzeroFloat64), the result is (0, Below) or
+// (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
+// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
+// The result is (NaN, Undef) for NaNs.
func (x *Float) Float64() (float64, Accuracy) {
if debugFloat {
x.validate()
@@ -886,27 +891,67 @@
var r Float
r.prec = 53
r.Set(x)
- var s uint64
+
+ // Rounding via Set may have caused r to overflow
+ // to ±Inf (rounding never causes underflows to 0).
+ if r.form == inf {
+ r.exp = 10000 // cause overflow below
+ }
+
+ // see also implementation of math.Ldexp
+
+ e := int64(r.exp) + 1022
+ if e <= -52 {
+ // underflow
+ if x.neg {
+ z := 0.0
+ return -z, Above
+ }
+ return 0.0, Below
+ }
+ // e > -52
+
+ if e >= 2047 {
+ // overflow
+ if x.neg {
+ return math.Inf(-1), Below
+ }
+ return math.Inf(+1), Above
+ }
+ // -52 < e < 2047
+
+ denormal := false
+ if e < 0 {
+ denormal = true
+ e += 52
+ }
+ // 0 <= e < 2047 (NOTE(review): e == 0 gets here without the denormal adjustment above; verify that case)
+
+ s := uint64(0)
if r.neg {
s = 1 << 63
}
- e := uint64(1022+r.exp) & 0x7ff // TODO(gri) check for overflow
- m := high64(r.mant) >> 11 & (1<<52 - 1)
- return math.Float64frombits(s | e<<52 | m), r.acc
+ m := high64(r.mant) >> 11 & (1<<52 - 1) // cut off msb (implicit 1 bit)
+ z := math.Float64frombits(s | uint64(e)<<52 | m)
+ if denormal {
+ // adjust for denormal
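+ // TODO(gri) does this change accuracy? (the division may round again since denormals hold fewer than 53 mantissa bits, so r.acc could be stale)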
+ z /= 1 << 52
+ }
+ return z, r.acc
case zero:
- z := 0.0
if x.neg {
- z = -z
+ z := 0.0
+ return -z, Exact
}
- return z, Exact
+ return 0.0, Exact
case inf:
- sign := +1
if x.neg {
- sign = -1
+ return math.Inf(-1), Exact
}
- return math.Inf(sign), Exact
+ return math.Inf(+1), Exact
case nan:
return math.NaN(), Undef
diff --git a/src/math/big/float_test.go b/src/math/big/float_test.go
index 379352c..7bfac5d 100644
--- a/src/math/big/float_test.go
+++ b/src/math/big/float_test.go
@@ -627,6 +627,10 @@
3.14159265e10,
2.718281828e-123,
1.0 / 3,
+ math.MaxFloat32,
+ math.MaxFloat64,
+ math.SmallestNonzeroFloat32,
+ math.SmallestNonzeroFloat64,
math.Inf(-1),
math.Inf(0),
-math.Inf(1),
@@ -637,8 +641,8 @@
}
var f Float
f.SetFloat64(want)
- if got, _ := f.Float64(); got != want {
- t.Errorf("got %g (%s); want %g", got, f.Format('p', 0), want)
+ if got, acc := f.Float64(); got != want || acc != Exact {
+ t.Errorf("got %g (%s, %s); want %g (Exact)", got, f.Format('p', 0), acc, want)
}
}
}
@@ -833,6 +837,56 @@
}
}
+func TestFloatFloat64(t *testing.T) {
+ for _, test := range []struct {
+ x string
+ out float64
+ acc Accuracy
+ }{
+ {"-Inf", math.Inf(-1), Exact},
+ {"-0x1.fffffffffffff8p2147483646", -math.Inf(+1), Below}, // overflow while rounding to 53 bits
+ {"-1e10000", math.Inf(-1), Below}, // overflow
+ {"-0x1p1024", math.Inf(-1), Below}, // overflow
+ {"-0x1.fffffffffffff8p1023", -math.Inf(+1), Below}, // overflow
+ {"-0x1.fffffffffffff4p1023", -math.MaxFloat64, Above},
+ {"-0x1.fffffffffffffp1023", -math.MaxFloat64, Exact},
+ {"-12345.000000000000000000001", -12345, Above},
+ {"-12345.0", -12345, Exact},
+ {"-1.000000000000000000001", -1, Above},
+ {"-1", -1, Exact},
+ {"-0x0.0000000000001p-1022", -math.SmallestNonzeroFloat64, Exact},
+ {"-0x0.0000000000001p-1023", -0, Above}, // underflow; -0 is the constant 0, so the sign of the zero result is not checked
+ {"-1e-1000", -0, Above}, // underflow; -0 is the constant 0, so the sign of the zero result is not checked
+ {"0", 0, Exact},
+ {"1e-1000", 0, Below}, // underflow
+ {"0x0.0000000000001p-1023", 0, Below}, // underflow
+ {"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact},
+ {"1", 1, Exact},
+ {"1.000000000000000000001", 1, Below},
+ {"12345.0", 12345, Exact},
+ {"12345.000000000000000000001", 12345, Below},
+ {"0x1.fffffffffffffp1023", math.MaxFloat64, Exact},
+ {"0x1.fffffffffffff4p1023", math.MaxFloat64, Below},
+ {"0x1.fffffffffffff8p1023", math.Inf(+1), Above}, // overflow
+ {"0x1p1024", math.Inf(+1), Above}, // overflow
+ {"1e10000", math.Inf(+1), Above}, // overflow
+ {"0x1.fffffffffffff8p2147483646", math.Inf(+1), Above}, // overflow while rounding to 53 bits
+ {"+Inf", math.Inf(+1), Exact},
+ } {
+ x := makeFloat(test.x)
+ out, acc := x.Float64()
+ if out != test.out || acc != test.acc {
+ t.Errorf("%s: got %g (%s); want %g (%s)", test.x, out, acc, test.out, test.acc)
+ }
+ }
+
+ // test NaN separately: NaN != NaN, so it cannot be matched via test.out in the table above
+ x := makeFloat("NaN")
+ if out, acc := x.Float64(); out == out || acc != Undef {
+ t.Errorf("NaN: got %g (%s); want NaN (Undef)", out, acc)
+ }
+}
+
func TestFloatInt(t *testing.T) {
for _, test := range []struct {
x string
@@ -1073,14 +1127,14 @@
got, acc := z.Float64()
want := float64(float32(y0) + float32(x0))
if got != want || acc != Exact {
- t.Errorf("d = %d: %g + %g = %g (%s); want %g exactly", d, x0, y0, got, acc, want)
+ t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want)
}
z.Sub(z, y)
got, acc = z.Float64()
want = float64(float32(want) - float32(y0))
if got != want || acc != Exact {
- t.Errorf("d = %d: %g - %g = %g (%s); want %g exactly", d, x0+y0, y0, got, acc, want)
+ t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want)
}
}
}
@@ -1106,14 +1160,14 @@
got, acc := z.Float64()
want := x0 + y0
if got != want || acc != Exact {
- t.Errorf("d = %d: %g + %g = %g (%s); want %g exactly", d, x0, y0, got, acc, want)
+ t.Errorf("d = %d: %g + %g = %g (%s); want %g (Exact)", d, x0, y0, got, acc, want)
}
z.Sub(z, y)
got, acc = z.Float64()
want -= y0
if got != want || acc != Exact {
- t.Errorf("d = %d: %g - %g = %g (%s); want %g exactly", d, x0+y0, y0, got, acc, want)
+ t.Errorf("d = %d: %g - %g = %g (%s); want %g (Exact)", d, x0+y0, y0, got, acc, want)
}
}
}