blob: 2602004d836f4e4d50d9f3efd5b58c986dc78dcf [file] [log] [blame]
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ggstat
import (
"math"
"reflect"
"sort"
"github.com/aclements/go-gg/generic"
"github.com/aclements/go-gg/generic/slice"
"github.com/aclements/go-gg/table"
"github.com/aclements/go-moremath/vec"
)
// XXX If this is just based on the number of bins, it can come up
// with really ugly boundary numbers. If the bin width is specified,
// then you could also specify the left edge and bins will be placed
// at [align+width*N, align+width*(N+1)]. ggplot2 also lets you
// specify the center alignment.
//
// XXX In Matlab and NumPy, bins are open on the right *except* for
// the last bin, which is closed on both.
//
// XXX Number of bins/bin width/specify boundaries, same bins across
// all groups/separate for each group/based on shared scales (don't
// have that information here), relative or absolute histogram (Matlab
// has lots more).
//
// XXX Scale transform.
//
// The result of Bin has two columns in addition to constant columns from the input:
//
// - Column X is the left edge of the bin.
//
// - Column W is the sum of the rows' weights, or column "count" is
// the number of rows in the bin.
type Bin struct {
// X is the name of the column to use for samples.
X string
// W is the optional name of the column to use for sample
// weights. It may be "" to weight each sample as 1.
W string
// Width controls how wide each bin should be. If not provided
// or 0, a width will be chosen to produce 30 bins. If X is an
// integer column, this width will be treated as an integer as
// well.
Width float64
// Center controls the center point of each bin. To center on
// integers, for example, you could use {Width: 1, Center:
// 0}.
// XXX What does center mean for integers? Should an unspecified center yield an autochosen one, or 0?
//Center float64
// Breaks is the set of break points to use as boundaries
// between bins. The interval of each bin is [Breaks[i],
// Breaks[i+1]). Data points before the first break are
// dropped. If provided, Width and Center are ignored.
Breaks table.Slice
// SplitGroups indicates that each group in the table should
// have separate bounds based on the data in that group alone.
// The default, false, indicates that the binning function
// should use the bounds of all of the data combined. This
// makes it easier to compare bins across groups.
SplitGroups bool
}
func (b Bin) F(g table.Grouping) table.Grouping {
breaks := reflect.ValueOf(b.Breaks)
agg := AggCount("count")
if b.W != "" {
agg = aggFn(vec.Sum, "", b.W)
}
if !breaks.IsValid() && !b.SplitGroups {
breaks = b.computeBreaks(g)
}
// Change b.X to the start of the bin.
g = table.MapTables(g, func(_ table.GroupID, t *table.Table) *table.Table {
breaks := breaks
if !breaks.IsValid() {
breaks = b.computeBreaks(t)
}
nbreaks := breaks.Len()
in := reflect.ValueOf(t.MustColumn(b.X))
nin := in.Len()
out := reflect.MakeSlice(breaks.Type(), nin, nin)
var found []int
for i := 0; i < nin; i++ {
elt := in.Index(i)
bin := sort.Search(nbreaks, func(j int) bool {
return generic.OrderR(elt, breaks.Index(j)) < 0
})
// 0 means the row doesn't fit on the front
// XXX Allow configuring the first and last bin as infinite or not.
bin = bin - 1
if bin >= 0 {
found = append(found, i)
out.Index(i).Set(breaks.Index(bin))
}
}
var nt table.Builder
for _, col := range t.Columns() {
if col == b.X {
nt.Add(col, slice.Select(out.Interface(), found))
} else if c, ok := t.Const(col); ok {
nt.AddConst(col, c)
} else {
nt.Add(col, slice.Select(t.Column(col), found))
}
}
return nt.Done()
})
// Group by the found bin
return Agg(b.X)(agg).F(g)
}
func (b Bin) computeBreaks(g table.Grouping) reflect.Value {
var cols []slice.T
for _, gid := range g.Tables() {
cols = append(cols, g.Table(gid).MustColumn(b.X))
}
data := slice.Concat(cols...)
min := slice.Min(data)
max := slice.Max(data)
rv := reflect.ValueOf(min)
switch rv.Type().Kind() {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
min, max := rv.Int(), reflect.ValueOf(max).Int()
width := int64(b.Width)
if width == 0 {
width = (max - min) / 30
if width < 1 {
width = 1
}
}
// XXX: This assumes boundaries should be aligned with
// 0. We should support explicit Center or Boundary
// requests.
min -= (min % width)
var breaks []int64
for i := min; i < max; i += width {
breaks = append(breaks, i)
}
outs := reflect.New(reflect.ValueOf(cols[0]).Type())
slice.Convert(outs.Interface(), breaks)
return outs.Elem()
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
min, max := rv.Uint(), reflect.ValueOf(max).Uint()
width := uint64(b.Width)
if width == 0 {
width = (max - min) / 30
if width < 1 {
width = 1
}
}
min -= (min % width)
var breaks []uint64
for i := min; i < max; i += width {
breaks = append(breaks, i)
}
outs := reflect.New(reflect.ValueOf(cols[0]).Type())
slice.Convert(outs.Interface(), breaks)
return outs.Elem()
case reflect.Float32, reflect.Float64:
min, max := rv.Float(), reflect.ValueOf(max).Float()
width := b.Width
if width == 0 {
width = (max - min) / 30
if width == 0 {
width = 1
}
}
min -= math.Mod(min, width)
var breaks []float64
for i := min; i < max; i += width {
breaks = append(breaks, i)
}
outs := reflect.New(reflect.ValueOf(cols[0]).Type())
slice.Convert(outs.Interface(), breaks)
return outs.Elem()
default:
panic("can't compute breaks for unknown type")
}
}
// TODO: Count for categorical data.