vendor/github.com/aclements/go-gg/ggstat/bin.go - perf - Git at Google

 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package ggstat

 import (
 	"math"
 	"reflect"
 	"sort"

 	"github.com/aclements/go-gg/generic"
 	"github.com/aclements/go-gg/generic/slice"
 	"github.com/aclements/go-gg/table"
 	"github.com/aclements/go-moremath/vec"
 )

 // XXX If this is just based on the number of bins, it can come up
 // with really ugly boundary numbers. If the bin width is specified,
 // then you could also specify the left edge and bins will be placed
 // at [align+width*N, align+width*(N+1)]. ggplot2 also lets you
 // specify the center alignment.
 //
 // XXX In Matlab and NumPy, bins are open on the right *except* for
 // the last bin, which is closed on both.
 //
 // XXX Number of bins/bin width/specify boundaries, same bins across
 // all groups/separate for each group/based on shared scales (don't
 // have that information here), relative or absolute histogram (Matlab
 // has lots more).
 //
 // XXX Scale transform.
 //
 // The result of Bin has two columns in addition to constant columns from the input:
 //
 // - Column X is the left edge of the bin.
 //
 // - Column W is the sum of the rows' weights, or column "count" is
 //   the number of rows in the bin.
 type Bin struct {
 	// X is the name of the column to use for samples.
 	X string

 	// W is the optional name of the column to use for sample
 	// weights. It may be "" to weight each sample as 1.
 	W string

 	// Width controls how wide each bin should be. If not provided
 	// or 0, a width will be chosen to produce 30 bins. If X is an
 	// integer column, this width will be treated as an integer as
 	// well.
 	Width float64

 	// Center controls the center point of each bin. To center on
 	// integers, for example, you could use {Width: 1, Center:
 	// 0}.
 	// XXX What does center mean for integers? Should an unspecified center yield an autochosen one, or 0?
 	//Center float64

 	// Breaks is the set of break points to use as boundaries
 	// between bins. The interval of each bin is [Breaks[i],
 	// Breaks[i+1]). Data points before the first break are
 	// dropped. If provided, Width and Center are ignored.
 	Breaks table.Slice

 	// SplitGroups indicates that each group in the table should
 	// have separate bounds based on the data in that group alone.
 	// The default, false, indicates that the binning function
 	// should use the bounds of all of the data combined. This
 	// makes it easier to compare bins across groups.
 	SplitGroups bool
 }

 func (b Bin) F(g table.Grouping) table.Grouping {
 	breaks := reflect.ValueOf(b.Breaks)
 	agg := AggCount("count")
 	if b.W != "" {
 		agg = aggFn(vec.Sum, "", b.W)
 	}
 	if !breaks.IsValid() && !b.SplitGroups {
 		breaks = b.computeBreaks(g)
 	}
 	// Change b.X to the start of the bin.
 	g = table.MapTables(g, func(_ table.GroupID, t *table.Table) *table.Table {
 		breaks := breaks
 		if !breaks.IsValid() {
 			breaks = b.computeBreaks(t)
 		}
 		nbreaks := breaks.Len()

 		in := reflect.ValueOf(t.MustColumn(b.X))
 		nin := in.Len()

 		out := reflect.MakeSlice(breaks.Type(), nin, nin)
 		var found []int
 		for i := 0; i < nin; i++ {
 			elt := in.Index(i)
 			bin := sort.Search(nbreaks, func(j int) bool {
 				return generic.OrderR(elt, breaks.Index(j)) < 0
 			})
 			// 0 means the row doesn't fit on the front
 			// XXX Allow configuring the first and last bin as infinite or not.
 			bin = bin - 1
 			if bin >= 0 {
 				found = append(found, i)
 				out.Index(i).Set(breaks.Index(bin))
 			}
 		}
 		var nt table.Builder
 		for _, col := range t.Columns() {
 			if col == b.X {
 				nt.Add(col, slice.Select(out.Interface(), found))
 			} else if c, ok := t.Const(col); ok {
 				nt.AddConst(col, c)
 			} else {
 				nt.Add(col, slice.Select(t.Column(col), found))
 			}
 		}
 		return nt.Done()
 	})
 	// Group by the found bin
 	return Agg(b.X)(agg).F(g)
 }

 func (b Bin) computeBreaks(g table.Grouping) reflect.Value {
 	var cols []slice.T
 	for _, gid := range g.Tables() {
 		cols = append(cols, g.Table(gid).MustColumn(b.X))
 	}
 	data := slice.Concat(cols...)

 	min := slice.Min(data)
 	max := slice.Max(data)

 	rv := reflect.ValueOf(min)
 	switch rv.Type().Kind() {
 	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
 		min, max := rv.Int(), reflect.ValueOf(max).Int()
 		width := int64(b.Width)
 		if width == 0 {
 			width = (max - min) / 30
 			if width < 1 {
 				width = 1
 			}
 		}
 		// XXX: This assumes boundaries should be aligned with
 		// 0. We should support explicit Center or Boundary
 		// requests.
 		min -= (min % width)
 		var breaks []int64
 		for i := min; i < max; i += width {
 			breaks = append(breaks, i)
 		}
 		outs := reflect.New(reflect.ValueOf(cols[0]).Type())
 		slice.Convert(outs.Interface(), breaks)
 		return outs.Elem()
 	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
 		min, max := rv.Uint(), reflect.ValueOf(max).Uint()
 		width := uint64(b.Width)
 		if width == 0 {
 			width = (max - min) / 30
 			if width < 1 {
 				width = 1
 			}
 		}
 		min -= (min % width)
 		var breaks []uint64
 		for i := min; i < max; i += width {
 			breaks = append(breaks, i)
 		}
 		outs := reflect.New(reflect.ValueOf(cols[0]).Type())
 		slice.Convert(outs.Interface(), breaks)
 		return outs.Elem()
 	case reflect.Float32, reflect.Float64:
 		min, max := rv.Float(), reflect.ValueOf(max).Float()
 		width := b.Width
 		if width == 0 {
 			width = (max - min) / 30
 			if width == 0 {
 				width = 1
 			}
 		}
 		min -= math.Mod(min, width)
 		var breaks []float64
 		for i := min; i < max; i += width {
 			breaks = append(breaks, i)
 		}
 		outs := reflect.New(reflect.ValueOf(cols[0]).Type())
 		slice.Convert(outs.Interface(), breaks)
 		return outs.Elem()
 	default:
 		panic("can't compute breaks for unknown type")
 	}
 }

 // TODO: Count for categorical data.
	// Copyright 2016 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package ggstat

	import (
	"math"
	"reflect"
	"sort"

	"github.com/aclements/go-gg/generic"
	"github.com/aclements/go-gg/generic/slice"
	"github.com/aclements/go-gg/table"
	"github.com/aclements/go-moremath/vec"
	)

	// XXX If this is just based on the number of bins, it can come up
	// with really ugly boundary numbers. If the bin width is specified,
	// then you could also specify the left edge and bins will be placed
	// at [align+widthN, align+width(N+1)]. ggplot2 also lets you
	// specify the center alignment.
	//
	// XXX In Matlab and NumPy, bins are open on the right except for
	// the last bin, which is closed on both.
	//
	// XXX Number of bins/bin width/specify boundaries, same bins across
	// all groups/separate for each group/based on shared scales (don't
	// have that information here), relative or absolute histogram (Matlab
	// has lots more).
	//
	// XXX Scale transform.
	//
	// The result of Bin has two columns in addition to constant columns from the input:
	//
	// - Column X is the left edge of the bin.
	//
	// - Column W is the sum of the rows' weights, or column "count" is
	// the number of rows in the bin.
	type Bin struct {
	// X is the name of the column to use for samples.
	X string

	// W is the optional name of the column to use for sample
	// weights. It may be "" to weight each sample as 1.
	W string

	// Width controls how wide each bin should be. If not provided
	// or 0, a width will be chosen to produce 30 bins. If X is an
	// integer column, this width will be treated as an integer as
	// well.
	Width float64

	// Center controls the center point of each bin. To center on
	// integers, for example, you could use {Width: 1, Center:
	// 0}.
	// XXX What does center mean for integers? Should an unspecified center yield an autochosen one, or 0?
	//Center float64

	// Breaks is the set of break points to use as boundaries
	// between bins. The interval of each bin is [Breaks[i],
	// Breaks[i+1]). Data points before the first break are
	// dropped. If provided, Width and Center are ignored.
	Breaks table.Slice

	// SplitGroups indicates that each group in the table should
	// have separate bounds based on the data in that group alone.
	// The default, false, indicates that the binning function
	// should use the bounds of all of the data combined. This
	// makes it easier to compare bins across groups.
	SplitGroups bool
	}

	func (b Bin) F(g table.Grouping) table.Grouping {
	breaks := reflect.ValueOf(b.Breaks)
	agg := AggCount("count")
	if b.W != "" {
	agg = aggFn(vec.Sum, "", b.W)
	}
	if !breaks.IsValid() && !b.SplitGroups {
	breaks = b.computeBreaks(g)
	}
	// Change b.X to the start of the bin.
	g = table.MapTables(g, func(_ table.GroupID, t table.Table) table.Table {
	breaks := breaks
	if !breaks.IsValid() {
	breaks = b.computeBreaks(t)
	}
	nbreaks := breaks.Len()

	in := reflect.ValueOf(t.MustColumn(b.X))
	nin := in.Len()

	out := reflect.MakeSlice(breaks.Type(), nin, nin)
	var found []int
	for i := 0; i < nin; i++ {
	elt := in.Index(i)
	bin := sort.Search(nbreaks, func(j int) bool {
	return generic.OrderR(elt, breaks.Index(j)) < 0
	})
	// 0 means the row doesn't fit on the front
	// XXX Allow configuring the first and last bin as infinite or not.
	bin = bin - 1
	if bin >= 0 {
	found = append(found, i)
	out.Index(i).Set(breaks.Index(bin))
	}
	}
	var nt table.Builder
	for _, col := range t.Columns() {
	if col == b.X {
	nt.Add(col, slice.Select(out.Interface(), found))
	} else if c, ok := t.Const(col); ok {
	nt.AddConst(col, c)
	} else {
	nt.Add(col, slice.Select(t.Column(col), found))
	}
	}
	return nt.Done()
	})
	// Group by the found bin
	return Agg(b.X)(agg).F(g)
	}

	func (b Bin) computeBreaks(g table.Grouping) reflect.Value {
	var cols []slice.T
	for _, gid := range g.Tables() {
	cols = append(cols, g.Table(gid).MustColumn(b.X))
	}
	data := slice.Concat(cols...)

	min := slice.Min(data)
	max := slice.Max(data)

	rv := reflect.ValueOf(min)
	switch rv.Type().Kind() {
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
	min, max := rv.Int(), reflect.ValueOf(max).Int()
	width := int64(b.Width)
	if width == 0 {
	width = (max - min) / 30
	if width < 1 {
	width = 1
	}
	}
	// XXX: This assumes boundaries should be aligned with
	// 0. We should support explicit Center or Boundary
	// requests.
	min -= (min % width)
	var breaks []int64
	for i := min; i < max; i += width {
	breaks = append(breaks, i)
	}
	outs := reflect.New(reflect.ValueOf(cols[0]).Type())
	slice.Convert(outs.Interface(), breaks)
	return outs.Elem()
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
	min, max := rv.Uint(), reflect.ValueOf(max).Uint()
	width := uint64(b.Width)
	if width == 0 {
	width = (max - min) / 30
	if width < 1 {
	width = 1
	}
	}
	min -= (min % width)
	var breaks []uint64
	for i := min; i < max; i += width {
	breaks = append(breaks, i)
	}
	outs := reflect.New(reflect.ValueOf(cols[0]).Type())
	slice.Convert(outs.Interface(), breaks)
	return outs.Elem()
	case reflect.Float32, reflect.Float64:
	min, max := rv.Float(), reflect.ValueOf(max).Float()
	width := b.Width
	if width == 0 {
	width = (max - min) / 30
	if width == 0 {
	width = 1
	}
	}
	min -= math.Mod(min, width)
	var breaks []float64
	for i := min; i < max; i += width {
	breaks = append(breaks, i)
	}
	outs := reflect.New(reflect.ValueOf(cols[0]).Type())
	slice.Convert(outs.Interface(), breaks)
	return outs.Elem()
	default:
	panic("can't compute breaks for unknown type")
	}
	}

	// TODO: Count for categorical data.