draw: use a sync.Pool for kernel scaling's temporary buffers.

benchmark                      old ns/op     new ns/op     delta
BenchmarkScaleBLLargeDown      257715146     260286012     +1.00%
BenchmarkScaleCRLargeDown      426797448     430078734     +0.77%
BenchmarkScaleBLDown           4449939       4222542       -5.11%
BenchmarkScaleCRDown           8160446       8010056       -1.84%
BenchmarkScaleBLUp             22290312      21044122      -5.59%
BenchmarkScaleCRUp             33010722      32021468      -3.00%
BenchmarkScaleCRSrcGray        13307961      13020192      -2.16%
BenchmarkScaleCRSrcNRGBA       40567431      40801939      +0.58%
BenchmarkScaleCRSrcRGBA        39892971      40240558      +0.87%
BenchmarkScaleCRSrcYCbCr       59020222      59686699      +1.13%

benchmark                      old allocs     new allocs     delta
BenchmarkScaleBLLargeDown      1              1              +0.00%
BenchmarkScaleCRLargeDown      1              2              +100.00%
BenchmarkScaleBLDown           1              0              -100.00%
BenchmarkScaleCRDown           1              0              -100.00%
BenchmarkScaleBLUp             1              0              -100.00%
BenchmarkScaleCRUp             1              0              -100.00%
BenchmarkScaleCRSrcGray        1              0              -100.00%
BenchmarkScaleCRSrcNRGBA       1              0              -100.00%
BenchmarkScaleCRSrcRGBA        1              0              -100.00%
BenchmarkScaleCRSrcYCbCr       1              0              -100.00%

benchmark                      old bytes     new bytes     delta
BenchmarkScaleBLLargeDown      14745600      2949200       -80.00%
BenchmarkScaleCRLargeDown      14745600      4915333       -66.67%
BenchmarkScaleBLDown           1523712       5079          -99.67%
BenchmarkScaleCRDown           1523712       7619          -99.50%
BenchmarkScaleBLUp             10117120      101175        -99.00%
BenchmarkScaleCRUp             10117120      202350        -98.00%
BenchmarkScaleCRSrcGray        4915200       49156         -99.00%
BenchmarkScaleCRSrcNRGBA       4915200       163853        -96.67%
BenchmarkScaleCRSrcRGBA        4915200       163853        -96.67%
BenchmarkScaleCRSrcYCbCr       4915200       245780        -95.00%

The increase in BenchmarkScale??LargeDown number of allocs I think is an
accounting error due to the low number of iterations: a low denominator.
I suspect that there are one or two extra allocs up front for using the
sync.Pool, but one fewer alloc per iteration. The number of iterations
is only 5 for BL and 3 for CR, for the default timeout. If I increase
the -test.benchtime value to 5s, then the reported average (allocs/op)
drop from 2 to 0, so the delta should actually be -100% instead of +0 or
+100%.

Change-Id: I21d9bb0086bdb25517b6a430e8a21bdf3db026f6
Reviewed-on: https://go-review.googlesource.com/8150
Reviewed-by: Rob Pike <r@golang.org>
diff --git a/draw/gen.go b/draw/gen.go
index 1acae2e..6efdafa 100644
--- a/draw/gen.go
+++ b/draw/gen.go
@@ -936,8 +936,14 @@
 			// Create a temporary buffer:
 			// scaleX distributes the source image's columns over the temporary image.
 			// scaleY distributes the temporary image's rows over the destination image.
-			// TODO: is it worth having a sync.Pool for this temporary buffer?
-			tmp := make([][4]float64, z.dw*z.sh)
+			var tmp [][4]float64
+			if z.pool.New != nil {
+				tmpp := z.pool.Get().(*[][4]float64)
+				defer z.pool.Put(tmpp)
+				tmp = *tmpp
+			} else {
+				tmp = z.makeTmpBuf()
+			}
 
 			// sr is the source pixels. If it extends beyond the src bounds,
 			// we cannot use the type-specific fast paths, as they access
diff --git a/draw/impl.go b/draw/impl.go
index 6278c92..a1de1cd 100644
--- a/draw/impl.go
+++ b/draw/impl.go
@@ -3036,8 +3036,14 @@
 	// Create a temporary buffer:
 	// scaleX distributes the source image's columns over the temporary image.
 	// scaleY distributes the temporary image's rows over the destination image.
-	// TODO: is it worth having a sync.Pool for this temporary buffer?
-	tmp := make([][4]float64, z.dw*z.sh)
+	var tmp [][4]float64
+	if z.pool.New != nil {
+		tmpp := z.pool.Get().(*[][4]float64)
+		defer z.pool.Put(tmpp)
+		tmp = *tmpp
+	} else {
+		tmp = z.makeTmpBuf()
+	}
 
 	// sr is the source pixels. If it extends beyond the src bounds,
 	// we cannot use the type-specific fast paths, as they access
diff --git a/draw/scale.go b/draw/scale.go
index 1e95971..2e481b1 100644
--- a/draw/scale.go
+++ b/draw/scale.go
@@ -10,6 +10,7 @@
 	"image"
 	"image/color"
 	"math"
+	"sync"
 
 	"golang.org/x/image/math/f64"
 )
@@ -88,13 +89,17 @@
 
 // Scale implements the Scaler interface.
 func (q *Kernel) Scale(dst Image, dr image.Rectangle, src image.Image, sr image.Rectangle, opts *Options) {
-	q.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy()).Scale(dst, dr, src, sr, opts)
+	q.newScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy(), false).Scale(dst, dr, src, sr, opts)
 }
 
 // NewScaler returns a Scaler that is optimized for scaling multiple times with
 // the same fixed destination and source width and height.
 func (q *Kernel) NewScaler(dw, dh, sw, sh int) Scaler {
-	return &kernelScaler{
+	return q.newScaler(dw, dh, sw, sh, true)
+}
+
+func (q *Kernel) newScaler(dw, dh, sw, sh int, usePool bool) Scaler {
+	z := &kernelScaler{
 		kernel:     q,
 		dw:         int32(dw),
 		dh:         int32(dh),
@@ -103,6 +108,13 @@
 		horizontal: newDistrib(q, int32(dw), int32(sw)),
 		vertical:   newDistrib(q, int32(dh), int32(sh)),
 	}
+	if usePool {
+		z.pool.New = func() interface{} {
+			tmp := z.makeTmpBuf()
+			return &tmp
+		}
+	}
+	return z
 }
 
 var (
@@ -152,6 +164,11 @@
 	kernel               *Kernel
 	dw, dh, sw, sh       int32
 	horizontal, vertical distrib
+	pool                 sync.Pool
+}
+
+func (z *kernelScaler) makeTmpBuf() [][4]float64 {
+	return make([][4]float64, z.dw*z.sh)
 }
 
 // source is a range of contribs, their inverse total weight, and that ITW
diff --git a/draw/scale_test.go b/draw/scale_test.go
index bad9c5f..9cce5b1 100644
--- a/draw/scale_test.go
+++ b/draw/scale_test.go
@@ -393,6 +393,7 @@
 		scaler = n.NewScaler(dr.Dx(), dr.Dy(), sr.Dx(), sr.Dy())
 	}
 
+	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		scaler.Scale(dst, dr, src, sr, nil)
@@ -408,6 +409,7 @@
 	sr := src.Bounds()
 	m := transformMatrix(40, 10)
 
+	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		q.Transform(dst, m, src, sr, nil)