test/bench/timing.log - go - Git at Google

 All tests on r45 or r70

 Aug 3 2009

 First version of fasta. Translation of fasta.c, fetched from
 	http://shootout.alioth.debian.org/u32q/benchmark.php?test=fasta&lang=gpp&id=4

 fasta -n 25000000
 	gcc -O2 fasta.c	5.98u 0.00s 6.01r
 	gccgo -O2 fasta.go	8.82u 0.02s 8.85r
 	6g fasta.go	13.50u 0.02s 13.53r
 	6g -B fasta.go	12.99u 0.02s 13.02r

 Aug 4 2009
 [added timing.sh]

 # myrandom:
 #   hand-written optimization of integer division
 #   use int32->float conversion
 fasta -n 25000000
 	# probably I/O library inefficiencies
 	gcc -O2 fasta.c	5.99u 0.00s 6.00r
 	gccgo -O2 fasta.go	8.82u 0.02s 8.85r
 	gc fasta	10.70u 0.00s 10.77r
 	gc_B fasta	10.09u 0.03s 10.12r

 reverse-complement < output-of-fasta-25000000
 	# we don't know - memory cache behavior?
 	gcc -O2 reverse-complement.c	2.04u 0.94s 10.54r
 	gccgo -O2 reverse-complement.go	6.54u 0.63s 7.17r
 	gc reverse-complement	6.55u 0.70s 7.26r
 	gc_B reverse-complement	6.32u 0.70s 7.10r

 nbody 50000000
 	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
 	gcc -O2 nbody.c	21.61u 0.01s 24.80r
 	gccgo -O2 nbody.go	118.55u 0.02s 120.32r
 	gc nbody	100.84u 0.00s 100.85r
 	gc_B nbody	103.33u 0.00s 103.39r
 [
 hacked Sqrt in assembler
 	gc nbody	31.97u 0.00s 32.01r
 ]

 binary-tree 15 # too slow to use 20
 	# memory allocation and garbage collection
 	gcc -O2 binary-tree.c -lm	0.86u 0.00s 0.87r
 	gccgo -O2 binary-tree.go	1.69u 0.46s 2.15r
 	gccgo -O2 binary-tree-freelist.go	8.48u 0.00s 8.48r
 	gc binary-tree	9.60u 0.01s 9.62r
 	gc binary-tree-freelist	0.48u 0.01s 0.50r

 August 5, 2009

 fannkuch 12
 	# bounds checking is half the difference
 	# rest might be registerization
 	gcc -O2 fannkuch.c	60.09u 0.01s 60.32r
 	gccgo -O2 fannkuch.go	64.89u 0.00s 64.92r
 	gc fannkuch	124.59u 0.00s 124.67r
 	gc_B fannkuch	91.14u 0.00s 91.16r

 regex-dna 100000
 	# regexp code is slow on trivial regexp
 	gcc -O2 regex-dna.c -lpcre	0.92u 0.00s 0.99r
 	gc regex-dna	26.94u 0.18s 28.75r
 	gc_B regex-dna	26.51u 0.09s 26.75r

 spectral-norm 5500
 	gcc -O2 spectral-norm.c -lm	11.54u 0.00s 11.55r
 	gccgo -O2 spectral-norm.go	12.20u 0.00s 12.23r
 	gc spectral-norm	50.23u 0.00s 50.36r
 	gc_B spectral-norm	49.69u 0.01s 49.83r
 	gc spectral-norm-parallel	24.47u 0.03s 11.05r  # has shift >>1 not div /2
 	[using >>1 instead of /2 : gc gives 24.33u 0.00s 24.33r]

 August 6, 2009

 k-nucleotide 5000000
 	# string maps are slower than glib string maps
 	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0	k-nucleotide.c: 10.72u 0.01s 10.74r
 	gccgo -O2 k-nucleotide.go	21.64u 0.83s 22.78r
 	gc k-nucleotide	16.08u 0.06s 16.50r
 	gc_B k-nucleotide	17.32u 0.02s 17.37r

 mandelbrot 5500
 	# floating point code generator should use more registers
 	gcc -O2 mandelbrot.c	56.13u 0.02s 56.17r
 	gccgo -O2 mandelbrot.go	57.49u 0.01s 57.51r
 	gc mandelbrot	74.32u 0.00s 74.35r
 	gc_B mandelbrot	74.28u 0.01s 74.31r

 meteor 16000
 	# we don't know
 	gcc -O2 meteor-contest.c	0.10u 0.00s 0.10r
 	gccgo -O2 meteor-contest.go	0.12u 0.00s 0.14r
 	gc meteor-contest	0.24u 0.00s 0.26r
 	gc_B meteor-contest	0.23u 0.00s 0.24r

 pidigits 10000
 	# bignum is slower than gmp
 	gcc -O2 pidigits.c -lgmp	2.60u 0.00s 2.62r
 	gc pidigits	77.69u 0.14s 78.18r
 	gc_B pidigits	74.26u 0.18s 75.41r
 	gc_B pidigits	68.48u 0.20s 69.31r   # special case: no bounds checking in bignum

 August 7 2009

 # New gc does better division by powers of 2.  Significant improvements:

 spectral-norm 5500
 	# floating point code generator should use more registers; possibly inline evalA
 	gcc -O2 spectral-norm.c -lm	11.50u 0.00s 11.50r
 	gccgo -O2 spectral-norm.go	12.02u 0.00s 12.02r
 	gc spectral-norm	23.98u 0.00s 24.00r	# new time is 0.48 times old time, 52% faster
 	gc_B spectral-norm	23.71u 0.01s 23.72r	# ditto
 	gc spectral-norm-parallel	24.04u 0.00s 6.26r  # /2 put back.  note: 4x faster (on r70, idle)

 k-nucleotide 1000000
 	# string maps are slower than glib string maps
 	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0	10.82u 0.04s 10.87r
 	gccgo -O2 k-nucleotide.go	22.73u 0.89s 23.63r
 	gc k-nucleotide	15.97u 0.03s 16.04r
 	gc_B k-nucleotide	15.86u 0.06s 15.93r	# 8.5% faster, but probably due to weird cache effects in previous version

 pidigits 10000
 	# bignum is slower than gmp
 	gcc -O2 pidigits.c -lgmp	2.58u 0.00s 2.58r
 	gc pidigits	71.24u 0.04s 71.28r	# 8.5% faster
 	gc_B pidigits	71.25u 0.03s 71.29r	# 4% faster

 threadring 50000000
 	gcc -O2 threadring.c -lpthread	35.51u 160.21s 199.50r
 	gccgo -O2 threadring.go	90.33u 459.95s 448.03r
 	gc threadring	33.11u 0.00s 33.14r
 	GOMAXPROCS=4 gc threadring	114.48u 226.65s 371.59r
 	# change wait code to do <-make(chan int) instead of time.Sleep
 	gc threadring	28.41u 0.01s 29.35r
 	GOMAXPROCS=4 gc threadring	112.59u 232.83s 384.72r

 chameneos 6000000
 	gcc -O2 chameneosredux.c -lpthread	18.14u 276.52s 76.93r
 	gc chameneosredux	20.19u 0.01s 20.23r

 Aug 10 2009

 # new 6g with better fp registers, fast div and mod of integers
 # complete set of timings listed. significant changes marked ***

 fasta -n 25000000
 	# probably I/O library inefficiencies
 	gcc -O2 fasta.c	5.96u 0.00s 5.97r
 	gc fasta	10.59u 0.01s 10.61r
 	gc_B fasta	9.92u 0.02s 9.95r

 reverse-complement < output-of-fasta-25000000
 	# we don't know - memory cache behavior?
 	gcc -O2 reverse-complement.c	1.96u 1.56s 16.23r
 	gccgo -O2 reverse-complement.go	6.41u 0.62s 7.05r
 	gc reverse-complement	6.46u 0.70s 7.17r
 	gc_B reverse-complement	6.22u 0.72s 6.95r

 nbody 50000000
 	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
 	gcc -O2 nbody.c	21.26u 0.01s 21.28r
 	gccgo -O2 nbody.go	116.68u 0.07s 116.80r
 	gc nbody	86.64u 0.01s 86.68r	# -14%
 	gc_B nbody	85.72u 0.02s 85.77r	# *** -17%

 binary-tree 15 # too slow to use 20
 	# memory allocation and garbage collection
 	gcc -O2 binary-tree.c -lm	0.87u 0.00s 0.87r
 	gccgo -O2 binary-tree.go	1.61u 0.47s 2.09r
 	gccgo -O2 binary-tree-freelist.go	0.00u 0.00s 0.01r
 	gc binary-tree	9.11u 0.01s 9.13r	# *** -5%
 	gc binary-tree-freelist	0.47u 0.01s 0.48r

 fannkuch 12
 	# bounds checking is half the difference
 	# rest might be registerization
 	gcc -O2 fannkuch.c	59.92u 0.00s 59.94r
 	gccgo -O2 fannkuch.go	65.54u 0.00s 65.58r
 	gc fannkuch	123.98u 0.01s 124.04r
 	gc_B fannkuch	90.75u 0.00s 90.78r

 regex-dna 100000
 	# regexp code is slow on trivial regexp
 	gcc -O2 regex-dna.c -lpcre	0.91u 0.00s 0.92r
 	gc regex-dna	27.25u 0.02s 27.28r
 	gc_B regex-dna	29.51u 0.03s 29.55r

 spectral-norm 5500
 	# possibly inline evalA
 	gcc -O2 spectral-norm.c -lm	11.57u 0.00s 11.57r
 	gccgo -O2 spectral-norm.go	12.07u 0.01s 12.08r
 	gc spectral-norm	23.99u 0.00s 24.00r
 	gc_B spectral-norm	23.73u 0.00s 23.75r

 k-nucleotide 1000000
 	# string maps are slower than glib string maps
 	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0	10.63u 0.02s 10.69r
 	gccgo -O2 k-nucleotide.go	23.19u 0.91s 24.12r
 	gc k-nucleotide	16.73u 0.04s 16.78r	# *** +5% (but this one seems to vary by more than that)
 	gc_B k-nucleotide	16.46u 0.04s 16.51r	# *** +5%

 mandelbrot 16000
 	gcc -O2 mandelbrot.c	56.16u 0.00s 56.16r
 	gccgo -O2 mandelbrot.go	57.41u 0.01s 57.42r
 	gc mandelbrot	64.05u 0.02s 64.08r	# *** -14%
 	gc_B mandelbrot	64.10u 0.02s 64.14r	# *** -14%

 meteor 16000
 	# we don't know
 	gcc -O2 meteor-contest.c	0.10u 0.00s 0.10r
 	gccgo -O2 meteor-contest.go	0.12u 0.00s 0.12r
 	gc meteor-contest	0.18u 0.00s 0.20r	# *** -25%
 	gc_B meteor-contest	0.17u 0.00s 0.18r	# *** -24%

 pidigits 10000
 	# bignum is slower than gmp
 	gcc -O2 pidigits.c -lgmp	2.57u 0.00s 2.57r
 	gc pidigits	71.82u 0.04s 71.89r
 	gc_B pidigits	71.84u 0.08s 71.98r

 threadring 50000000
 	gcc -O2 threadring.c -lpthread	30.91u 164.33s 204.57r
 	gccgo -O2 threadring.go	87.12u 460.04s 447.61r
 	gc threadring	38.55u 0.00s 38.56r	# *** +16%

 chameneos 6000000
 	gcc -O2 chameneosredux.c -lpthread	17.93u 323.65s 88.47r
 	gc chameneosredux	21.72u 0.00s 21.73r

 August 10 2009

 # In-place versions for some bignum operations.
 pidigits 10000
 	gcc -O2 pidigits.c -lgmp	2.56u 0.00s 2.57r
 	gc pidigits	55.22u 0.04s 55.29r	# *** -23%
 	gc_B pidigits	55.49u 0.02s 55.60r	# *** -23%

 September 3 2009

 # New 6g inlines slices, has a few other tweaks.
 # Complete rerun. Significant changes marked.

 fasta -n 25000000
 	# probably I/O library inefficiencies
 	gcc -O2 fasta.c	5.96u 0.00s 5.96r
 	gc fasta	10.63u 0.02s 10.66r
 	gc_B fasta	9.92u 0.01s 9.94r

 reverse-complement < output-of-fasta-25000000
 	# we don't know - memory cache behavior?
 	gcc -O2 reverse-complement.c	1.92u 0.33s 2.93r
 	gccgo -O2 reverse-complement.go	6.76u 0.72s 7.58r	# +5%
 	gc reverse-complement	6.59u 0.70s 7.29r	# +2%
 	gc_B reverse-complement	5.57u 0.80s 6.37r	# -10%

 nbody 50000000
 	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
 	# also loop alignment appears to be critical
 	gcc -O2 nbody.c	21.28u 0.00s 21.28r
 	gccgo -O2 nbody.go	119.21u 0.00s 119.22r	# +2%
 	gc nbody	109.72u 0.00s 109.78r	# + 28% *****
 	gc_B nbody	85.90u 0.00s 85.91r

 binary-tree 15 # too slow to use 20
 	# memory allocation and garbage collection
 	gcc -O2 binary-tree.c -lm	0.86u 0.00s 0.87r
 	gccgo -O2 binary-tree.go	1.88u 0.54s 2.42r	# +17%
 	gccgo -O2 binary-tree-freelist.go	0.01u 0.01s 0.02r
 	gc binary-tree	8.94u 0.01s 8.96r	# -2%
 	gc binary-tree-freelist	0.47u 0.01s 0.48r

 fannkuch 12
 	# bounds checking is half the difference
 	# rest might be registerization
 	gcc -O2 fannkuch.c	60.12u 0.00s 60.12r
 	gccgo -O2 fannkuch.go	92.62u 0.00s 92.66r		# +41% ***
 	gc fannkuch	123.90u 0.00s 123.92r
 	gc_B fannkuch	89.71u 0.00s 89.74r	# -1%

 regex-dna 100000
 	# regexp code is slow on trivial regexp
 	gcc -O2 regex-dna.c -lpcre	0.88u 0.00s 0.88r
 	gc regex-dna	25.77u 0.01s 25.79r		# -5%
 	gc_B regex-dna	26.05u 0.02s 26.09r	# -12% ***

 spectral-norm 5500
 	# possibly inline evalA
 	gcc -O2 spectral-norm.c -lm	11.51u 0.00s 11.51r
 	gccgo -O2 spectral-norm.go	11.95u 0.00s 11.96r
 	gc spectral-norm	24.23u 0.00s 24.23r
 	gc_B spectral-norm	23.83u 0.00s 23.84r

 k-nucleotide 1000000
 	# string maps are slower than glib string maps
 	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0	10.68u 0.04s 10.72r
 	gccgo -O2 k-nucleotide.go	23.03u 0.88s 23.92r
 	gc k-nucleotide	15.79u 0.05s 15.85r	# -5% (but this one seems to vary by more than that)
 	gc_B k-nucleotide	17.88u 0.05s 17.95r # +8% (ditto)

 mandelbrot 16000
 	gcc -O2 mandelbrot.c	56.17u 0.02s 56.20r
 	gccgo -O2 mandelbrot.go	56.74u 0.02s 56.79r	 # -1%
 	gc mandelbrot	63.31u 0.01s 63.35r	# -1%
 	gc_B mandelbrot	63.29u 0.00s 63.31r	# -1%

 meteor 16000
 	# we don't know
 	gcc -O2 meteor-contest.c	0.10u 0.00s 0.10r
 	gccgo -O2 meteor-contest.go	0.11u 0.00s 0.12r
 	gc meteor-contest	0.18u 0.00s 0.19r
 	gc_B meteor-contest	0.17u 0.00s 0.18r

 pidigits 10000
 	# bignum is slower than gmp
 	gcc -O2 pidigits.c -lgmp	2.56u 0.00s 2.57r
 	gc pidigits	55.87u 0.03s 55.91r
 	gc_B pidigits	55.93u 0.03s 55.99r

 # these tests are compared using real time, since they run multiple processors
 # accuracy probably low
 threadring 50000000
 	gcc -O2 threadring.c -lpthread	26.31u 164.69s 199.92r	# -2%
 	gccgo -O2 threadring.go	87.90u 487.26s 472.81r	# +6%
 	gc threadring	28.89u 0.00s 28.90r	# -25% ***

 chameneos 6000000
 	gcc -O2 chameneosredux.c -lpthread	16.41u 296.91s 81.17r	# -8%
 	gc chameneosredux	19.97u 0.00s 19.97r	# -8%

 Sep 22, 2009

 # 6g inlines sliceslice in most cases.

 fasta -n 25000000
 	# probably I/O library inefficiencies
 	gc fasta	10.24u 0.00s 10.25r	# -4%
 	gc_B fasta	9.68u 0.01s 9.69r	# -3%

 reverse-complement < output-of-fasta-25000000
 	# we don't know - memory cache behavior?
 	gc reverse-complement	6.67u 0.69s 7.37r	# +1%
 	gc_B reverse-complement	6.00u 0.64s 6.65r	# +7%

 nbody -n 50000000
 	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
 	# also loop alignment appears to be critical
 	gc nbody	86.27u 0.00s 86.29r	# -21%
 	gc_B nbody	104.52u 0.00s 104.54r	# +22%

 fannkuch 12
 	# bounds checking is half the difference
 	# rest might be registerization
 	gc fannkuch	128.36u 0.00s 128.37r	# +4%
 	gc_B fannkuch	89.32u 0.00s 89.34r

 regex-dna 100000
 	# regexp code is slow on trivial regexp
 	gc regex-dna	24.82u 0.01s 24.86r	# -4%
 	gc_B regex-dna	24.55u 0.01s 24.57r	# -6%

 spectral-norm 5500
 	# possibly inline evalA
 	gc spectral-norm	24.05u 0.00s 24.07r	# -1%
 	gc_B spectral-norm	23.60u 0.00s 23.65r	 # -1%

 k-nucleotide 1000000
 	# string maps are slower than glib string maps
 	gc k-nucleotide	17.84u 0.04s 17.89r	# +13% but mysterious variation continues
 	gc_B k-nucleotide	15.56u 0.08s 15.65r	# -13% (ditto)

 mandelbrot 16000
 	gc mandelbrot	64.08u 0.01s 64.11r	# +1%
 	gc_B mandelbrot	64.04u 0.00s 64.05r	# +1%

 pidigits 10000
 	# bignum is slower than gmp
 	gc pidigits	58.68u 0.02s 58.72r	# +5%
 	gc_B pidigits	58.86u 0.05s 58.99r	# +5%

 # these tests are compared using real time, since they run multiple processors
 # accuracy probably low
 threadring 50000000
 	gc threadring	32.70u 0.02s 32.77r	# +13%

 chameneos 6000000
 	gc chameneosredux	26.62u 0.00s 26.63r	# +13%

 Sep 24, 2009

 # Sqrt now in assembler for 6g.
 nbody -n 50000000
 	# remember, at least for 6g, alignment of loops may be important
 	gcc -O2 nbody.c	21.24u 0.00s 21.25r
 	gccgo -O2 nbody.go	121.03u 0.00s 121.04r
 	gc nbody	30.26u 0.00s 30.27r	# -65% ***
 	gc_B nbody	30.20u 0.02s 30.22r	# -72% ***

 Nov 13 2009

 # fix bug in regexp; take performance hit.  good regexps will come in time.
 regex-dna 100000
 	gcc -O2 regex-dna.c -lpcre	0.92u 0.00s 0.94r
 	gc regex-dna	29.78u 0.03s 29.83r
 	gc_B regex-dna	32.63u 0.03s 32.74r

 Nov 24 2009

 # Roger Peppe's rewrite of the benchmark
 chameneos 6000000
 	gcc -O2 chameneosredux.c -lpthread	18.00u 303.29s 83.64r
 	gc chameneosredux	12.10u 0.00s 12.10r  # 2.22X faster
	All tests on r45 or r70

	Aug 3 2009

	First version of fasta. Translation of fasta.c, fetched from
	http://shootout.alioth.debian.org/u32q/benchmark.php?test=fasta&lang=gpp&id=4

	fasta -n 25000000
	gcc -O2 fasta.c 5.98u 0.00s 6.01r
	gccgo -O2 fasta.go 8.82u 0.02s 8.85r
	6g fasta.go 13.50u 0.02s 13.53r
	6g -B fasta.go 12.99u 0.02s 13.02r

	Aug 4 2009
	[added timing.sh]

	# myrandom:
	# hand-written optimization of integer division
	# use int32->float conversion
	fasta -n 25000000
	# probably I/O library inefficiencies
	gcc -O2 fasta.c 5.99u 0.00s 6.00r
	gccgo -O2 fasta.go 8.82u 0.02s 8.85r
	gc fasta 10.70u 0.00s 10.77r
	gc_B fasta 10.09u 0.03s 10.12r

	reverse-complement < output-of-fasta-25000000
	# we don't know - memory cache behavior?
	gcc -O2 reverse-complement.c 2.04u 0.94s 10.54r
	gccgo -O2 reverse-complement.go 6.54u 0.63s 7.17r
	gc reverse-complement 6.55u 0.70s 7.26r
	gc_B reverse-complement 6.32u 0.70s 7.10r

	nbody 50000000
	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
	gcc -O2 nbody.c 21.61u 0.01s 24.80r
	gccgo -O2 nbody.go 118.55u 0.02s 120.32r
	gc nbody 100.84u 0.00s 100.85r
	gc_B nbody 103.33u 0.00s 103.39r
	[
	hacked Sqrt in assembler
	gc nbody 31.97u 0.00s 32.01r
	]

	binary-tree 15 # too slow to use 20
	# memory allocation and garbage collection
	gcc -O2 binary-tree.c -lm 0.86u 0.00s 0.87r
	gccgo -O2 binary-tree.go 1.69u 0.46s 2.15r
	gccgo -O2 binary-tree-freelist.go 8.48u 0.00s 8.48r
	gc binary-tree 9.60u 0.01s 9.62r
	gc binary-tree-freelist 0.48u 0.01s 0.50r

	August 5, 2009

	fannkuch 12
	# bounds checking is half the difference
	# rest might be registerization
	gcc -O2 fannkuch.c 60.09u 0.01s 60.32r
	gccgo -O2 fannkuch.go 64.89u 0.00s 64.92r
	gc fannkuch 124.59u 0.00s 124.67r
	gc_B fannkuch 91.14u 0.00s 91.16r

	regex-dna 100000
	# regexp code is slow on trivial regexp
	gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.99r
	gc regex-dna 26.94u 0.18s 28.75r
	gc_B regex-dna 26.51u 0.09s 26.75r

	spectral-norm 5500
	gcc -O2 spectral-norm.c -lm 11.54u 0.00s 11.55r
	gccgo -O2 spectral-norm.go 12.20u 0.00s 12.23r
	gc spectral-norm 50.23u 0.00s 50.36r
	gc_B spectral-norm 49.69u 0.01s 49.83r
	gc spectral-norm-parallel 24.47u 0.03s 11.05r # has shift >>1 not div /2
	[using >>1 instead of /2 : gc gives 24.33u 0.00s 24.33r]

	August 6, 2009

	k-nucleotide 5000000
	# string maps are slower than glib string maps
	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 k-nucleotide.c: 10.72u 0.01s 10.74r
	gccgo -O2 k-nucleotide.go 21.64u 0.83s 22.78r
	gc k-nucleotide 16.08u 0.06s 16.50r
	gc_B k-nucleotide 17.32u 0.02s 17.37r

	mandelbrot 5500
	# floating point code generator should use more registers
	gcc -O2 mandelbrot.c 56.13u 0.02s 56.17r
	gccgo -O2 mandelbrot.go 57.49u 0.01s 57.51r
	gc mandelbrot 74.32u 0.00s 74.35r
	gc_B mandelbrot 74.28u 0.01s 74.31r

	meteor 16000
	# we don't know
	gcc -O2 meteor-contest.c 0.10u 0.00s 0.10r
	gccgo -O2 meteor-contest.go 0.12u 0.00s 0.14r
	gc meteor-contest 0.24u 0.00s 0.26r
	gc_B meteor-contest 0.23u 0.00s 0.24r

	pidigits 10000
	# bignum is slower than gmp
	gcc -O2 pidigits.c -lgmp 2.60u 0.00s 2.62r
	gc pidigits 77.69u 0.14s 78.18r
	gc_B pidigits 74.26u 0.18s 75.41r
	gc_B pidigits 68.48u 0.20s 69.31r # special case: no bounds checking in bignum

	August 7 2009

	# New gc does better division by powers of 2. Significant improvements:

	spectral-norm 5500
	# floating point code generator should use more registers; possibly inline evalA
	gcc -O2 spectral-norm.c -lm 11.50u 0.00s 11.50r
	gccgo -O2 spectral-norm.go 12.02u 0.00s 12.02r
	gc spectral-norm 23.98u 0.00s 24.00r # new time is 0.48 times old time, 52% faster
	gc_B spectral-norm 23.71u 0.01s 23.72r # ditto
	gc spectral-norm-parallel 24.04u 0.00s 6.26r # /2 put back. note: 4x faster (on r70, idle)

	k-nucleotide 1000000
	# string maps are slower than glib string maps
	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 10.82u 0.04s 10.87r
	gccgo -O2 k-nucleotide.go 22.73u 0.89s 23.63r
	gc k-nucleotide 15.97u 0.03s 16.04r
	gc_B k-nucleotide 15.86u 0.06s 15.93r # 8.5% faster, but probably due to weird cache effects in previous version

	pidigits 10000
	# bignum is slower than gmp
	gcc -O2 pidigits.c -lgmp 2.58u 0.00s 2.58r
	gc pidigits 71.24u 0.04s 71.28r # 8.5% faster
	gc_B pidigits 71.25u 0.03s 71.29r # 4% faster

	threadring 50000000
	gcc -O2 threadring.c -lpthread 35.51u 160.21s 199.50r
	gccgo -O2 threadring.go 90.33u 459.95s 448.03r
	gc threadring 33.11u 0.00s 33.14r
	GOMAXPROCS=4 gc threadring 114.48u 226.65s 371.59r
	# change wait code to do <-make(chan int) instead of time.Sleep
	gc threadring 28.41u 0.01s 29.35r
	GOMAXPROCS=4 gc threadring 112.59u 232.83s 384.72r

	chameneos 6000000
	gcc -O2 chameneosredux.c -lpthread 18.14u 276.52s 76.93r
	gc chameneosredux 20.19u 0.01s 20.23r

	Aug 10 2009

	# new 6g with better fp registers, fast div and mod of integers
	# complete set of timings listed. significant changes marked ***

	fasta -n 25000000
	# probably I/O library inefficiencies
	gcc -O2 fasta.c 5.96u 0.00s 5.97r
	gc fasta 10.59u 0.01s 10.61r
	gc_B fasta 9.92u 0.02s 9.95r

	reverse-complement < output-of-fasta-25000000
	# we don't know - memory cache behavior?
	gcc -O2 reverse-complement.c 1.96u 1.56s 16.23r
	gccgo -O2 reverse-complement.go 6.41u 0.62s 7.05r
	gc reverse-complement 6.46u 0.70s 7.17r
	gc_B reverse-complement 6.22u 0.72s 6.95r

	nbody 50000000
	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
	gcc -O2 nbody.c 21.26u 0.01s 21.28r
	gccgo -O2 nbody.go 116.68u 0.07s 116.80r
	gc nbody 86.64u 0.01s 86.68r # -14%
	gc_B nbody 85.72u 0.02s 85.77r # *** -17%

	binary-tree 15 # too slow to use 20
	# memory allocation and garbage collection
	gcc -O2 binary-tree.c -lm 0.87u 0.00s 0.87r
	gccgo -O2 binary-tree.go 1.61u 0.47s 2.09r
	gccgo -O2 binary-tree-freelist.go 0.00u 0.00s 0.01r
	gc binary-tree 9.11u 0.01s 9.13r # *** -5%
	gc binary-tree-freelist 0.47u 0.01s 0.48r

	fannkuch 12
	# bounds checking is half the difference
	# rest might be registerization
	gcc -O2 fannkuch.c 59.92u 0.00s 59.94r
	gccgo -O2 fannkuch.go 65.54u 0.00s 65.58r
	gc fannkuch 123.98u 0.01s 124.04r
	gc_B fannkuch 90.75u 0.00s 90.78r

	regex-dna 100000
	# regexp code is slow on trivial regexp
	gcc -O2 regex-dna.c -lpcre 0.91u 0.00s 0.92r
	gc regex-dna 27.25u 0.02s 27.28r
	gc_B regex-dna 29.51u 0.03s 29.55r

	spectral-norm 5500
	# possibly inline evalA
	gcc -O2 spectral-norm.c -lm 11.57u 0.00s 11.57r
	gccgo -O2 spectral-norm.go 12.07u 0.01s 12.08r
	gc spectral-norm 23.99u 0.00s 24.00r
	gc_B spectral-norm 23.73u 0.00s 23.75r

	k-nucleotide 1000000
	# string maps are slower than glib string maps
	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 10.63u 0.02s 10.69r
	gccgo -O2 k-nucleotide.go 23.19u 0.91s 24.12r
	gc k-nucleotide 16.73u 0.04s 16.78r # *** +5% (but this one seems to vary by more than that)
	gc_B k-nucleotide 16.46u 0.04s 16.51r # *** +5%

	mandelbrot 16000
	gcc -O2 mandelbrot.c 56.16u 0.00s 56.16r
	gccgo -O2 mandelbrot.go 57.41u 0.01s 57.42r
	gc mandelbrot 64.05u 0.02s 64.08r # *** -14%
	gc_B mandelbrot 64.10u 0.02s 64.14r # *** -14%

	meteor 16000
	# we don't know
	gcc -O2 meteor-contest.c 0.10u 0.00s 0.10r
	gccgo -O2 meteor-contest.go 0.12u 0.00s 0.12r
	gc meteor-contest 0.18u 0.00s 0.20r # *** -25%
	gc_B meteor-contest 0.17u 0.00s 0.18r # *** -24%

	pidigits 10000
	# bignum is slower than gmp
	gcc -O2 pidigits.c -lgmp 2.57u 0.00s 2.57r
	gc pidigits 71.82u 0.04s 71.89r
	gc_B pidigits 71.84u 0.08s 71.98r

	threadring 50000000
	gcc -O2 threadring.c -lpthread 30.91u 164.33s 204.57r
	gccgo -O2 threadring.go 87.12u 460.04s 447.61r
	gc threadring 38.55u 0.00s 38.56r # *** +16%

	chameneos 6000000
	gcc -O2 chameneosredux.c -lpthread 17.93u 323.65s 88.47r
	gc chameneosredux 21.72u 0.00s 21.73r

	August 10 2009

	# In-place versions for some bignum operations.
	pidigits 10000
	gcc -O2 pidigits.c -lgmp 2.56u 0.00s 2.57r
	gc pidigits 55.22u 0.04s 55.29r # *** -23%
	gc_B pidigits 55.49u 0.02s 55.60r # *** -23%

	September 3 2009

	# New 6g inlines slices, has a few other tweaks.
	# Complete rerun. Significant changes marked.

	fasta -n 25000000
	# probably I/O library inefficiencies
	gcc -O2 fasta.c 5.96u 0.00s 5.96r
	gc fasta 10.63u 0.02s 10.66r
	gc_B fasta 9.92u 0.01s 9.94r

	reverse-complement < output-of-fasta-25000000
	# we don't know - memory cache behavior?
	gcc -O2 reverse-complement.c 1.92u 0.33s 2.93r
	gccgo -O2 reverse-complement.go 6.76u 0.72s 7.58r # +5%
	gc reverse-complement 6.59u 0.70s 7.29r # +2%
	gc_B reverse-complement 5.57u 0.80s 6.37r # -10%

	nbody 50000000
	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
	# also loop alignment appears to be critical
	gcc -O2 nbody.c 21.28u 0.00s 21.28r
	gccgo -O2 nbody.go 119.21u 0.00s 119.22r # +2%
	gc nbody 109.72u 0.00s 109.78r # + 28% *****
	gc_B nbody 85.90u 0.00s 85.91r

	binary-tree 15 # too slow to use 20
	# memory allocation and garbage collection
	gcc -O2 binary-tree.c -lm 0.86u 0.00s 0.87r
	gccgo -O2 binary-tree.go 1.88u 0.54s 2.42r # +17%
	gccgo -O2 binary-tree-freelist.go 0.01u 0.01s 0.02r
	gc binary-tree 8.94u 0.01s 8.96r # -2%
	gc binary-tree-freelist 0.47u 0.01s 0.48r

	fannkuch 12
	# bounds checking is half the difference
	# rest might be registerization
	gcc -O2 fannkuch.c 60.12u 0.00s 60.12r
	gccgo -O2 fannkuch.go 92.62u 0.00s 92.66r # +41% ***
	gc fannkuch 123.90u 0.00s 123.92r
	gc_B fannkuch 89.71u 0.00s 89.74r # -1%

	regex-dna 100000
	# regexp code is slow on trivial regexp
	gcc -O2 regex-dna.c -lpcre 0.88u 0.00s 0.88r
	gc regex-dna 25.77u 0.01s 25.79r # -5%
	gc_B regex-dna 26.05u 0.02s 26.09r # -12% ***

	spectral-norm 5500
	# possibly inline evalA
	gcc -O2 spectral-norm.c -lm 11.51u 0.00s 11.51r
	gccgo -O2 spectral-norm.go 11.95u 0.00s 11.96r
	gc spectral-norm 24.23u 0.00s 24.23r
	gc_B spectral-norm 23.83u 0.00s 23.84r

	k-nucleotide 1000000
	# string maps are slower than glib string maps
	gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 10.68u 0.04s 10.72r
	gccgo -O2 k-nucleotide.go 23.03u 0.88s 23.92r
	gc k-nucleotide 15.79u 0.05s 15.85r # -5% (but this one seems to vary by more than that)
	gc_B k-nucleotide 17.88u 0.05s 17.95r # +8% (ditto)

	mandelbrot 16000
	gcc -O2 mandelbrot.c 56.17u 0.02s 56.20r
	gccgo -O2 mandelbrot.go 56.74u 0.02s 56.79r # -1%
	gc mandelbrot 63.31u 0.01s 63.35r # -1%
	gc_B mandelbrot 63.29u 0.00s 63.31r # -1%

	meteor 16000
	# we don't know
	gcc -O2 meteor-contest.c 0.10u 0.00s 0.10r
	gccgo -O2 meteor-contest.go 0.11u 0.00s 0.12r
	gc meteor-contest 0.18u 0.00s 0.19r
	gc_B meteor-contest 0.17u 0.00s 0.18r

	pidigits 10000
	# bignum is slower than gmp
	gcc -O2 pidigits.c -lgmp 2.56u 0.00s 2.57r
	gc pidigits 55.87u 0.03s 55.91r
	gc_B pidigits 55.93u 0.03s 55.99r

	# these tests are compared using real time, since they run multiple processors
	# accuracy probably low
	threadring 50000000
	gcc -O2 threadring.c -lpthread 26.31u 164.69s 199.92r # -2%
	gccgo -O2 threadring.go 87.90u 487.26s 472.81r # +6%
	gc threadring 28.89u 0.00s 28.90r # -25% ***

	chameneos 6000000
	gcc -O2 chameneosredux.c -lpthread 16.41u 296.91s 81.17r # -8%
	gc chameneosredux 19.97u 0.00s 19.97r # -8%

	Sep 22, 2009

	# 6g inlines sliceslice in most cases.

	fasta -n 25000000
	# probably I/O library inefficiencies
	gc fasta 10.24u 0.00s 10.25r # -4%
	gc_B fasta 9.68u 0.01s 9.69r # -3%

	reverse-complement < output-of-fasta-25000000
	# we don't know - memory cache behavior?
	gc reverse-complement 6.67u 0.69s 7.37r # +1%
	gc_B reverse-complement 6.00u 0.64s 6.65r # +7%

	nbody -n 50000000
	# math.Sqrt needs to be in assembly; inlining is probably the other 50%
	# also loop alignment appears to be critical
	gc nbody 86.27u 0.00s 86.29r # -21%
	gc_B nbody 104.52u 0.00s 104.54r # +22%

	fannkuch 12
	# bounds checking is half the difference
	# rest might be registerization
	gc fannkuch 128.36u 0.00s 128.37r # +4%
	gc_B fannkuch 89.32u 0.00s 89.34r

	regex-dna 100000
	# regexp code is slow on trivial regexp
	gc regex-dna 24.82u 0.01s 24.86r # -4%
	gc_B regex-dna 24.55u 0.01s 24.57r # -6%

	spectral-norm 5500
	# possibly inline evalA
	gc spectral-norm 24.05u 0.00s 24.07r # -1%
	gc_B spectral-norm 23.60u 0.00s 23.65r # -1%

	k-nucleotide 1000000
	# string maps are slower than glib string maps
	gc k-nucleotide 17.84u 0.04s 17.89r # +13% but mysterious variation continues
	gc_B k-nucleotide 15.56u 0.08s 15.65r # -13% (ditto)

	mandelbrot 16000
	gc mandelbrot 64.08u 0.01s 64.11r # +1%
	gc_B mandelbrot 64.04u 0.00s 64.05r # +1%

	pidigits 10000
	# bignum is slower than gmp
	gc pidigits 58.68u 0.02s 58.72r # +5%
	gc_B pidigits 58.86u 0.05s 58.99r # +5%

	# these tests are compared using real time, since they run multiple processors
	# accuracy probably low
	threadring 50000000
	gc threadring 32.70u 0.02s 32.77r # +13%

	chameneos 6000000
	gc chameneosredux 26.62u 0.00s 26.63r # +13%

	Sep 24, 2009

	# Sqrt now in assembler for 6g.
	nbody -n 50000000
	# remember, at least for 6g, alignment of loops may be important
	gcc -O2 nbody.c 21.24u 0.00s 21.25r
	gccgo -O2 nbody.go 121.03u 0.00s 121.04r
	gc nbody 30.26u 0.00s 30.27r # -65% ***
	gc_B nbody 30.20u 0.02s 30.22r # -72% ***

	Nov 13 2009

	# fix bug in regexp; take performance hit. good regexps will come in time.
	regex-dna 100000
	gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.94r
	gc regex-dna 29.78u 0.03s 29.83r
	gc_B regex-dna 32.63u 0.03s 32.74r

	Nov 24 2009

	# Roger Peppe's rewrite of the benchmark
	chameneos 6000000
	gcc -O2 chameneosredux.c -lpthread 18.00u 303.29s 83.64r
	gc chameneosredux 12.10u 0.00s 12.10r # 2.22X faster