1
0
mirror of https://github.com/golang/go synced 2024-11-12 08:40:21 -07:00
go/test/bench/timing.log
Robert Griesemer 0dbd8971a2 - use in-place bignum operations where available
- runs approx. 30% faster

R=r
DELTA=24  (10 added, 2 deleted, 12 changed)
OCL=32984
CL=33005
2009-08-10 17:44:46 -07:00

241 lines
7.9 KiB
Plaintext

All tests on r45 or r70
Aug 3 2009
First version of fasta. Translation of fasta.c, fetched from
http://shootout.alioth.debian.org/u32q/benchmark.php?test=fasta&lang=gpp&id=4
fasta -n 25000000
gcc -O2 fasta.c 5.98u 0.00s 6.01r
gccgo -O2 fasta.go 8.82u 0.02s 8.85r
6g fasta.go 13.50u 0.02s 13.53r
6g -B fasta.go 12.99u 0.02s 13.02r
Aug 4 2009
[added timing.sh]
# myrandom:
# hand-written optimization of integer division
# use int32->float conversion
fasta -n 25000000
# probably I/O library inefficiencies
gcc -O2 fasta.c 5.99u 0.00s 6.00r
gccgo -O2 fasta.go 8.82u 0.02s 8.85r
gc fasta 10.70u 0.00s 10.77r
gc_B fasta 10.09u 0.03s 10.12r
reverse-complement < output-of-fasta-25000000
# we don't know - memory cache behavior?
gcc -O2 reverse-complement.c 2.04u 0.94s 10.54r
gccgo -O2 reverse-complement.go 6.54u 0.63s 7.17r
gc reverse-complement 6.55u 0.70s 7.26r
gc_B reverse-complement 6.32u 0.70s 7.10r
nbody 50000000
# math.Sqrt needs to be in assembly; inlining is probably the other 50%
gcc -O2 nbody.c 21.61u 0.01s 24.80r
gccgo -O2 nbody.go 118.55u 0.02s 120.32r
gc nbody 100.84u 0.00s 100.85r
gc_B nbody 103.33u 0.00s 103.39r
[
hacked Sqrt in assembler
gc nbody 31.97u 0.00s 32.01r
]
binary-tree 15 # too slow to use 20
# memory allocation and garbage collection
gcc -O2 binary-tree.c -lm 0.86u 0.00s 0.87r
gccgo -O2 binary-tree.go 1.69u 0.46s 2.15r
gccgo -O2 binary-tree-freelist.go 8.48u 0.00s 8.48r
gc binary-tree 9.60u 0.01s 9.62r
gc binary-tree-freelist 0.48u 0.01s 0.50r
August 5, 2009
fannkuch 12
# bounds checking is half the difference
# rest might be registerization
gcc -O2 fannkuch.c 60.09u 0.01s 60.32r
gccgo -O2 fannkuch.go 64.89u 0.00s 64.92r
gc fannkuch 124.59u 0.00s 124.67r
gc_B fannkuch 91.14u 0.00s 91.16r
regex-dna 100000
# regexp code is slow on trivial regexp
gcc -O2 regex-dna.c -lpcre 0.92u 0.00s 0.99r
gc regex-dna 26.94u 0.18s 28.75r
gc_B regex-dna 26.51u 0.09s 26.75r
spectral-norm 5500
gcc -O2 spectral-norm.c -lm 11.54u 0.00s 11.55r
gccgo -O2 spectral-norm.go 12.20u 0.00s 12.23r
gc spectral-norm 50.23u 0.00s 50.36r
gc_B spectral-norm 49.69u 0.01s 49.83r
gc spectral-norm-parallel 24.47u 0.03s 11.05r # has shift >>1 not div /2
[using >>1 instead of /2 : gc gives 24.33u 0.00s 24.33r]
August 6, 2009
k-nucleotide 5000000
# string maps are slower than glib string maps
gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 10.72u 0.01s 10.74r
gccgo -O2 k-nucleotide.go 21.64u 0.83s 22.78r
gc k-nucleotide 16.08u 0.06s 16.50r
gc_B k-nucleotide 17.32u 0.02s 17.37r
mandelbrot 5500
# floating point code generator should use more registers
gcc -O2 mandelbrot.c 56.13u 0.02s 56.17r
gccgo -O2 mandelbrot.go 57.49u 0.01s 57.51r
gc mandelbrot 74.32u 0.00s 74.35r
gc_B mandelbrot 74.28u 0.01s 74.31r
meteor 16000
# we don't know
gcc -O2 meteor-contest.c 0.10u 0.00s 0.10r
gccgo -O2 meteor-contest.go 0.12u 0.00s 0.14r
gc meteor-contest 0.24u 0.00s 0.26r
gc_B meteor-contest 0.23u 0.00s 0.24r
pidigits 10000
# bignum is slower than gmp
gcc -O2 pidigits.c -lgmp 2.60u 0.00s 2.62r
gc pidigits 77.69u 0.14s 78.18r
gc_B pidigits 74.26u 0.18s 75.41r
gc_B pidigits 68.48u 0.20s 69.31r # special case: no bounds checking in bignum
August 7 2009
# New gc does better division by powers of 2. Significant improvements:
spectral-norm 5500
# floating point code generator should use more registers; possibly inline evalA
gcc -O2 spectral-norm.c -lm 11.50u 0.00s 11.50r
gccgo -O2 spectral-norm.go 12.02u 0.00s 12.02r
gc spectral-norm 23.98u 0.00s 24.00r # new time is 0.48 times old time, 52% faster
gc_B spectral-norm 23.71u 0.01s 23.72r # ditto
gc spectral-norm-parallel 24.04u 0.00s 6.26r # /2 put back. note: 4x faster (on r70, idle)
k-nucleotide 1000000
# string maps are slower than glib string maps
gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 10.82u 0.04s 10.87r
gccgo -O2 k-nucleotide.go 22.73u 0.89s 23.63r
gc k-nucleotide 15.97u 0.03s 16.04r
gc_B k-nucleotide 15.86u 0.06s 15.93r # 8.5% faster, but probably due to weird cache effects in previous version
pidigits 10000
# bignum is slower than gmp
gcc -O2 pidigits.c -lgmp 2.58u 0.00s 2.58r
gc pidigits 71.24u 0.04s 71.28r # 8.5% faster
gc_B pidigits 71.25u 0.03s 71.29r # 4% faster
threadring 50000000
gcc -O2 threadring.c -lpthread 35.51u 160.21s 199.50r
gccgo -O2 threadring.go 90.33u 459.95s 448.03r
gc threadring 33.11u 0.00s 33.14r
GOMAXPROCS=4 gc threadring 114.48u 226.65s 371.59r
# change wait code to do <-make(chan int) instead of time.Sleep
gc threadring 28.41u 0.01s 29.35r
GOMAXPROCS=4 gc threadring 112.59u 232.83s 384.72r
chameneos 6000000
gcc -O2 chameneosredux.c -lpthread 18.14u 276.52s 76.93r
gc chameneosredux 20.19u 0.01s 20.23r
Aug 10 2009
# new 6g with better fp registers, fast div and mod of integers
# complete set of timings listed. significant changes marked ***
fasta -n 25000000
# probably I/O library inefficiencies
gcc -O2 fasta.c 5.96u 0.00s 5.97r
gc fasta 10.59u 0.01s 10.61r
gc_B fasta 9.92u 0.02s 9.95r
reverse-complement < output-of-fasta-25000000
# we don't know - memory cache behavior?
gcc -O2 reverse-complement.c 1.96u 1.56s 16.23r
gccgo -O2 reverse-complement.go 6.41u 0.62s 7.05r
gc reverse-complement 6.46u 0.70s 7.17r
gc_B reverse-complement 6.22u 0.72s 6.95r
nbody 50000000
# math.Sqrt needs to be in assembly; inlining is probably the other 50%
gcc -O2 nbody.c 21.26u 0.01s 21.28r
gccgo -O2 nbody.go 116.68u 0.07s 116.80r
gc nbody 86.64u 0.01s 86.68r # -14%
gc_B nbody 85.72u 0.02s 85.77r # *** -17%
binary-tree 15 # too slow to use 20
# memory allocation and garbage collection
gcc -O2 binary-tree.c -lm 0.87u 0.00s 0.87r
gccgo -O2 binary-tree.go 1.61u 0.47s 2.09r
gccgo -O2 binary-tree-freelist.go 0.00u 0.00s 0.01r
gc binary-tree 9.11u 0.01s 9.13r # *** -5%
gc binary-tree-freelist 0.47u 0.01s 0.48r
fannkuch 12
# bounds checking is half the difference
# rest might be registerization
gcc -O2 fannkuch.c 59.92u 0.00s 59.94r
gccgo -O2 fannkuch.go 65.54u 0.00s 65.58r
gc fannkuch 123.98u 0.01s 124.04r
gc_B fannkuch 90.75u 0.00s 90.78r
regex-dna 100000
# regexp code is slow on trivial regexp
gcc -O2 regex-dna.c -lpcre 0.91u 0.00s 0.92r
gc regex-dna 27.25u 0.02s 27.28r
gc_B regex-dna 29.51u 0.03s 29.55r
spectral-norm 5500
# possibly inline evalA
gcc -O2 spectral-norm.c -lm 11.57u 0.00s 11.57r
gccgo -O2 spectral-norm.go 12.07u 0.01s 12.08r
gc spectral-norm 23.99u 0.00s 24.00r
gc_B spectral-norm 23.73u 0.00s 23.75r
k-nucleotide 1000000
# string maps are slower than glib string maps
gcc -O2 -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include k-nucleotide.c -lglib-2.0 10.63u 0.02s 10.69r
gccgo -O2 k-nucleotide.go 23.19u 0.91s 24.12r
gc k-nucleotide 16.73u 0.04s 16.78r # *** +5% (but this one seems to vary by more than that)
gc_B k-nucleotide 16.46u 0.04s 16.51r # *** +5%
mandelbrot 16000
gcc -O2 mandelbrot.c 56.16u 0.00s 56.16r
gccgo -O2 mandelbrot.go 57.41u 0.01s 57.42r
gc mandelbrot 64.05u 0.02s 64.08r # *** -14%
gc_B mandelbrot 64.10u 0.02s 64.14r # *** -14%
meteor 16000
# we don't know
gcc -O2 meteor-contest.c 0.10u 0.00s 0.10r
gccgo -O2 meteor-contest.go 0.12u 0.00s 0.12r
gc meteor-contest 0.18u 0.00s 0.20r # *** -25%
gc_B meteor-contest 0.17u 0.00s 0.18r # *** -24%
pidigits 10000
# bignum is slower than gmp
gcc -O2 pidigits.c -lgmp 2.57u 0.00s 2.57r
gc pidigits 71.82u 0.04s 71.89r
gc_B pidigits 71.84u 0.08s 71.98r
threadring 50000000
gcc -O2 threadring.c -lpthread 30.91u 164.33s 204.57r
gccgo -O2 threadring.go 87.12u 460.04s 447.61r
gc threadring 38.55u 0.00s 38.56r # *** +16%
chameneos 6000000
gcc -O2 chameneosredux.c -lpthread 17.93u 323.65s 88.47r
gc chameneosredux 21.72u 0.00s 21.73r
August 10 2009
# In-place versions for some bignum operations.
pidigits 10000
gcc -O2 pidigits.c -lgmp 2.56u 0.00s 2.57r
gc pidigits 55.22u 0.04s 55.29r # *** -23%
gc_B pidigits 55.49u 0.02s 55.60r # *** -23%