1
0
mirror of https://github.com/golang/go synced 2024-11-24 21:10:04 -07:00

suffixarray: improved serialization code

Use gobs to serialize indexes instead of encoding/binary.

Even with gobs, serialize data in slices instead of
applying gob to the entire data structure at once,
to reduce the amount of extra buffer memory needed
inside gob.

7x faster Write/Read for new BenchmarkSaveRestore
compared to old code; possibly because encoding/binary
is more expensive for int32 slice elements (interface
call to get little/big endian encoding), while gob's
encoding is fixed (unconfirmed).

new (using gobs):
suffixarray.BenchmarkSaveRestore	       1	2153604000 ns/op

old (using encoding/binary):
suffixarray.BenchmarkSaveRestore	       1	15118322000 ns/op

The actual serialized data is slightly larger then using
the old code for very large indices because full 32bit indices
require 5bytes using gobs instead of 4bytes (encoding/binary)
in serialized form.

R=r
CC=golang-dev
https://golang.org/cl/5087041
This commit is contained in:
Robert Griesemer 2011-09-20 14:36:19 -07:00
parent 86e65bac5c
commit 5ee7ef90cd
2 changed files with 65 additions and 18 deletions

View File

@ -18,8 +18,8 @@ package suffixarray
import (
"bytes"
"encoding/binary"
"exp/regexp"
"gob"
"io"
"os"
"sort"
@ -37,13 +37,18 @@ func New(data []byte) *Index {
return &Index{data, qsufsort(data)}
}
// Read and Write slice the data into successive portions of length gobN,
// so gob can allocate smaller buffers for its I/O.
const gobN = 1 << 16 // slightly better than say 1 << 20 (BenchmarkSaveRestore)
// Read reads the index from r into x; x must not be nil.
func (x *Index) Read(r io.Reader) os.Error {
var n int32
if err := binary.Read(r, binary.LittleEndian, &n); err != nil {
d := gob.NewDecoder(r)
var n int
if err := d.Decode(&n); err != nil {
return err
}
if 2*n < int32(cap(x.data)) || int32(cap(x.data)) < n {
if 2*n < cap(x.data) || cap(x.data) < n {
// new data is significantly smaller or larger then
// existing buffers - allocate new ones
x.data = make([]byte, n)
@ -53,28 +58,51 @@ func (x *Index) Read(r io.Reader) os.Error {
x.data = x.data[0:n]
x.sa = x.sa[0:n]
}
if err := binary.Read(r, binary.LittleEndian, x.data); err != nil {
return err
for i := 0; i < n; {
j := i + gobN
if j > n {
j = n
}
// data holds next piece of x.data; its length is updated by Decode
data := x.data[i:j]
if err := d.Decode(&data); err != nil {
return err
}
if len(data) != j-i {
return os.NewError("suffixarray.Read: inconsistent data format")
}
// sa holds next piece of x.data; its length is updated by Decode
sa := x.sa[i:j]
if err := d.Decode(&sa); err != nil {
return err
}
if len(sa) != j-i {
return os.NewError("suffixarray.Read: inconsistent data format")
}
i = j
}
if err := binary.Read(r, binary.LittleEndian, x.sa); err != nil {
return err
}
return nil
}
// Write writes the index x to w.
func (x *Index) Write(w io.Writer) os.Error {
n := int32(len(x.data))
if err := binary.Write(w, binary.LittleEndian, n); err != nil {
e := gob.NewEncoder(w)
n := len(x.data)
if err := e.Encode(n); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, x.data); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, x.sa); err != nil {
return err
for i := 0; i < n; {
j := i + gobN
if j > n {
j = n
}
if err := e.Encode(x.data[i:j]); err != nil {
return err
}
if err := e.Encode(x.sa[i:j]); err != nil {
return err
}
i = j
}
return nil
}

View File

@ -7,6 +7,7 @@ package suffixarray
import (
"bytes"
"exp/regexp"
"rand"
"sort"
"strings"
"testing"
@ -255,3 +256,21 @@ func TestIndex(t *testing.T) {
testLookups(t, &tc, x, -1)
}
}
func BenchmarkSaveRestore(b *testing.B) {
b.StopTimer()
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
data := make([]byte, 10<<20) // 10MB index data
for i := range data {
data[i] = byte(r.Intn(256))
}
x := New(data)
testSaveRestore(nil, nil, x) // verify correctness
buf := bytes.NewBuffer(make([]byte, len(data))) // avoid frequent growing
b.StartTimer()
for i := 0; i < b.N; i++ {
x.Write(buf)
var y Index
y.Read(buf)
}
}