1
0
mirror of https://github.com/golang/go synced 2024-11-25 06:37:58 -07:00

index/suffixarray: 4.5x faster index serialization (to memory)

Benchmark results (best of 3 runs):

old: suffixarray.BenchmarkSaveRestore	       1	1931909000 ns/op	  28.21 MB/s
new: suffixarray.BenchmarkSaveRestore	       5	 429721800 ns/op	 117.14 MB/s

R=golang-dev, r
CC=golang-dev
https://golang.org/cl/5161043
This commit is contained in:
Robert Griesemer 2011-09-30 11:31:28 -07:00
parent 40c26fff14
commit a7a7cc5a55
2 changed files with 101 additions and 44 deletions

View File

@ -18,7 +18,7 @@ package suffixarray
import ( import (
"bytes" "bytes"
"gob" "encoding/binary"
"io" "io"
"os" "os"
"regexp" "regexp"
@ -37,17 +37,76 @@ func New(data []byte) *Index {
return &Index{data, qsufsort(data)} return &Index{data, qsufsort(data)}
} }
// Read and Write slice the data into successive portions of length gobN, // writeInt writes an int x to w using buf to buffer the write.
// so gob can allocate smaller buffers for its I/O. func writeInt(w io.Writer, buf []byte, x int) os.Error {
const gobN = 1 << 16 // slightly better than say 1 << 20 (BenchmarkSaveRestore) binary.PutVarint(buf, int64(x))
_, err := w.Write(buf[0:binary.MaxVarintLen64])
return err
}
// readInt fills the first binary.MaxVarintLen64 bytes of buf from r, decodes
// a signed varint from buf, and returns the decoded value as an int.
//
// A failed read does not stop the decoding step: whatever buf holds is still
// decoded and handed back together with the read error.
func readInt(r io.Reader, buf []byte) (int, os.Error) {
	header := buf[:binary.MaxVarintLen64]
	_, readErr := io.ReadFull(r, header)
	// Decode unconditionally; the caller decides what to do with readErr.
	value, _ := binary.Varint(buf)
	return int(value), readErr
}
// writeSlice encodes a prefix data[:n] into buf and writes it to w as one
// chunk, returning n, the number of elements consumed.
//
// Chunk layout: a header of binary.MaxVarintLen64 bytes holding the total
// chunk length (header included) as a signed varint, followed by the
// elements as unsigned varints.
func writeSlice(w io.Writer, buf []byte, data []int) (n int, err os.Error) {
	const header = binary.MaxVarintLen64

	// Pack elements after the header for as long as a worst-case varint
	// is still guaranteed to fit into buf.
	end := header
	for n < len(data) {
		if end+binary.MaxVarintLen64 > len(buf) {
			break
		}
		end += binary.PutUvarint(buf[end:], uint64(data[n]))
		n++
	}

	// The header records the chunk length, which is known only now.
	binary.PutVarint(buf, int64(end))

	_, err = w.Write(buf[:end])
	return n, err
}
// readSlice reads one chunk from r, decodes its elements into data[:n],
// and returns n. It uses buf to buffer the read.
//
// A chunk starts with a header of binary.MaxVarintLen64 bytes holding the
// total chunk length (header included), followed by the elements encoded
// as unsigned varints.
//
// Malformed input is reported as io.ErrUnexpectedEOF instead of causing a
// panic. This covers a chunk length that does not fit in buf, a truncated
// or overlong varint, and more elements than data can hold.
func readSlice(r io.Reader, buf []byte, data []int) (n int, err os.Error) {
	// read buffer size
	var size int
	size, err = readInt(r, buf)
	if err != nil {
		return
	}

	// The size comes straight from the input; it must describe a chunk
	// that includes its own header and fits into buf, otherwise the
	// slice expression below would panic.
	if size < binary.MaxVarintLen64 || size > len(buf) {
		return 0, io.ErrUnexpectedEOF
	}

	// read buffer w/o the size
	if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
		return
	}

	// decode as many elements as present in buf
	for p := binary.MaxVarintLen64; p < size; n++ {
		// Decode from buf[p:size] so that a varint cannot run past the
		// chunk into stale bytes left in buf by an earlier read.
		x, w := binary.Uvarint(buf[p:size])
		// w <= 0 means a truncated or overflowing varint; advancing by
		// it would stall or rewind p. n >= len(data) means the chunk
		// holds more elements than the caller has room for.
		if w <= 0 || n >= len(data) {
			return n, io.ErrUnexpectedEOF
		}
		data[n] = int(x)
		p += w
	}
	return
}
// bufSize is the length in bytes of the scratch buffer that Read and Write
// allocate for decoding and encoding.
const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
// Read reads the index from r into x; x must not be nil. // Read reads the index from r into x; x must not be nil.
func (x *Index) Read(r io.Reader) os.Error { func (x *Index) Read(r io.Reader) os.Error {
d := gob.NewDecoder(r) // buffer for all reads
var n int buf := make([]byte, bufSize)
if err := d.Decode(&n); err != nil {
// read length
n, err := readInt(r, buf)
if err != nil {
return err return err
} }
// allocate space
if 2*n < cap(x.data) || cap(x.data) < n { if 2*n < cap(x.data) || cap(x.data) < n {
// new data is significantly smaller or larger than // new data is significantly smaller or larger than
// existing buffers - allocate new ones // existing buffers - allocate new ones
@ -58,51 +117,45 @@ func (x *Index) Read(r io.Reader) os.Error {
x.data = x.data[0:n] x.data = x.data[0:n]
x.sa = x.sa[0:n] x.sa = x.sa[0:n]
} }
for i := 0; i < n; {
j := i + gobN // read data
if j > n { if _, err := io.ReadFull(r, x.data); err != nil {
j = n return err
} }
// data holds next piece of x.data; its length is updated by Decode
data := x.data[i:j] // read index
if err := d.Decode(&data); err != nil { for sa := x.sa; len(sa) > 0; {
n, err := readSlice(r, buf, sa)
if err != nil {
return err return err
} }
if len(data) != j-i { sa = sa[n:]
return os.NewError("suffixarray.Read: inconsistent data format")
}
// sa holds next piece of x.data; its length is updated by Decode
sa := x.sa[i:j]
if err := d.Decode(&sa); err != nil {
return err
}
if len(sa) != j-i {
return os.NewError("suffixarray.Read: inconsistent data format")
}
i = j
} }
return nil return nil
} }
// Write writes the index x to w. // Write writes the index x to w.
func (x *Index) Write(w io.Writer) os.Error { func (x *Index) Write(w io.Writer) os.Error {
e := gob.NewEncoder(w) // buffer for all writes
n := len(x.data) buf := make([]byte, bufSize)
if err := e.Encode(n); err != nil {
// write length
if err := writeInt(w, buf, len(x.data)); err != nil {
return err return err
} }
for i := 0; i < n; {
j := i + gobN // write data
if j > n { if _, err := w.Write(x.data); err != nil {
j = n return err
} }
if err := e.Encode(x.data[i:j]); err != nil {
// write index
for sa := x.sa; len(sa) > 0; {
n, err := writeSlice(w, buf, sa)
if err != nil {
return err return err
} }
if err := e.Encode(x.sa[i:j]); err != nil { sa = sa[n:]
return err
}
i = j
} }
return nil return nil
} }

View File

@ -230,11 +230,13 @@ func equal(x, y *Index) bool {
return true return true
} }
func testSaveRestore(t *testing.T, tc *testCase, x *Index) { // returns the serialized index size
func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
var buf bytes.Buffer var buf bytes.Buffer
if err := x.Write(&buf); err != nil { if err := x.Write(&buf); err != nil {
t.Errorf("failed writing index %s (%s)", tc.name, err) t.Errorf("failed writing index %s (%s)", tc.name, err)
} }
size := buf.Len()
var y Index var y Index
if err := y.Read(&buf); err != nil { if err := y.Read(&buf); err != nil {
t.Errorf("failed reading index %s (%s)", tc.name, err) t.Errorf("failed reading index %s (%s)", tc.name, err)
@ -242,6 +244,7 @@ func testSaveRestore(t *testing.T, tc *testCase, x *Index) {
if !equal(x, &y) { if !equal(x, &y) {
t.Errorf("restored index doesn't match saved index %s", tc.name) t.Errorf("restored index doesn't match saved index %s", tc.name)
} }
return size
} }
func TestIndex(t *testing.T) { func TestIndex(t *testing.T) {
@ -284,13 +287,14 @@ func BenchmarkNewIndexRepeat(b *testing.B) {
func BenchmarkSaveRestore(b *testing.B) { func BenchmarkSaveRestore(b *testing.B) {
b.StopTimer() b.StopTimer()
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
data := make([]byte, 10<<20) // 10MB index data data := make([]byte, 10<<20) // 10MB of data to index
for i := range data { for i := range data {
data[i] = byte(r.Intn(256)) data[i] = byte(r.Intn(256))
} }
x := New(data) x := New(data)
testSaveRestore(nil, nil, x) // verify correctness size := testSaveRestore(nil, nil, x) // verify correctness
buf := bytes.NewBuffer(make([]byte, len(data))) // avoid frequent growing buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
b.SetBytes(int64(size))
b.StartTimer() b.StartTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
x.Write(buf) x.Write(buf)