mirror of
https://github.com/golang/go
synced 2024-11-25 04:27:56 -07:00
index/suffixarray: 4.5x faster index serialization (to memory)
Benchmark results (best of 3 runs): old: suffixarray.BenchmarkSaveRestore 1 1931909000 ns/op 28.21 MB/s new: suffixarray.BenchmarkSaveRestore 5 429721800 ns/op 117.14 MB/s R=golang-dev, r CC=golang-dev https://golang.org/cl/5161043
This commit is contained in:
parent
40c26fff14
commit
a7a7cc5a55
@ -18,7 +18,7 @@ package suffixarray
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"gob"
|
"encoding/binary"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
@ -37,17 +37,76 @@ func New(data []byte) *Index {
|
|||||||
return &Index{data, qsufsort(data)}
|
return &Index{data, qsufsort(data)}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read and Write slice the data into successive portions of length gobN,
|
// writeInt writes an int x to w using buf to buffer the write.
|
||||||
// so gob can allocate smaller buffers for its I/O.
|
func writeInt(w io.Writer, buf []byte, x int) os.Error {
|
||||||
const gobN = 1 << 16 // slightly better than say 1 << 20 (BenchmarkSaveRestore)
|
binary.PutVarint(buf, int64(x))
|
||||||
|
_, err := w.Write(buf[0:binary.MaxVarintLen64])
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// readInt reads an int x from r using buf to buffer the read and returns x.
|
||||||
|
func readInt(r io.Reader, buf []byte) (int, os.Error) {
|
||||||
|
_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
|
||||||
|
x, _ := binary.Varint(buf)
|
||||||
|
return int(x), err
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeSlice writes data[:n] to w and returns n.
|
||||||
|
// It uses buf to buffer the write.
|
||||||
|
func writeSlice(w io.Writer, buf []byte, data []int) (n int, err os.Error) {
|
||||||
|
// encode as many elements as fit into buf
|
||||||
|
p := binary.MaxVarintLen64
|
||||||
|
for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
|
||||||
|
p += binary.PutUvarint(buf[p:], uint64(data[n]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// update buffer size
|
||||||
|
binary.PutVarint(buf, int64(p))
|
||||||
|
|
||||||
|
// write buffer
|
||||||
|
_, err = w.Write(buf[0:p])
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// readSlice reads data[:n] from r and returns n.
|
||||||
|
// It uses buf to buffer the read.
|
||||||
|
func readSlice(r io.Reader, buf []byte, data []int) (n int, err os.Error) {
|
||||||
|
// read buffer size
|
||||||
|
var size int
|
||||||
|
size, err = readInt(r, buf)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// read buffer w/o the size
|
||||||
|
if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// decode as many elements as present in buf
|
||||||
|
for p := binary.MaxVarintLen64; p < size; n++ {
|
||||||
|
x, w := binary.Uvarint(buf[p:])
|
||||||
|
data[n] = int(x)
|
||||||
|
p += w
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
|
||||||
|
|
||||||
// Read reads the index from r into x; x must not be nil.
|
// Read reads the index from r into x; x must not be nil.
|
||||||
func (x *Index) Read(r io.Reader) os.Error {
|
func (x *Index) Read(r io.Reader) os.Error {
|
||||||
d := gob.NewDecoder(r)
|
// buffer for all reads
|
||||||
var n int
|
buf := make([]byte, bufSize)
|
||||||
if err := d.Decode(&n); err != nil {
|
|
||||||
|
// read length
|
||||||
|
n, err := readInt(r, buf)
|
||||||
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// allocate space
|
||||||
if 2*n < cap(x.data) || cap(x.data) < n {
|
if 2*n < cap(x.data) || cap(x.data) < n {
|
||||||
// new data is significantly smaller or larger then
|
// new data is significantly smaller or larger then
|
||||||
// existing buffers - allocate new ones
|
// existing buffers - allocate new ones
|
||||||
@ -58,51 +117,45 @@ func (x *Index) Read(r io.Reader) os.Error {
|
|||||||
x.data = x.data[0:n]
|
x.data = x.data[0:n]
|
||||||
x.sa = x.sa[0:n]
|
x.sa = x.sa[0:n]
|
||||||
}
|
}
|
||||||
for i := 0; i < n; {
|
|
||||||
j := i + gobN
|
// read data
|
||||||
if j > n {
|
if _, err := io.ReadFull(r, x.data); err != nil {
|
||||||
j = n
|
|
||||||
}
|
|
||||||
// data holds next piece of x.data; its length is updated by Decode
|
|
||||||
data := x.data[i:j]
|
|
||||||
if err := d.Decode(&data); err != nil {
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if len(data) != j-i {
|
|
||||||
return os.NewError("suffixarray.Read: inconsistent data format")
|
// read index
|
||||||
}
|
for sa := x.sa; len(sa) > 0; {
|
||||||
// sa holds next piece of x.data; its length is updated by Decode
|
n, err := readSlice(r, buf, sa)
|
||||||
sa := x.sa[i:j]
|
if err != nil {
|
||||||
if err := d.Decode(&sa); err != nil {
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if len(sa) != j-i {
|
sa = sa[n:]
|
||||||
return os.NewError("suffixarray.Read: inconsistent data format")
|
|
||||||
}
|
|
||||||
i = j
|
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write writes the index x to w.
|
// Write writes the index x to w.
|
||||||
func (x *Index) Write(w io.Writer) os.Error {
|
func (x *Index) Write(w io.Writer) os.Error {
|
||||||
e := gob.NewEncoder(w)
|
// buffer for all writes
|
||||||
n := len(x.data)
|
buf := make([]byte, bufSize)
|
||||||
if err := e.Encode(n); err != nil {
|
|
||||||
|
// write length
|
||||||
|
if err := writeInt(w, buf, len(x.data)); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for i := 0; i < n; {
|
|
||||||
j := i + gobN
|
// write data
|
||||||
if j > n {
|
if _, err := w.Write(x.data); err != nil {
|
||||||
j = n
|
|
||||||
}
|
|
||||||
if err := e.Encode(x.data[i:j]); err != nil {
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := e.Encode(x.sa[i:j]); err != nil {
|
|
||||||
|
// write index
|
||||||
|
for sa := x.sa; len(sa) > 0; {
|
||||||
|
n, err := writeSlice(w, buf, sa)
|
||||||
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
i = j
|
sa = sa[n:]
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -230,11 +230,13 @@ func equal(x, y *Index) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func testSaveRestore(t *testing.T, tc *testCase, x *Index) {
|
// returns the serialized index size
|
||||||
|
func testSaveRestore(t *testing.T, tc *testCase, x *Index) int {
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
if err := x.Write(&buf); err != nil {
|
if err := x.Write(&buf); err != nil {
|
||||||
t.Errorf("failed writing index %s (%s)", tc.name, err)
|
t.Errorf("failed writing index %s (%s)", tc.name, err)
|
||||||
}
|
}
|
||||||
|
size := buf.Len()
|
||||||
var y Index
|
var y Index
|
||||||
if err := y.Read(&buf); err != nil {
|
if err := y.Read(&buf); err != nil {
|
||||||
t.Errorf("failed reading index %s (%s)", tc.name, err)
|
t.Errorf("failed reading index %s (%s)", tc.name, err)
|
||||||
@ -242,6 +244,7 @@ func testSaveRestore(t *testing.T, tc *testCase, x *Index) {
|
|||||||
if !equal(x, &y) {
|
if !equal(x, &y) {
|
||||||
t.Errorf("restored index doesn't match saved index %s", tc.name)
|
t.Errorf("restored index doesn't match saved index %s", tc.name)
|
||||||
}
|
}
|
||||||
|
return size
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestIndex(t *testing.T) {
|
func TestIndex(t *testing.T) {
|
||||||
@ -284,13 +287,14 @@ func BenchmarkNewIndexRepeat(b *testing.B) {
|
|||||||
func BenchmarkSaveRestore(b *testing.B) {
|
func BenchmarkSaveRestore(b *testing.B) {
|
||||||
b.StopTimer()
|
b.StopTimer()
|
||||||
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
|
r := rand.New(rand.NewSource(0x5a77a1)) // guarantee always same sequence
|
||||||
data := make([]byte, 10<<20) // 10MB index data
|
data := make([]byte, 10<<20) // 10MB of data to index
|
||||||
for i := range data {
|
for i := range data {
|
||||||
data[i] = byte(r.Intn(256))
|
data[i] = byte(r.Intn(256))
|
||||||
}
|
}
|
||||||
x := New(data)
|
x := New(data)
|
||||||
testSaveRestore(nil, nil, x) // verify correctness
|
size := testSaveRestore(nil, nil, x) // verify correctness
|
||||||
buf := bytes.NewBuffer(make([]byte, len(data))) // avoid frequent growing
|
buf := bytes.NewBuffer(make([]byte, size)) // avoid growing
|
||||||
|
b.SetBytes(int64(size))
|
||||||
b.StartTimer()
|
b.StartTimer()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
x.Write(buf)
|
x.Write(buf)
|
||||||
|
Loading…
Reference in New Issue
Block a user