encoding/csv: avoid allocations when reading records
This commit changes parseRecord to allocate a single string per record,
instead of per field, by using indexes into the raw record.
Benchstat (done with f69991c17):
name                          old time/op    new time/op    delta
Read-8                          3.17µs ± 0%    2.78µs ± 1%  -12.35%  (p=0.016 n=4+5)
ReadWithFieldsPerRecord-8       3.18µs ± 1%    2.79µs ± 1%  -12.23%  (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8    4.59µs ± 0%    2.77µs ± 0%  -39.58%  (p=0.016 n=4+5)
ReadLargeFields-8               57.0µs ± 0%    55.7µs ± 0%   -2.18%  (p=0.008 n=5+5)

name                          old alloc/op   new alloc/op   delta
Read-8                            660B ± 0%      664B ± 0%   +0.61%  (p=0.008 n=5+5)
ReadWithFieldsPerRecord-8         660B ± 0%      664B ± 0%   +0.61%  (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8    1.14kB ± 0%    0.66kB ± 0%  -41.75%  (p=0.008 n=5+5)
ReadLargeFields-8               3.86kB ± 0%    3.94kB ± 0%   +1.86%  (p=0.008 n=5+5)

name                          old allocs/op  new allocs/op  delta
Read-8                            30.0 ± 0%      18.0 ± 0%  -40.00%  (p=0.008 n=5+5)
ReadWithFieldsPerRecord-8         30.0 ± 0%      18.0 ± 0%  -40.00%  (p=0.008 n=5+5)
ReadWithoutFieldsPerRecord-8      50.0 ± 0%      18.0 ± 0%  -64.00%  (p=0.008 n=5+5)
ReadLargeFields-8                 66.0 ± 0%      24.0 ± 0%  -63.64%  (p=0.008 n=5+5)
For a simple application that I wrote, which reads in a CSV file (via
ReadAll) and outputs the number of rows read (15857625 rows), this change
reduces the total time on my notebook from ~58 seconds to ~48 seconds.
This reduces time and allocations (bytes) each by ~6% for a real-world
CSV file at work (~230000 rows, 13 columns).
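
The gist of the change, sketched outside the package for illustration (buildRecord is a hypothetical name, not code from this CL): collect the unescaped bytes of every field in one buffer, remember each field's start offset, convert the buffer to a string once per record, and slice the fields out of that single string.

package main

import (
	"bytes"
	"fmt"
)

// buildRecord illustrates the approach: one string allocation per record,
// with each field taken as a sub-slice of that string.
func buildRecord(rawFields [][]byte) []string {
	var buf bytes.Buffer
	starts := make([]int, 0, len(rawFields))
	for _, f := range rawFields {
		starts = append(starts, buf.Len()) // remember where this field begins
		buf.Write(f)
	}

	line := buf.String() // the single per-record string allocation
	fields := make([]string, len(starts))
	for i, start := range starts {
		if i == len(starts)-1 {
			fields[i] = line[start:]
		} else {
			fields[i] = line[start:starts[i+1]]
		}
	}
	return fields
}

func main() {
	fmt.Println(buildRecord([][]byte{[]byte("a"), []byte("b"), []byte(`c"d`), []byte("e")}))
	// Output: [a b c"d e]
}

Trading one string allocation per field for one per record is where the drop in allocs/op in the benchmarks above comes from.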
Updates #16791
Change-Id: Ia07177c94624e55cdd3064a7d2751fb69322d3e4
Reviewed-on: https://go-review.googlesource.com/24723
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Parent: dce0df29dd
Commit: bd06d4827a
@@ -114,7 +114,14 @@ type Reader struct {
 	line   int
 	column int
 	r      *bufio.Reader
-	field  bytes.Buffer
+	// lineBuffer holds the unescaped fields read by readField, one after another.
+	// The fields can be accessed by using the indexes in fieldIndexes.
+	// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
+	// fieldIndexes will contain the indexes 0, 1, 2, 5.
+	lineBuffer bytes.Buffer
+	// Indexes of fields inside lineBuffer
+	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
+	fieldIndexes []int
 }
 
 // NewReader returns a new Reader that reads from r.
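
To make the comment's example concrete, here is a small stand-alone sketch (not code from the package) showing how the row `a,"b","c""d",e` is recovered from a lineBuffer of `abc"de` and the indexes 0, 1, 2, 5:

package main

import "fmt"

func main() {
	// State corresponding to the comment's example row: a,"b","c""d",e
	lineBuffer := `abc"de`
	fieldIndexes := []int{0, 1, 2, 5}

	// Each field runs from its own start index to the next field's start.
	fields := make([]string, len(fieldIndexes))
	for i, idx := range fieldIndexes {
		if i == len(fieldIndexes)-1 {
			fields[i] = lineBuffer[idx:]
		} else {
			fields[i] = lineBuffer[idx:fieldIndexes[i+1]]
		}
	}
	fmt.Printf("%q\n", fields) // ["a" "b" "c\"d" "e"]
}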
@@ -233,31 +240,54 @@ func (r *Reader) parseRecord() (fields []string, err error) {
 	}
 	r.r.UnreadRune()
 
+	r.lineBuffer.Reset()
+	r.fieldIndexes = r.fieldIndexes[:0]
+
 	// At this point we have at least one field.
 	for {
+		idx := r.lineBuffer.Len()
+
 		haveField, delim, err := r.parseField()
 		if haveField {
-			// If FieldsPerRecord is greater than 0 we can assume the final
-			// length of fields to be equal to FieldsPerRecord.
-			if r.FieldsPerRecord > 0 && fields == nil {
-				fields = make([]string, 0, r.FieldsPerRecord)
-			}
-			fields = append(fields, r.field.String())
+			r.fieldIndexes = append(r.fieldIndexes, idx)
 		}
+
 		if delim == '\n' || err == io.EOF {
-			return fields, err
-		} else if err != nil {
+			if len(r.fieldIndexes) == 0 {
+				return nil, err
+			}
+			break
+		}
+
+		if err != nil {
 			return nil, err
 		}
 	}
+
+	fieldCount := len(r.fieldIndexes)
+	// Using this approach (creating a single string and taking slices of it)
+	// means that a single reference to any of the fields will retain the whole
+	// string. The risk of a nontrivial space leak caused by this is considered
+	// minimal and a tradeoff for better performance through the combined
+	// allocations.
+	line := r.lineBuffer.String()
+	fields = make([]string, fieldCount)
+
+	for i, idx := range r.fieldIndexes {
+		if i == fieldCount-1 {
+			fields[i] = line[idx:]
+		} else {
+			fields[i] = line[idx:r.fieldIndexes[i+1]]
+		}
+	}
+
+	return fields, nil
 }
 
 // parseField parses the next field in the record. The read field is
-// located in r.field. Delim is the first character not part of the field
+// appended to r.lineBuffer. Delim is the first character not part of the field
 // (r.Comma or '\n').
 func (r *Reader) parseField() (haveField bool, delim rune, err error) {
-	r.field.Reset()
-
 	r1, err := r.readRune()
 	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
 		r1, err = r.readRune()
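
As the comment added above notes, every field is a sub-slice of the single per-record string, so retaining even one field keeps the whole record's backing string reachable. A caller who wants to hold a small field long after the record can decouple it by copying; a minimal sketch (keepField is an illustrative helper, not part of this change):

package main

import "fmt"

// keepField returns a copy of s that does not share backing storage with the
// record string it was sliced from, so retaining it does not pin the record.
func keepField(s string) string {
	return string([]byte(s)) // the conversion copies the bytes
}

func main() {
	record := []string{"id", "some very large field", "more"}
	saved := keepField(record[0]) // safe to keep without retaining the whole record
	fmt.Println(saved)
}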
@@ -310,19 +340,19 @@ func (r *Reader) parseField() (haveField bool, delim rune, err error) {
 						return false, 0, r.error(ErrQuote)
 					}
 					// accept the bare quote
-					r.field.WriteRune('"')
+					r.lineBuffer.WriteRune('"')
 				}
 			case '\n':
 				r.line++
 				r.column = -1
 			}
-			r.field.WriteRune(r1)
+			r.lineBuffer.WriteRune(r1)
 		}
 
 	default:
 		// unquoted field
 		for {
-			r.field.WriteRune(r1)
+			r.lineBuffer.WriteRune(r1)
 			r1, err = r.readRune()
 			if err != nil || r1 == r.Comma {
 				break
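
For reference, the kind of program mentioned in the commit message (read a CSV file via ReadAll and print the number of rows) can look roughly like the following sketch; data.csv is a placeholder file name:

package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"os"
)

func main() {
	f, err := os.Open("data.csv") // placeholder file name
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// ReadAll parses every record up front; the per-record allocation
	// behavior changed by this commit dominates its cost on large files.
	records, err := csv.NewReader(f).ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(len(records), "rows read")
}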