encoding/csv: avoid allocations when reading records

This commit changes parseRecord to allocate a single string per record, instead of per field, by using indexes into the raw record. Benchstat (done with f69991c17) name old time/op new time/op delta Read-8 3.17µs ± 0% 2.78µs ± 1% -12.35% (p=0.016 n=4+5) ReadWithFieldsPerRecord-8 3.18µs ± 1% 2.79µs ± 1% -12.23% (p=0.008 n=5+5) ReadWithoutFieldsPerRecord-8 4.59µs ± 0% 2.77µs ± 0% -39.58% (p=0.016 n=4+5) ReadLargeFields-8 57.0µs ± 0% 55.7µs ± 0% -2.18% (p=0.008 n=5+5) name old alloc/op new alloc/op delta Read-8 660B ± 0% 664B ± 0% +0.61% (p=0.008 n=5+5) ReadWithFieldsPerRecord-8 660B ± 0% 664B ± 0% +0.61% (p=0.008 n=5+5) ReadWithoutFieldsPerRecord-8 1.14kB ± 0% 0.66kB ± 0% -41.75% (p=0.008 n=5+5) ReadLargeFields-8 3.86kB ± 0% 3.94kB ± 0% +1.86% (p=0.008 n=5+5) name old allocs/op new allocs/op delta Read-8 30.0 ± 0% 18.0 ± 0% -40.00% (p=0.008 n=5+5) ReadWithFieldsPerRecord-8 30.0 ± 0% 18.0 ± 0% -40.00% (p=0.008 n=5+5) ReadWithoutFieldsPerRecord-8 50.0 ± 0% 18.0 ± 0% -64.00% (p=0.008 n=5+5) ReadLargeFields-8 66.0 ± 0% 24.0 ± 0% -63.64% (p=0.008 n=5+5) For a simple application that I wrote, which reads in a CSV file (via ReadAll) and outputs the number of rows read (15857625 rows), this change reduces the total time on my notebook from ~58 seconds to ~48 seconds. This reduces time and allocations (bytes) each by ~6% for a real world CSV file at work (~230000 rows, 13 colums). Updates #16791 Change-Id: Ia07177c94624e55cdd3064a7d2751fb69322d3e4 Reviewed-on: https://go-review.googlesource.com/24723 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
2024-11-18 13:54:59 -07:00 · 2016-07-03 17:49:29 +02:00 · 2016-07-03 17:49:29 +02:00 · bd06d4827a
commit bd06d4827a
parent dce0df29dd
1 changed files with 45 additions and 15 deletions
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@ -114,7 +114,14 @@ type Reader struct {
 	line   int
 	column int
 	r      *bufio.Reader
-	field  bytes.Buffer
+	// lineBuffer holds the unescaped fields read by readField, one after another.
+	// The fields can be accessed by using the indexes in fieldIndexes.
+	// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
+	// fieldIndexes will contain the indexes 0, 1, 2, 5.
+	lineBuffer bytes.Buffer
+	// Indexes of fields inside lineBuffer
+	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
+	fieldIndexes []int
 }

 // NewReader returns a new Reader that reads from r.
@ -233,31 +240,54 @@ func (r *Reader) parseRecord() (fields []string, err error) {
 	}
 	r.r.UnreadRune()

+	r.lineBuffer.Reset()
+	r.fieldIndexes = r.fieldIndexes[:0]
+
 	// At this point we have at least one field.
 	for {
+		idx := r.lineBuffer.Len()
+
 		haveField, delim, err := r.parseField()
 		if haveField {
-			// If FieldsPerRecord is greater than 0 we can assume the final
-			// length of fields to be equal to FieldsPerRecord.
-			if r.FieldsPerRecord > 0 && fields == nil {
-				fields = make([]string, 0, r.FieldsPerRecord)
-			}
-			fields = append(fields, r.field.String())
+			r.fieldIndexes = append(r.fieldIndexes, idx)
 		}
+
 		if delim == '\n' || err == io.EOF {
-			return fields, err
-		} else if err != nil {
+			if len(r.fieldIndexes) == 0 {
+				return nil, err
+			}
+			break
+		}
+
+		if err != nil {
 			return nil, err
 		}
 	}
+
+	fieldCount := len(r.fieldIndexes)
+	// Using this approach (creating a single string and taking slices of it)
+	// means that a single reference to any of the fields will retain the whole
+	// string. The risk of a nontrivial space leak caused by this is considered
+	// minimal and a tradeoff for better performance through the combined
+	// allocations.
+	line := r.lineBuffer.String()
+	fields = make([]string, fieldCount)
+
+	for i, idx := range r.fieldIndexes {
+		if i == fieldCount-1 {
+			fields[i] = line[idx:]
+		} else {
+			fields[i] = line[idx:r.fieldIndexes[i+1]]
+		}
+	}
+
+	return fields, nil
 }

 // parseField parses the next field in the record. The read field is
-// located in r.field. Delim is the first character not part of the field
+// appended to r.lineBuffer. Delim is the first character not part of the field
 // (r.Comma or '\n').
 func (r *Reader) parseField() (haveField bool, delim rune, err error) {
-	r.field.Reset()
-
 	r1, err := r.readRune()
 	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
 		r1, err = r.readRune()
@ -310,19 +340,19 @@ func (r *Reader) parseField() (haveField bool, delim rune, err error) {
 						return false, 0, r.error(ErrQuote)
 					}
 					// accept the bare quote
-					r.field.WriteRune('"')
+					r.lineBuffer.WriteRune('"')
 				}
 			case '\n':
 				r.line++
 				r.column = -1
 			}
-			r.field.WriteRune(r1)
+			r.lineBuffer.WriteRune(r1)
 		}

 	default:
 		// unquoted field
 		for {
-			r.field.WriteRune(r1)
+			r.lineBuffer.WriteRune(r1)
 			r1, err = r.readRune()
 			if err != nil || r1 == r.Comma {
 				break