1
0
mirror of https://github.com/golang/go synced 2024-11-26 05:07:59 -07:00

go/scanner: optimize scanIdentifier

While profiling parsing, I noticed that scanIdentifier was extremely
hot, and could be optimized: it is responsible for a significant
fraction of scanning and had a lot of unnecessary branching, bounds
checks, and function calls.

This CL implements some of those optimizations, while trying to strike a
balance between optimization and readability. It achieves this by
optimizing for the common case of ASCII identifiers, falling back on the
slower scan when encountering the first non-ASCII character.

Benchmark results:

name                               old time/op    new time/op    delta
Scan-12                              16.9µs ± 4%    15.8µs ± 5%   -6.92%  (p=0.000 n=20+18)
ScanFiles/go/types/expr.go-12         793µs ± 4%     672µs ± 6%  -15.23%  (p=0.000 n=20+20)
ScanFiles/go/parser/parser.go-12     1.08ms ± 6%    0.90ms ± 4%  -16.68%  (p=0.000 n=20+20)
ScanFiles/net/http/server.go-12      1.44ms ± 4%    1.23ms ± 5%  -14.58%  (p=0.000 n=18+20)
ScanFiles/go/scanner/errors.go-12    40.7µs ± 2%    32.6µs ± 3%  -20.01%  (p=0.000 n=19+20)

Change-Id: If78380004248e3ea75cfc78eb7f38f528124dced
Reviewed-on: https://go-review.googlesource.com/c/go/+/308611
Trust: Robert Findley <rfindley@google.com>
Trust: Robert Griesemer <gri@golang.org>
Run-TryBot: Robert Findley <rfindley@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
This commit is contained in:
Rob Findley 2021-04-01 12:23:22 -04:00 committed by Robert Findley
parent 074a49bfe8
commit cde6a675bc

View File

@ -48,11 +48,16 @@ type Scanner struct {
ErrorCount int // number of errors encountered
}
const bom = 0xFEFF // byte order mark, only permitted as very first character
const (
bom = 0xFEFF // byte order mark, only permitted as very first character
eof = -1 // end of file
)
// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
// For optimization, there is some overlap between this method and
// s.scanIdentifier.
func (s *Scanner) next() {
if s.rdOffset < len(s.src) {
s.offset = s.rdOffset
@ -81,7 +86,7 @@ func (s *Scanner) next() {
s.lineOffset = s.offset
s.file.AddLine(s.offset)
}
s.ch = -1 // eof
s.ch = eof
}
}
@ -347,11 +352,53 @@ func isDigit(ch rune) bool {
return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
offs := s.offset
for isLetter(s.ch) || isDigit(s.ch) {
// Optimize for the common case of an ASCII identifier.
//
// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
// avoids conversions to runes.
//
// In case we encounter a non-ASCII character, fall back on the slower path
// of calling into s.next().
for rdOffset, b := range s.src[s.rdOffset:] {
if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
// Avoid assigning a rune for the common case of an ascii character.
continue
}
s.rdOffset += rdOffset
if b < utf8.RuneSelf {
// Optimization: we've encountered an ASCII character that's not a letter
// or number. Avoid the call into s.next() and corresponding set up.
//
// Note that s.next() does some line accounting if s.ch is '\n', so this
// shortcut is only possible because we know that the preceding character
// is not '\n'.
s.ch = rune(b)
s.offset = s.rdOffset
s.rdOffset++
goto exit
}
// We know that the preceding character is valid for an identifier because
// scanIdentifier is only called when s.ch is a letter, so calling s.next()
// at s.rdOffset resets the scanner state.
s.next()
for isLetter(s.ch) || isDigit(s.ch) {
s.next()
}
goto exit
}
s.offset = len(s.src)
s.rdOffset = len(s.src)
s.ch = eof
exit:
return string(s.src[offs:s.offset])
}