mirror of
https://github.com/golang/go
synced 2024-11-22 01:54:42 -07:00
scanner: error handler must be provided to Init
Init may report an error on the first character and thus one needs an ability to set the error handler for Init. Was a design bug.

Added corresponding test cases and better documentation. Also: Fixed a subtle infinite loop exposed by one of the new test cases.

Fixes #1380.

R=rsc, gri
CC=golang-dev
https://golang.org/cl/4094041
This commit is contained in:
parent
ab036abdf0
commit
1161d19024
@ -144,7 +144,7 @@ type Scanner struct {
|
|||||||
// the token text's head may be buffered in tokBuf while the token text's
|
// the token text's head may be buffered in tokBuf while the token text's
|
||||||
// tail is stored in srcBuf.
|
// tail is stored in srcBuf.
|
||||||
tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
|
tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
|
||||||
tokPos int // token text tail position (srcBuf index)
|
tokPos int // token text tail position (srcBuf index); valid if >= 0
|
||||||
tokEnd int // token text tail end (srcBuf index)
|
tokEnd int // token text tail end (srcBuf index)
|
||||||
|
|
||||||
// One character look-ahead
|
// One character look-ahead
|
||||||
@ -175,13 +175,14 @@ type Scanner struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Init initializes a Scanner with a new source and returns itself.
|
// Init initializes a Scanner with a new source and returns s.
|
||||||
// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
|
// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
|
||||||
// and Whitespace is set to GoWhitespace.
|
// and Whitespace is set to GoWhitespace.
|
||||||
func (s *Scanner) Init(src io.Reader) *Scanner {
|
func (s *Scanner) Init(src io.Reader) *Scanner {
|
||||||
s.src = src
|
s.src = src
|
||||||
|
|
||||||
// initialize source buffer
|
// initialize source buffer
|
||||||
|
// (the first call to next() will fill it by calling src.Read)
|
||||||
s.srcBuf[0] = utf8.RuneSelf // sentinel
|
s.srcBuf[0] = utf8.RuneSelf // sentinel
|
||||||
s.srcPos = 0
|
s.srcPos = 0
|
||||||
s.srcEnd = 0
|
s.srcEnd = 0
|
||||||
@ -192,10 +193,11 @@ func (s *Scanner) Init(src io.Reader) *Scanner {
|
|||||||
s.column = 0
|
s.column = 0
|
||||||
|
|
||||||
// initialize token text buffer
|
// initialize token text buffer
|
||||||
|
// (required for first call to next()).
|
||||||
s.tokPos = -1
|
s.tokPos = -1
|
||||||
|
|
||||||
// initialize one character look-ahead
|
// initialize one character look-ahead
|
||||||
s.ch = s.next()
|
s.ch = -1 // no char read yet
|
||||||
|
|
||||||
// initialize public fields
|
// initialize public fields
|
||||||
s.Error = nil
|
s.Error = nil
|
||||||
@ -222,15 +224,20 @@ func (s *Scanner) next() int {
|
|||||||
if s.tokPos >= 0 {
|
if s.tokPos >= 0 {
|
||||||
s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
|
s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
|
||||||
s.tokPos = 0
|
s.tokPos = 0
|
||||||
|
// s.tokEnd is set by Scan()
|
||||||
}
|
}
|
||||||
// move unread bytes to beginning of buffer
|
// move unread bytes to beginning of buffer
|
||||||
copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
|
copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
|
||||||
s.srcBufOffset += s.srcPos
|
s.srcBufOffset += s.srcPos
|
||||||
// read more bytes
|
// read more bytes
|
||||||
|
// (an io.Reader must return os.EOF when it reaches
|
||||||
|
// the end of what it is reading - simply returning
|
||||||
|
// n == 0 will make this loop retry forever; but the
|
||||||
|
// error is in the reader implementation in that case)
|
||||||
i := s.srcEnd - s.srcPos
|
i := s.srcEnd - s.srcPos
|
||||||
n, err := s.src.Read(s.srcBuf[i:bufLen])
|
n, err := s.src.Read(s.srcBuf[i:bufLen])
|
||||||
s.srcEnd = i + n
|
|
||||||
s.srcPos = 0
|
s.srcPos = 0
|
||||||
|
s.srcEnd = i + n
|
||||||
s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
|
s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if s.srcEnd == 0 {
|
if s.srcEnd == 0 {
|
||||||
@ -238,8 +245,12 @@ func (s *Scanner) next() int {
|
|||||||
}
|
}
|
||||||
if err != os.EOF {
|
if err != os.EOF {
|
||||||
s.error(err.String())
|
s.error(err.String())
|
||||||
break
|
|
||||||
}
|
}
|
||||||
|
// If err == EOF, we won't be getting more
|
||||||
|
// bytes; break to avoid infinite loop. If
|
||||||
|
// err is something else, we don't know if
|
||||||
|
// we can get more bytes; thus also break.
|
||||||
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// at least one byte
|
// at least one byte
|
||||||
@ -251,7 +262,7 @@ func (s *Scanner) next() int {
|
|||||||
if ch == utf8.RuneError && width == 1 {
|
if ch == utf8.RuneError && width == 1 {
|
||||||
s.error("illegal UTF-8 encoding")
|
s.error("illegal UTF-8 encoding")
|
||||||
}
|
}
|
||||||
s.srcPos += width - 1
|
s.srcPos += width - 1 // -1 because of s.srcPos++ below
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -272,13 +283,13 @@ func (s *Scanner) next() int {
|
|||||||
|
|
||||||
// Next reads and returns the next Unicode character.
|
// Next reads and returns the next Unicode character.
|
||||||
// It returns EOF at the end of the source. It reports
|
// It returns EOF at the end of the source. It reports
|
||||||
// a read error by calling s.Error, if set, or else
|
// a read error by calling s.Error, if not nil; otherwise
|
||||||
// prints an error message to os.Stderr. Next does not
|
// it prints an error message to os.Stderr. Next does not
|
||||||
// update the Scanner's Position field; use Pos() to
|
// update the Scanner's Position field; use Pos() to
|
||||||
// get the current position.
|
// get the current position.
|
||||||
func (s *Scanner) Next() int {
|
func (s *Scanner) Next() int {
|
||||||
s.tokPos = -1 // don't collect token text
|
s.tokPos = -1 // don't collect token text
|
||||||
ch := s.ch
|
ch := s.Peek()
|
||||||
s.ch = s.next()
|
s.ch = s.next()
|
||||||
return ch
|
return ch
|
||||||
}
|
}
|
||||||
@ -288,6 +299,9 @@ func (s *Scanner) Next() int {
|
|||||||
// the scanner. It returns EOF if the scanner's position is at the last
|
// the scanner. It returns EOF if the scanner's position is at the last
|
||||||
// character of the source.
|
// character of the source.
|
||||||
func (s *Scanner) Peek() int {
|
func (s *Scanner) Peek() int {
|
||||||
|
if s.ch < 0 {
|
||||||
|
s.ch = s.next()
|
||||||
|
}
|
||||||
return s.ch
|
return s.ch
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -511,10 +525,10 @@ func (s *Scanner) scanComment(ch int) {
|
|||||||
// Scan reads the next token or Unicode character from source and returns it.
|
// Scan reads the next token or Unicode character from source and returns it.
|
||||||
// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
|
// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
|
||||||
// It returns EOF at the end of the source. It reports scanner errors (read and
|
// It returns EOF at the end of the source. It reports scanner errors (read and
|
||||||
// token errors) by calling s.Error, if set; otherwise it prints an error message
|
// token errors) by calling s.Error, if not nil; otherwise it prints an error
|
||||||
// to os.Stderr.
|
// message to os.Stderr.
|
||||||
func (s *Scanner) Scan() int {
|
func (s *Scanner) Scan() int {
|
||||||
ch := s.ch
|
ch := s.Peek()
|
||||||
|
|
||||||
// reset token text position
|
// reset token text position
|
||||||
s.tokPos = -1
|
s.tokPos = -1
|
||||||
|
@ -10,6 +10,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -408,7 +409,7 @@ func TestScanWhitespace(t *testing.T) {
|
|||||||
func testError(t *testing.T, src, msg string, tok int) {
|
func testError(t *testing.T, src, msg string, tok int) {
|
||||||
s := new(Scanner).Init(bytes.NewBufferString(src))
|
s := new(Scanner).Init(bytes.NewBufferString(src))
|
||||||
errorCalled := false
|
errorCalled := false
|
||||||
s.Error = func(s *Scanner, m string) {
|
s.Error = func(_ *Scanner, m string) {
|
||||||
if !errorCalled {
|
if !errorCalled {
|
||||||
// only look at first error
|
// only look at first error
|
||||||
if m != msg {
|
if m != msg {
|
||||||
@ -431,6 +432,8 @@ func testError(t *testing.T, src, msg string, tok int) {
|
|||||||
|
|
||||||
|
|
||||||
func TestError(t *testing.T) {
|
func TestError(t *testing.T) {
|
||||||
|
testError(t, "\x00", "illegal character NUL", 0)
|
||||||
|
testError(t, "\xff", "illegal UTF-8 encoding", utf8.RuneError)
|
||||||
testError(t, `01238`, "illegal octal number", Int)
|
testError(t, `01238`, "illegal octal number", Int)
|
||||||
testError(t, `'\"'`, "illegal char escape", Char)
|
testError(t, `'\"'`, "illegal char escape", Char)
|
||||||
testError(t, `'aa'`, "illegal char literal", Char)
|
testError(t, `'aa'`, "illegal char literal", Char)
|
||||||
@ -467,6 +470,7 @@ func TestPos(t *testing.T) {
|
|||||||
s := new(Scanner).Init(bytes.NewBufferString("abc\n012\n\nx"))
|
s := new(Scanner).Init(bytes.NewBufferString("abc\n012\n\nx"))
|
||||||
s.Mode = 0
|
s.Mode = 0
|
||||||
s.Whitespace = 0
|
s.Whitespace = 0
|
||||||
|
s.Peek() // get a defined position
|
||||||
checkPos(t, s, 0, 1, 1, 'a')
|
checkPos(t, s, 0, 1, 1, 'a')
|
||||||
checkPos(t, s, 1, 1, 2, 'b')
|
checkPos(t, s, 1, 1, 2, 'b')
|
||||||
checkPos(t, s, 2, 1, 3, 'c')
|
checkPos(t, s, 2, 1, 3, 'c')
|
||||||
|
Loading…
Reference in New Issue
Block a user