From 396228a6525b211e0c368f255724eeefef264062 Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Tue, 2 Nov 2010 10:38:07 -0700 Subject: [PATCH] go/scanner: removed scanner/internal-only uses of token.Position First step towards a more light-weight implementation of token.Position: - only use token.Position for reporting token and error position - use offsets only for scanner control - no interface changes yet R=rsc CC=golang-dev https://golang.org/cl/2825041 --- src/pkg/go/scanner/scanner.go | 225 +++++++++++++++++------------ src/pkg/go/scanner/scanner_test.go | 20 +-- 2 files changed, 141 insertions(+), 104 deletions(-) diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go index 663636c46ee..ab117147059 100644 --- a/src/pkg/go/scanner/scanner.go +++ b/src/pkg/go/scanner/scanner.go @@ -29,10 +29,14 @@ type Scanner struct { mode uint // scanning mode // scanning state - pos token.Position // previous reading position (position before ch) - offset int // current reading offset (position after ch) - ch int // one char look-ahead - insertSemi bool // insert a semicolon before next newline + filename string // current filename; may change via //line filename:line comment + line int // current line + column int // current column + + ch int // current character + offset int // character offset + rdOffset int // reading offset (position after current character) + insertSemi bool // insert a semicolon before next newline // public state - ok to modify ErrorCount int // number of errors encountered @@ -43,29 +47,31 @@ type Scanner struct { // S.ch < 0 means end-of-file. // func (S *Scanner) next() { - if S.offset < len(S.src) { - S.pos.Offset = S.offset - S.pos.Column++ + S.column++ + if S.rdOffset < len(S.src) { + S.offset = S.rdOffset if S.ch == '\n' { - // next character starts a new line - S.pos.Line++ - S.pos.Column = 1 + S.line++ + S.column = 1 } - r, w := int(S.src[S.offset]), 1 + r, w := int(S.src[S.rdOffset]), 1 switch { case r == 0: - S.error(S.pos, "illegal character NUL") + S.error("illegal character NUL") case r >= 0x80: // not ASCII - r, w = utf8.DecodeRune(S.src[S.offset:]) + r, w = utf8.DecodeRune(S.src[S.rdOffset:]) if r == utf8.RuneError && w == 1 { - S.error(S.pos, "illegal UTF-8 encoding") + S.error("illegal UTF-8 encoding") } } - S.offset += w + S.rdOffset += w S.ch = r } else { - S.pos.Offset = len(S.src) + S.offset = len(S.src) + if S.ch == '\n' { + S.column = 1 + } S.ch = -1 // eof } } @@ -94,9 +100,17 @@ func (S *Scanner) Init(filename string, src []byte, err ErrorHandler, mode uint) S.src = src S.err = err S.mode = mode - S.pos = token.Position{filename, 0, 1, 0} + + S.filename = filename + S.line = 1 + S.column = 0 + + S.ch = ' ' S.offset = 0 + S.rdOffset = 0 + S.insertSemi = false S.ErrorCount = 0 + S.next() } @@ -131,7 +145,12 @@ func charString(ch int) string { } -func (S *Scanner) error(pos token.Position, msg string) { +func (S *Scanner) error(msg string) { + S.errorAt(token.Position{S.filename, S.offset, S.line, S.column}, msg) +} + + +func (S *Scanner) errorAt(pos token.Position, msg string) { if S.err != nil { S.err.Error(pos, msg) } @@ -139,18 +158,28 @@ func (S *Scanner) error(pos token.Position, msg string) { } -func (S *Scanner) expect(ch int) { - if S.ch != ch { - S.error(S.pos, "expected "+charString(ch)+", found "+charString(S.ch)) +var prefix = []byte("//line ") + +func (S *Scanner) interpretLineComment(text []byte) { + if bytes.HasPrefix(text, prefix) { + // get filename and line number, if any + if i := bytes.Index(text, []byte{':'}); i > 0 { + if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { + // valid //line filename:line comment; + // update scanner position + S.filename = string(text[len(prefix):i]) + S.line = line - 1 // -1 since the '\n' has not been consumed yet + } + } } - S.next() // always make progress } -var prefix = []byte("line ") - -func (S *Scanner) scanComment(pos token.Position) { - // first '/' already consumed +func (S *Scanner) scanComment() { + // initial '/' already consumed; S.ch == '/' || S.ch == '*' + offs := S.offset - 1 // position of initial '/' + col := S.column - 1 + pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1} if S.ch == '/' { //-style comment @@ -159,21 +188,9 @@ func (S *Scanner) scanComment(pos token.Position) { if S.ch == '\n' { // '\n' is not part of the comment for purposes of scanning // (the comment ends on the same line where it started) - if pos.Column == 1 { - text := S.src[pos.Offset+2 : S.pos.Offset] - if bytes.HasPrefix(text, prefix) { - // comment starts at beginning of line with "//line "; - // get filename and line number, if any - i := bytes.Index(text, []byte{':'}) - if i >= 0 { - if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { - // valid //line filename:line comment; - // update scanner position - S.pos.Filename = string(text[len(prefix):i]) - S.pos.Line = line - 1 // -1 since the '\n' has not been consumed yet - } - } - } + if col == 1 { + // comment starts at the beginning of the current line + S.interpretLineComment(S.src[offs:S.offset]) } return } @@ -181,7 +198,7 @@ func (S *Scanner) scanComment(pos token.Position) { } else { /*-style comment */ - S.expect('*') + S.next() for S.ch >= 0 { ch := S.ch S.next() @@ -192,47 +209,56 @@ func (S *Scanner) scanComment(pos token.Position) { } } - S.error(pos, "comment not terminated") + S.errorAt(pos, "comment not terminated") } -func (S *Scanner) findLineEnd(pos token.Position) bool { - // initial '/' already consumed; pos is position of '/' +func (S *Scanner) findLineEnd() bool { + // initial '/' already consumed + + defer func(line, col, offs int) { + // reset scanner state to where it was upon calling findLineEnd + // (we don't scan //line comments and ignore errors thus + // S.filename and S.ErrorCount don't change) + S.line = line + S.column = col + S.ch = '/' + S.offset = offs + S.rdOffset = offs + 1 + S.next() // consume initial '/' again + }(S.line, S.column-1, S.offset-1) // read ahead until a newline, EOF, or non-comment token is found - lineend := false - for pos1 := pos; S.ch == '/' || S.ch == '*'; { + for S.ch == '/' || S.ch == '*' { if S.ch == '/' { //-style comment always contains a newline - lineend = true - break + return true } - S.scanComment(pos1) - if pos1.Line < S.pos.Line { - /*-style comment contained a newline */ - lineend = true - break + /*-style comment: look for newline */ + S.next() + for S.ch >= 0 { + ch := S.ch + if ch == '\n' { + return true + } + S.next() + if ch == '*' && S.ch == '/' { + S.next() + break + } } S.skipWhitespace() // S.insertSemi is set if S.ch < 0 || S.ch == '\n' { - // line end - lineend = true - break + return true } if S.ch != '/' { // non-comment token - break + return false } - pos1 = S.pos S.next() // consume '/' } - // reset position to where it was upon calling findLineEnd - S.pos = pos - S.offset = pos.Offset + 1 - S.next() // consume initial '/' again - - return lineend + return false } @@ -247,11 +273,11 @@ func isDigit(ch int) bool { func (S *Scanner) scanIdentifier() token.Token { - pos := S.pos.Offset + offs := S.offset for isLetter(S.ch) || isDigit(S.ch) { S.next() } - return token.Lookup(S.src[pos:S.pos.Offset]) + return token.Lookup(S.src[offs:S.offset]) } @@ -275,7 +301,7 @@ func (S *Scanner) scanMantissa(base int) { } -func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.Token { +func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token { // digitVal(S.ch) < 10 tok := token.INT @@ -287,6 +313,7 @@ func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.To if S.ch == '0' { // int or float + pos := token.Position{S.filename, S.offset, S.line, S.column} S.next() if S.ch == 'x' || S.ch == 'X' { // hexadecimal int @@ -306,7 +333,7 @@ func (S *Scanner) scanNumber(pos token.Position, seenDecimalPoint bool) token.To } // octal int if seenDecimalDigit { - S.error(pos, "illegal octal number") + S.errorAt(pos, "illegal octal number") } } goto exit @@ -343,7 +370,7 @@ exit: func (S *Scanner) scanEscape(quote int) { - pos := S.pos + pos := token.Position{S.filename, S.offset, S.line, S.column} var i, base, max uint32 switch S.ch { @@ -363,7 +390,7 @@ func (S *Scanner) scanEscape(quote int) { i, base, max = 8, 16, unicode.MaxRune default: S.next() // always make progress - S.error(pos, "unknown escape sequence") + S.errorAt(pos, "unknown escape sequence") return } @@ -371,7 +398,7 @@ func (S *Scanner) scanEscape(quote int) { for ; i > 0 && S.ch != quote && S.ch >= 0; i-- { d := uint32(digitVal(S.ch)) if d >= base { - S.error(S.pos, "illegal character in escape sequence") + S.error("illegal character in escape sequence") break } x = x*base + d @@ -382,13 +409,14 @@ func (S *Scanner) scanEscape(quote int) { S.next() } if x > max || 0xd800 <= x && x < 0xe000 { - S.error(pos, "escape sequence is invalid Unicode code point") + S.errorAt(pos, "escape sequence is invalid Unicode code point") } } -func (S *Scanner) scanChar(pos token.Position) { - // '\'' already consumed +func (S *Scanner) scanChar() { + // '\'' opening already consumed + pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1} n := 0 for S.ch != '\'' { @@ -396,7 +424,7 @@ func (S *Scanner) scanChar(pos token.Position) { n++ S.next() if ch == '\n' || ch < 0 { - S.error(pos, "character literal not terminated") + S.errorAt(pos, "character literal not terminated") n = 1 break } @@ -408,19 +436,20 @@ func (S *Scanner) scanChar(pos token.Position) { S.next() if n != 1 { - S.error(pos, "illegal character literal") + S.errorAt(pos, "illegal character literal") } } -func (S *Scanner) scanString(pos token.Position) { - // '"' already consumed +func (S *Scanner) scanString() { + // '"' opening already consumed + pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1} for S.ch != '"' { ch := S.ch S.next() if ch == '\n' || ch < 0 { - S.error(pos, "string not terminated") + S.errorAt(pos, "string not terminated") break } if ch == '\\' { @@ -432,14 +461,15 @@ func (S *Scanner) scanString(pos token.Position) { } -func (S *Scanner) scanRawString(pos token.Position) { - // '`' already consumed +func (S *Scanner) scanRawString() { + // '`' opening already consumed + pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1} for S.ch != '`' { ch := S.ch S.next() if ch < 0 { - S.error(pos, "string not terminated") + S.errorAt(pos, "string not terminated") break } } @@ -524,7 +554,8 @@ scanAgain: // current token start insertSemi := false - pos, tok = S.pos, token.ILLEGAL + pos, tok = token.Position{S.filename, S.offset, S.line, S.column}, token.ILLEGAL + offs := S.offset // determine token value switch ch := S.ch; { @@ -536,7 +567,7 @@ scanAgain: } case digitVal(ch) < 10: insertSemi = true - tok = S.scanNumber(pos, false) + tok = S.scanNumber(false) default: S.next() // always make progress switch ch { @@ -555,21 +586,21 @@ scanAgain: case '"': insertSemi = true tok = token.STRING - S.scanString(pos) + S.scanString() case '\'': insertSemi = true tok = token.CHAR - S.scanChar(pos) + S.scanChar() case '`': insertSemi = true tok = token.STRING - S.scanRawString(pos) + S.scanRawString() case ':': tok = S.switch2(token.COLON, token.DEFINE) case '.': if digitVal(S.ch) < 10 { insertSemi = true - tok = S.scanNumber(pos, true) + tok = S.scanNumber(true) } else if S.ch == '.' { S.next() if S.ch == '.' { @@ -613,15 +644,19 @@ scanAgain: case '/': if S.ch == '/' || S.ch == '*' { // comment - if S.insertSemi && S.findLineEnd(pos) { + line := S.line + col := S.column - 1 // beginning of comment + if S.insertSemi && S.findLineEnd() { // reset position to the beginning of the comment - S.pos = pos - S.offset = pos.Offset + 1 + S.line = line + S.column = col S.ch = '/' + S.offset = offs + S.rdOffset = offs + 1 S.insertSemi = false // newline consumed return pos, token.SEMICOLON, newline } - S.scanComment(pos) + S.scanComment() if S.mode&ScanComments == 0 { // skip comment S.insertSemi = false // newline consumed @@ -659,7 +694,7 @@ scanAgain: tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) default: if S.mode&AllowIllegalChars == 0 { - S.error(pos, "illegal character "+charString(ch)) + S.errorAt(pos, "illegal character "+charString(ch)) } insertSemi = S.insertSemi // preserve insertSemi info } @@ -668,7 +703,7 @@ scanAgain: if S.mode&InsertSemis != 0 { S.insertSemi = insertSemi } - return pos, tok, S.src[pos.Offset:S.pos.Offset] + return pos, tok, S.src[offs:S.offset] } diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go index 794b191e838..c40753fb03b 100644 --- a/src/pkg/go/scanner/scanner_test.go +++ b/src/pkg/go/scanner/scanner_test.go @@ -198,16 +198,16 @@ func newlineCount(s string) int { func checkPos(t *testing.T, lit string, pos, expected token.Position) { if pos.Filename != expected.Filename { - t.Errorf("bad filename for %s: got %s, expected %s", lit, pos.Filename, expected.Filename) + t.Errorf("bad filename for %q: got %s, expected %s", lit, pos.Filename, expected.Filename) } if pos.Offset != expected.Offset { - t.Errorf("bad position for %s: got %d, expected %d", lit, pos.Offset, expected.Offset) + t.Errorf("bad position for %q: got %d, expected %d", lit, pos.Offset, expected.Offset) } if pos.Line != expected.Line { - t.Errorf("bad line for %s: got %d, expected %d", lit, pos.Line, expected.Line) + t.Errorf("bad line for %q: got %d, expected %d", lit, pos.Line, expected.Line) } if pos.Column != expected.Column { - t.Errorf("bad column for %s: got %d, expected %d", lit, pos.Column, expected.Column) + t.Errorf("bad column for %q: got %d, expected %d", lit, pos.Column, expected.Column) } } @@ -276,15 +276,15 @@ func checkSemi(t *testing.T, line string, mode uint) { semiLit = ";" } // next token must be a semicolon - offs := pos.Offset + 1 + semiPos := pos + semiPos.Offset++ + semiPos.Column++ pos, tok, lit = S.Scan() if tok == token.SEMICOLON { - if pos.Offset != offs { - t.Errorf("bad offset for %q: got %d, expected %d", line, pos.Offset, offs) - } if string(lit) != semiLit { t.Errorf(`bad literal for %q: got %q, expected %q`, line, lit, semiLit) } + checkPos(t, line, pos, semiPos) } else { t.Errorf("bad token for %q: got %s, expected ;", line, tok.String()) } @@ -399,11 +399,13 @@ var lines = []string{ "foo$/*\n*/", "foo$/*comment*/ \n", "foo$/*\n*/ ", + "foo $// comment\n", "foo $/*comment*/\n", "foo $/*\n*/", - + "foo $/* */ /* \n */ bar$/**/\n", "foo $/*0*/ /*1*/ /*2*/\n", + "foo $/*comment*/ \n", "foo $/*0*/ /*1*/ /*2*/ \n", "foo $/**/ /*-------------*/ /*----\n*/bar $/* \n*/baa$\n",