// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package Scanner import Platform "platform" import Utils "utils" export const ( ILLEGAL = iota; EOF; INT; FLOAT; STRING; COMMA; COLON; SEMICOLON; PERIOD; LPAREN; RPAREN; LBRACK; RBRACK; LBRACE; RBRACE; ASSIGN; DEFINE; INC; DEC; NOT; AND; OR; XOR; ADD; SUB; MUL; QUO; REM; EQL; NEQ; LSS; LEQ; GTR; GEQ; SHL; SHR; ARROW; ADD_ASSIGN; SUB_ASSIGN; MUL_ASSIGN; QUO_ASSIGN; REM_ASSIGN; AND_ASSIGN; OR_ASSIGN; XOR_ASSIGN; SHL_ASSIGN; SHR_ASSIGN; LAND; LOR; // IDENT must be immediately before keywords IDENT; // keywords KEYWORDS_BEG; BREAK; CASE; CHAN; CONST; CONTINUE; DEFAULT; ELSE; EXPORT; FALLTHROUGH; FOR; FUNC; GO; GOTO; IF; IMPORT; INTERFACE; MAP; PACKAGE; RANGE; RETURN; SELECT; STRUCT; SWITCH; TYPE; VAR; KEYWORDS_END; ) var Keywords map [string] int; var VerboseMsgs bool; // error message customization export func TokenName(tok int) string { switch (tok) { case ILLEGAL: return "illegal"; case EOF: return "eof"; case INT: return "int"; case FLOAT: return "float"; case STRING: return "string"; case COMMA: return ","; case COLON: return ":"; case SEMICOLON: return ";"; case PERIOD: return "."; case LPAREN: return "("; case RPAREN: return ")"; case LBRACK: return "["; case RBRACK: return "]"; case LBRACE: return "LBRACE"; case RBRACE: return "RBRACE"; case ASSIGN: return "="; case DEFINE: return ":="; case INC: return "++"; case DEC: return "--"; case NOT: return "!"; case AND: return "&"; case OR: return "|"; case XOR: return "^"; case ADD: return "+"; case SUB: return "-"; case MUL: return "*"; case QUO: return "/"; case REM: return "%"; case EQL: return "=="; case NEQ: return "!="; case LSS: return "<"; case LEQ: return "<="; case GTR: return ">"; case GEQ: return ">="; case SHL: return "<<"; case SHR: return ">>"; case ARROW: return "<-"; case ADD_ASSIGN: return "+="; case SUB_ASSIGN: return "-="; case MUL_ASSIGN: return "+="; case QUO_ASSIGN: return "/="; case REM_ASSIGN: return "%="; case AND_ASSIGN: return "&="; case OR_ASSIGN: return "|="; case XOR_ASSIGN: return "^="; case SHL_ASSIGN: return "<<="; case SHR_ASSIGN: return ">>="; case LAND: return "&&"; case LOR: return "||"; case IDENT: return "ident"; case BREAK: return "break"; case CASE: return "case"; case CHAN: return "chan"; case CONST: return "const"; case CONTINUE: return "continue"; case DEFAULT: return "default"; case ELSE: return "else"; case EXPORT: return "export"; case FALLTHROUGH: return "fallthrough"; case FOR: return "for"; case FUNC: return "func"; case GO: return "go"; case GOTO: return "goto"; case IF: return "if"; case IMPORT: return "import"; case INTERFACE: return "interface"; case MAP: return "map"; case PACKAGE: return "package"; case RANGE: return "range"; case RETURN: return "return"; case SELECT: return "select"; case STRUCT: return "struct"; case SWITCH: return "switch"; case TYPE: return "type"; case VAR: return "var"; } return "???"; } func init() { Keywords = make(map [string] int); for i := KEYWORDS_BEG; i <= KEYWORDS_END; i++ { Keywords[TokenName(i)] = i; } // Provide column information in error messages for gri only... VerboseMsgs = Platform.USER == "gri"; } func is_whitespace(ch int) bool { return ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'; } func is_letter(ch int) bool { return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 128 ; } func digit_val(ch int) int { if '0' <= ch && ch <= '9' { return ch - '0'; } if 'a' <= ch && ch <= 'f' { return ch - 'a' + 10; } if 'A' <= ch && ch <= 'F' { return ch - 'A' + 10; } return 16; // larger than any legal digit val } export type Scanner struct { filename string; // error reporting only nerrors int; // number of errors errpos int; // last error position src string; // scanned source pos int; // current reading position ch int; // one char look-ahead chpos int; // position of ch } // Read the next Unicode char into S.ch. // S.ch < 0 means end-of-file. // func (S *Scanner) Next() { const ( Bit1 = 7; Bitx = 6; Bit2 = 5; Bit3 = 4; Bit4 = 3; T1 = (1 << (Bit1 + 1) - 1) ^ 0xFF; // 0000 0000 Tx = (1 << (Bitx + 1) - 1) ^ 0xFF; // 1000 0000 T2 = (1 << (Bit2 + 1) - 1) ^ 0xFF; // 1100 0000 T3 = (1 << (Bit3 + 1) - 1) ^ 0xFF; // 1110 0000 T4 = (1 << (Bit4 + 1) - 1) ^ 0xFF; // 1111 0000 Rune1 = 1 << (Bit1 + 0*Bitx) - 1; // 0000 0000 0111 1111 Rune2 = 1 << (Bit2 + 1*Bitx) - 1; // 0000 0111 1111 1111 Rune3 = 1 << (Bit3 + 2*Bitx) - 1; // 1111 1111 1111 1111 Maskx = 0x3F; // 1 << Bitx - 1; // 0011 1111 Testx = 0xC0; // Maskx ^ 0xFF; // 1100 0000 Bad = 0xFFFD; // Runeerror ); src := S.src; lim := len(src); pos := S.pos; // 1-byte sequence // 0000-007F => T1 if pos >= lim { S.ch = -1; // end of file S.chpos = lim; return; } c0 := int(src[pos]); pos++; if c0 < Tx { S.ch = c0; S.chpos = S.pos; S.pos = pos; return; } // 2-byte sequence // 0080-07FF => T2 Tx if pos >= lim { goto bad; } c1 := int(src[pos]) ^ Tx; pos++; if c1 & Testx != 0 { goto bad; } if c0 < T3 { if c0 < T2 { goto bad; } r := (c0 << Bitx | c1) & Rune2; if r <= Rune1 { goto bad; } S.ch = r; S.chpos = S.pos; S.pos = pos; return; } // 3-byte sequence // 0800-FFFF => T3 Tx Tx if pos >= lim { goto bad; } c2 := int(src[pos]) ^ Tx; pos++; if c2 & Testx != 0 { goto bad; } if c0 < T4 { r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3; if r <= Rune2 { goto bad; } S.ch = r; S.chpos = S.pos; S.pos = pos; return; } // bad encoding bad: S.ch = Bad; S.chpos = S.pos; S.pos += 1; return; } // Compute (line, column) information for a given source position. func (S *Scanner) LineCol(pos int) (line, col int) { line = 1; lpos := 0; src := S.src; if pos > len(src) { pos = len(src); } for i := 0; i < pos; i++ { if src[i] == '\n' { line++; lpos = i; } } return line, pos - lpos; } func (S *Scanner) Error(pos int, msg string) { const errdist = 10; delta := pos - S.errpos; // may be negative! if delta < 0 { delta = -delta; } if delta > errdist || S.nerrors == 0 /* always report first error */ { print(S.filename); if pos >= 0 { // print position line, col := S.LineCol(pos); if VerboseMsgs { print(":", line, ":", col); } else { print(":", line); } } print(": ", msg, "\n"); S.nerrors++; S.errpos = pos; } if S.nerrors >= 10 { sys.exit(1); } } func (S *Scanner) Open(filename, src string) { S.filename = filename; S.nerrors = 0; S.errpos = 0; S.src = src; S.pos = 0; S.Next(); } func CharString(ch int) string { s := string(ch); switch ch { case '\a': s = `\a`; case '\b': s = `\b`; case '\f': s = `\f`; case '\n': s = `\n`; case '\r': s = `\r`; case '\t': s = `\t`; case '\v': s = `\v`; case '\\': s = `\\`; case '\'': s = `\'`; } return "'" + s + "' (U+" + Utils.IntToString(ch, 16) + ")"; } func (S *Scanner) Expect(ch int) { if S.ch != ch { S.Error(S.chpos, "expected " + CharString(ch) + ", found " + CharString(S.ch)); } S.Next(); // make always progress } func (S *Scanner) SkipWhitespace() { for is_whitespace(S.ch) { S.Next(); } } func (S *Scanner) SkipComment() { // '/' already consumed if S.ch == '/' { // comment S.Next(); for S.ch != '\n' && S.ch >= 0 { S.Next(); } } else { /* comment */ pos := S.chpos - 1; S.Expect('*'); for S.ch >= 0 { ch := S.ch; S.Next(); if ch == '*' && S.ch == '/' { S.Next(); return; } } S.Error(pos, "comment not terminated"); } } func (S *Scanner) ScanIdentifier() (tok int, val string) { pos := S.chpos; for is_letter(S.ch) || digit_val(S.ch) < 10 { S.Next(); } val = S.src[pos : S.chpos]; var present bool; tok, present = Keywords[val]; if !present { tok = IDENT; } return tok, val; } func (S *Scanner) ScanMantissa(base int) { for digit_val(S.ch) < base { S.Next(); } } func (S *Scanner) ScanNumber(seen_decimal_point bool) (tok int, val string) { pos := S.chpos; tok = INT; if seen_decimal_point { tok = FLOAT; pos--; // '.' is one byte S.ScanMantissa(10); goto exponent; } if S.ch == '0' { // int or float S.Next(); if S.ch == 'x' || S.ch == 'X' { // hexadecimal int S.Next(); S.ScanMantissa(16); } else { // octal int or float S.ScanMantissa(8); if digit_val(S.ch) < 10 || S.ch == '.' || S.ch == 'e' || S.ch == 'E' { // float tok = FLOAT; goto mantissa; } // octal int } goto exit; } mantissa: // decimal int or float S.ScanMantissa(10); if S.ch == '.' { // float tok = FLOAT; S.Next(); S.ScanMantissa(10) } exponent: if S.ch == 'e' || S.ch == 'E' { // float tok = FLOAT; S.Next(); if S.ch == '-' || S.ch == '+' { S.Next(); } S.ScanMantissa(10); } exit: return tok, S.src[pos : S.chpos]; } func (S *Scanner) ScanDigits(n int, base int) { for digit_val(S.ch) < base { S.Next(); n--; } if n > 0 { S.Error(S.chpos, "illegal char escape"); } } func (S *Scanner) ScanEscape(quote int) string { // TODO: fix this routine ch := S.ch; pos := S.chpos; S.Next(); switch (ch) { case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\': return string(ch); case '0', '1', '2', '3', '4', '5', '6', '7': S.ScanDigits(3 - 1, 8); // 1 char already read return ""; // TODO fix this case 'x': S.ScanDigits(2, 16); return ""; // TODO fix this case 'u': S.ScanDigits(4, 16); return ""; // TODO fix this case 'U': S.ScanDigits(8, 16); return ""; // TODO fix this default: // check for quote outside the switch for better generated code (eventually) if ch == quote { return string(quote); } S.Error(pos, "illegal char escape"); } return ""; // TODO fix this } func (S *Scanner) ScanChar() string { // '\'' already consumed pos := S.chpos - 1; ch := S.ch; S.Next(); if ch == '\\' { S.ScanEscape('\''); } S.Expect('\''); return S.src[pos : S.chpos]; } func (S *Scanner) ScanString() string { // '"' already consumed pos := S.chpos - 1; for S.ch != '"' { ch := S.ch; S.Next(); if ch == '\n' || ch < 0 { S.Error(pos, "string not terminated"); break; } if ch == '\\' { S.ScanEscape('"'); } } S.Next(); return S.src[pos : S.chpos]; } func (S *Scanner) ScanRawString() string { // '`' already consumed pos := S.chpos - 1; for S.ch != '`' { ch := S.ch; S.Next(); if ch == '\n' || ch < 0 { S.Error(pos, "string not terminated"); break; } } S.Next(); return S.src[pos : S.chpos]; } func (S *Scanner) Select2(tok0, tok1 int) int { if S.ch == '=' { S.Next(); return tok1; } return tok0; } func (S *Scanner) Select3(tok0, tok1, ch2, tok2 int) int { if S.ch == '=' { S.Next(); return tok1; } if S.ch == ch2 { S.Next(); return tok2; } return tok0; } func (S *Scanner) Select4(tok0, tok1, ch2, tok2, tok3 int) int { if S.ch == '=' { S.Next(); return tok1; } if S.ch == ch2 { S.Next(); if S.ch == '=' { S.Next(); return tok3; } return tok2; } return tok0; } func (S *Scanner) Scan() (tok, pos int, val string) { S.SkipWhitespace(); ch := S.ch; tok = ILLEGAL; pos = S.chpos; switch { case is_letter(ch): tok, val = S.ScanIdentifier(); case digit_val(ch) < 10: tok, val = S.ScanNumber(false); default: S.Next(); // always make progress switch ch { case -1: tok = EOF; case '"': tok, val = STRING, S.ScanString(); case '\'': tok, val = INT, S.ScanChar(); case '`': tok, val = STRING, S.ScanRawString(); case ':': tok = S.Select2(COLON, DEFINE); case '.': if digit_val(S.ch) < 10 { tok, val = S.ScanNumber(true); } else { tok = PERIOD; } case ',': tok = COMMA; case ';': tok = SEMICOLON; case '(': tok = LPAREN; case ')': tok = RPAREN; case '[': tok = LBRACK; case ']': tok = RBRACK; case '{': tok = LBRACE; case '}': tok = RBRACE; case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC); case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC); case '*': tok = S.Select2(MUL, MUL_ASSIGN); case '/': if S.ch == '/' || S.ch == '*' { S.SkipComment(); // cannot simply return because of 6g bug tok, pos, val = S.Scan(); return tok, pos, val; } tok = S.Select2(QUO, QUO_ASSIGN); case '%': tok = S.Select2(REM, REM_ASSIGN); case '^': tok = S.Select2(XOR, XOR_ASSIGN); case '<': if S.ch == '-' { S.Next(); tok = ARROW; } else { tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN); } case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN); case '=': tok = S.Select2(ASSIGN, EQL); case '!': tok = S.Select2(NOT, NEQ); case '&': tok = S.Select3(AND, AND_ASSIGN, '&', LAND); case '|': tok = S.Select3(OR, OR_ASSIGN, '|', LOR); default: S.Error(pos, "illegal character " + CharString(ch)); tok = ILLEGAL; } } return tok, pos, val; } export type Token struct { pos int; tok int; val string; } func (S *Scanner) Server(c chan *Token) { for { t := new(Token); t.tok, t.pos, t.val = S.Scan(); c <- t; if t.tok == EOF { break; } } }