// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package Scanner export EOF; const ( ILLEGAL = iota; EOF; IDENT; STRING; NUMBER; COMMA; COLON; SEMICOLON; PERIOD; LPAREN; RPAREN; LBRACK; RBRACK; LBRACE; RBRACE; ASSIGN; DEFINE; INC; DEC; NOT; AND; OR; XOR; ADD; SUB; MUL; QUO; REM; EQL; NEQ; LSS; LEQ; GTR; GEQ; SHL; SHR; ADD_ASSIGN; SUB_ASSIGN; MUL_ASSIGN; QUO_ASSIGN; REM_ASSIGN; AND_ASSIGN; OR_ASSIGN; XOR_ASSIGN; SHL_ASSIGN; SHR_ASSIGN; CAND; COR; // keywords KEYWORDS_BEG; BREAK; CASE; CONST; CONTINUE; DEFAULT; ELSE; EXPORT; FALLTHROUGH; FALSE; FOR; FUNC; GO; GOTO; IF; IMPORT; INTERFACE; MAP; NEW; NIL; PACKAGE; RANGE; RETURN; SELECT; STRUCT; SWITCH; TRUE; TYPE; VAR; KEYWORDS_END; ) var Keywords *map [string] int; export TokenName func TokenName(tok int) string { switch (tok) { case ILLEGAL: return "illegal"; case EOF: return "eof"; case IDENT: return "ident"; case STRING: return "string"; case NUMBER: return "number"; case COMMA: return ","; case COLON: return ":"; case SEMICOLON: return ";"; case PERIOD: return "."; case LPAREN: return "("; case RPAREN: return ")"; case LBRACK: return "["; case RBRACK: return "]"; case LBRACE: return "{"; case RBRACE: return "}"; case ASSIGN: return "="; case DEFINE: return ":="; case INC: return "++"; case DEC: return "--"; case NOT: return "!"; case AND: return "&"; case OR: return "|"; case XOR: return "^"; case ADD: return "+"; case SUB: return "-"; case MUL: return "*"; case QUO: return "/"; case REM: return "%"; case EQL: return "=="; case NEQ: return "!="; case LSS: return "<"; case LEQ: return "<="; case GTR: return ">"; case GEQ: return ">="; case SHL: return "<<"; case SHR: return ">>"; case ADD_ASSIGN: return "+="; case SUB_ASSIGN: return "-="; case MUL_ASSIGN: return "+="; case QUO_ASSIGN: return "/="; case REM_ASSIGN: return "%="; case AND_ASSIGN: return "&="; case OR_ASSIGN: return "|="; case XOR_ASSIGN: return "^="; case SHL_ASSIGN: return "<<="; case SHR_ASSIGN: return ">>="; case CAND: return "&&"; case COR: return "||"; case BREAK: return "break"; case CASE: return "case"; case CONST: return "const"; case CONTINUE: return "continue"; case DEFAULT: return "default"; case ELSE: return "else"; case EXPORT: return "export"; case FALLTHROUGH: return "fallthrough"; case FALSE: return "false"; case FOR: return "for"; case FUNC: return "func"; case GO: return "go"; case GOTO: return "goto"; case IF: return "if"; case IMPORT: return "import"; case INTERFACE: return "interface"; case MAP: return "map"; case NEW: return "new"; case NIL: return "nil"; case PACKAGE: return "package"; case RANGE: return "range"; case RETURN: return "return"; case SELECT: return "select"; case STRUCT: return "struct"; case SWITCH: return "switch"; case TRUE: return "true"; case TYPE: return "type"; case VAR: return "var"; } return "???"; } func is_whitespace (ch int) bool { return ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t'; } func is_letter (ch int) bool { return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 128 ; } func digit_val (ch int) int { if '0' <= ch && ch <= '9' { return ch - '0'; } if 'a' <= ch && ch <= 'f' { return ch - 'a' + 10; } if 'A' <= ch && ch <= 'F' { return ch - 'A' + 10; } return 16; // larger than any legal digit val } export Scanner type Scanner struct { src string; pos int; ch int; // one char look-ahead } export Token type Token struct { val int; beg, end int; txt string; } func (T *Token) Print () { print TokenName(T.val), " [", T.beg, ", ", T.end, "[ ", T.txt, "\n"; } // Read the next Unicode char into S.ch. // S.ch < 0 means end-of-file. // func (S *Scanner) Next () { const ( Bit1 = 7; Bitx = 6; Bit2 = 5; Bit3 = 4; Bit4 = 3; // TODO 6g constant evaluation incomplete T1 = 0x00; // (1 << (Bit1 + 1) - 1) ^ 0xFF; // 0000 0000 Tx = 0x80; // (1 << (Bitx + 1) - 1) ^ 0xFF; // 1000 0000 T2 = 0xC0; // (1 << (Bit2 + 1) - 1) ^ 0xFF; // 1100 0000 T3 = 0xE0; // (1 << (Bit3 + 1) - 1) ^ 0xFF; // 1110 0000 T4 = 0xF0; // (1 << (Bit4 + 1) - 1) ^ 0xFF; // 1111 0000 Rune1 = 1 << (Bit1 + 0*Bitx) - 1; // 0000 0000 0111 1111 Rune2 = 1 << (Bit2 + 1*Bitx) - 1; // 0000 0111 1111 1111 Rune3 = 1 << (Bit3 + 2*Bitx) - 1; // 1111 1111 1111 1111 Maskx = 0x3F; // 1 << Bitx - 1; // 0011 1111 Testx = 0xC0; // Maskx ^ 0xFF; // 1100 0000 Bad = 0xFFFD; // Runeerror ); src := S.src; // TODO only needed because of 6g bug lim := len(src); pos := S.pos; // 1-byte sequence // 0000-007F => T1 if pos >= lim { S.ch = -1; // end of file return; } c0 := int(src[pos]); pos++; if c0 < Tx { S.ch = c0; S.pos = pos; return; } // 2-byte sequence // 0080-07FF => T2 Tx if pos >= lim { goto bad; } c1 := int(src[pos]) ^ Tx; pos++; if c1 & Testx != 0 { goto bad; } if c0 < T3 { if c0 < T2 { goto bad; } r := (c0 << Bitx | c1) & Rune2; if r <= Rune1 { goto bad; } S.ch = r; S.pos = pos; return; } // 3-byte sequence // 0800-FFFF => T3 Tx Tx if pos >= lim { goto bad; } c2 := int(src[pos]) ^ Tx; pos++; if c2 & Testx != 0 { goto bad; } if c0 < T4 { r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3; if r <= Rune2 { goto bad; } S.ch = r; S.pos = pos; return; } // bad encoding bad: S.ch = Bad; S.pos += 1; return; } func Init () { Keywords = new(map [string] int); for i := KEYWORDS_BEG; i <= KEYWORDS_END; i++ { Keywords[TokenName(i)] = i; } } func (S *Scanner) Open (src string) { if Keywords == nil { Init(); } S.src = src; S.pos = 0; S.Next(); } func (S *Scanner) Expect (ch int) { if S.ch != ch { panic "expected ", string(ch), " found ", string(S.ch); } S.Next(); } func (S *Scanner) SkipWhitespace () { for is_whitespace(S.ch) { S.Next(); } } func (S *Scanner) SkipComment () { if S.ch == '/' { // comment S.Next(); for S.ch != '\n' && S.ch >= 0 { S.Next(); } } else { /* comment */ S.Next(); for S.ch >= 0 { ch := S.ch; S.Next(); if ch == '*' && S.ch == '/' { S.Next(); return; } } panic "comment not terminated"; } } func (S *Scanner) ScanIdentifier () int { beg := S.pos - 1; for is_letter(S.ch) || digit_val(S.ch) < 10 { S.Next(); } end := S.pos - 1; var tok int; var present bool; tok, present = Keywords[S.src[beg : end]]; if !present { tok = IDENT; } return tok; } func (S *Scanner) ScanMantissa (base int) { for digit_val(S.ch) < base { S.Next(); } } func (S *Scanner) ScanNumber (seen_decimal_point bool) int { if seen_decimal_point { S.ScanMantissa(10); goto exponent; } if S.ch == '0' { // TODO bug: doesn't accept 09.0 ! // int S.Next(); if S.ch == 'x' || S.ch == 'X' { // hexadecimal int S.Next(); S.ScanMantissa(16); } else { // octal int S.ScanMantissa(8); } return NUMBER; } // decimal int or float S.ScanMantissa(10); if S.ch == '.' { // float S.Next(); S.ScanMantissa(10) } exponent: if S.ch == 'e' || S.ch == 'E' { // float S.Next(); if S.ch == '-' || S.ch == '+' { S.Next(); } S.ScanMantissa(10); } return NUMBER; } func (S *Scanner) ScanDigits(n int, base int) { for digit_val(S.ch) < base { S.Next(); n--; } if n > 0 { panic "illegal char escape"; } } func (S *Scanner) ScanEscape () string { // TODO: fix this routine ch := S.ch; S.Next(); switch (ch) { case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"': return string(ch); case '0', '1', '2', '3', '4', '5', '6', '7': S.ScanDigits(3 - 1, 8); // 1 char already read return ""; // TODO fix this case 'x': S.ScanDigits(2, 16); return ""; // TODO fix this case 'u': S.ScanDigits(4, 16); return ""; // TODO fix this case 'U': S.ScanDigits(8, 16); return ""; // TODO fix this default: panic "illegal char escape"; } } func (S *Scanner) ScanChar () int { // '\'' already consumed ch := S.ch; S.Next(); if ch == '\\' { S.ScanEscape(); } S.Expect('\''); return NUMBER; } func (S *Scanner) ScanString () int { // '"' already consumed for S.ch != '"' { ch := S.ch; S.Next(); if ch == '\n' || ch < 0 { panic "string not terminated"; } if ch == '\\' { S.ScanEscape(); } } S.Next(); return STRING; } func (S *Scanner) ScanRawString () int { // '`' already consumed for S.ch != '`' { ch := S.ch; S.Next(); if ch == '\n' || ch < 0 { panic "string not terminated"; } } S.Next(); return STRING; } func (S *Scanner) Select2 (tok0, tok1 int) int { if S.ch == '=' { S.Next(); return tok1; } return tok0; } func (S *Scanner) Select3 (tok0, tok1, ch2, tok2 int) int { if S.ch == '=' { S.Next(); return tok1; } if S.ch == ch2 { S.Next(); return tok2; } return tok0; } func (S *Scanner) Select4 (tok0, tok1, ch2, tok2, tok3 int) int { if S.ch == '=' { S.Next(); return tok1; } if S.ch == ch2 { S.Next(); if S.ch == '=' { S.Next(); return tok3; } return tok2; } return tok0; } func (S *Scanner) Scan (t *Token) (tok, beg, end int) { S.SkipWhitespace(); var tok int = ILLEGAL; var beg int = S.pos - 1; var end int = beg; ch := S.ch; switch { case is_letter(ch): tok = S.ScanIdentifier(); case digit_val(ch) < 10: tok = S.ScanNumber(false); default: S.Next(); switch ch { case -1: tok = EOF; case '"': tok = S.ScanString(); case '\'': tok = S.ScanChar(); case '`': tok = S.ScanRawString(); case ':': tok = S.Select2(COLON, DEFINE); case '.': if digit_val(S.ch) < 10 { tok = S.ScanNumber(true); } else { tok = PERIOD; } case ',': tok = COMMA; case ';': tok = SEMICOLON; case '(': tok = LPAREN; case ')': tok = RPAREN; case '[': tok = LBRACK; case ']': tok = RBRACK; case '{': tok = LBRACE; case '}': tok = RBRACE; case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC); case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC); case '*': tok = S.Select2(MUL, MUL_ASSIGN); case '/': if S.ch == '/' || S.ch == '*' { S.SkipComment(); // cannot simply return because of 6g bug tok, beg, end = S.Scan(t); return tok, beg, end; } tok = S.Select2(QUO, QUO_ASSIGN); case '%': tok = S.Select2(REM, REM_ASSIGN); case '^': tok = S.Select2(XOR, XOR_ASSIGN); case '<': tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN); case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN); case '=': tok = S.Select2(ASSIGN, EQL); case '!': tok = S.Select2(NOT, NEQ); case '&': tok = S.Select3(AND, AND_ASSIGN, '&', CAND); case '|': tok = S.Select3(OR, OR_ASSIGN, '|', COR); default: tok = ILLEGAL; } } end = S.pos - 1; t.val = tok; t.beg = beg; t.end = end; t.txt = S.src[beg : end]; return tok, beg, end; }