1
0
mirror of https://github.com/golang/go synced 2024-10-01 16:38:34 -06:00
go/usr/gri/src/scanner.go

692 lines
11 KiB
Go
Raw Normal View History

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package Scanner
// Names exported from this package: every token constant declared in the
// const block below, except the KEYWORDS_BEG and KEYWORDS_END markers,
// which are not listed here.
export
ILLEGAL, EOF, IDENT, STRING, NUMBER,
COMMA, COLON, SEMICOLON, PERIOD,
LPAREN, RPAREN, LBRACK, RBRACK, LBRACE, RBRACE,
ASSIGN, DEFINE,
INC, DEC, NOT,
AND, OR, XOR,
ADD, SUB, MUL, QUO, REM,
EQL, NEQ, LSS, LEQ, GTR, GEQ,
SHL, SHR,
ADD_ASSIGN, SUB_ASSIGN, MUL_ASSIGN, QUO_ASSIGN, REM_ASSIGN,
AND_ASSIGN, OR_ASSIGN, XOR_ASSIGN, SHL_ASSIGN, SHR_ASSIGN,
CAND, COR,
BREAK, CASE, CHAN, CONST, CONTINUE, DEFAULT, ELSE, EXPORT, FALLTHROUGH, FALSE,
FOR, FUNC, GO, GOTO, IF, IMPORT, INTERFACE, MAP, NEW, NIL, PACKAGE, RANGE,
RETURN, SELECT, STRUCT, SWITCH, TRUE, TYPE, VAR
// Token values. They are numbered consecutively from ILLEGAL = 0 by iota,
// so the order of the entries is significant: Init walks the range between
// KEYWORDS_BEG and KEYWORDS_END to build the Keywords table, which means
// every keyword must stay between those two markers.
const (
	// special tokens and literal classes
	ILLEGAL = iota;
	EOF;
	IDENT;
	STRING;
	NUMBER;

	// delimiters
	COMMA;
	COLON;
	SEMICOLON;
	PERIOD;

	LPAREN;
	RPAREN;
	LBRACK;
	RBRACK;
	LBRACE;
	RBRACE;

	// assignment and declaration
	ASSIGN;
	DEFINE;

	// unary operators
	INC;
	DEC;
	NOT;

	// bitwise operators
	AND;
	OR;
	XOR;

	// arithmetic operators
	ADD;
	SUB;
	MUL;
	QUO;
	REM;

	// comparison operators
	EQL;
	NEQ;
	LSS;
	LEQ;
	GTR;
	GEQ;

	// shift operators
	SHL;
	SHR;

	// operator-assignments
	ADD_ASSIGN;
	SUB_ASSIGN;
	MUL_ASSIGN;
	QUO_ASSIGN;
	REM_ASSIGN;

	AND_ASSIGN;
	OR_ASSIGN;
	XOR_ASSIGN;

	SHL_ASSIGN;
	SHR_ASSIGN;

	// conditional (short-circuit) operators
	CAND;
	COR;

	// keywords
	KEYWORDS_BEG;  // marker: precedes the first keyword, not a token itself
	BREAK;
	CASE;
	CHAN;
	CONST;
	CONTINUE;
	DEFAULT;
	ELSE;
	EXPORT;
	FALLTHROUGH;
	FALSE;
	FOR;
	FUNC;
	GO;
	GOTO;
	IF;
	IMPORT;
	INTERFACE;
	MAP;
	NEW;
	NIL;
	PACKAGE;
	RANGE;
	RETURN;
	SELECT;
	STRUCT;
	SWITCH;
	TRUE;
	TYPE;
	VAR;
	KEYWORDS_END;  // marker: follows the last keyword, not a token itself
)
2008-07-03 17:51:22 -06:00
// Keywords maps the spelling of a keyword to its token value.
// It is nil until Init allocates and fills it; Open calls Init on first use.
var Keywords *map [string] int;
// TokenName (defined below) is exported from the package.
export TokenName
func TokenName(tok int) string {
switch (tok) {
case ILLEGAL: return "illegal";
case EOF: return "eof";
case IDENT: return "ident";
case STRING: return "string";
case NUMBER: return "number";
case COMMA: return ",";
case COLON: return ":";
case SEMICOLON: return ";";
case PERIOD: return ".";
case LPAREN: return "(";
case RPAREN: return ")";
case LBRACK: return "[";
case RBRACK: return "]";
case LBRACE: return "{";
case RBRACE: return "}";
case ASSIGN: return "=";
case DEFINE: return ":=";
case INC: return "++";
case DEC: return "--";
case NOT: return "!";
2008-07-03 00:19:31 -06:00
case AND: return "&";
case OR: return "|";
case XOR: return "^";
case ADD: return "+";
case SUB: return "-";
case MUL: return "*";
case QUO: return "/";
case REM: return "%";
case EQL: return "==";
case NEQ: return "!=";
case LSS: return "<";
case LEQ: return "<=";
case GTR: return ">";
case GEQ: return ">=";
case SHL: return "<<";
case SHR: return ">>";
case ADD_ASSIGN: return "+=";
case SUB_ASSIGN: return "-=";
case MUL_ASSIGN: return "+=";
case QUO_ASSIGN: return "/=";
case REM_ASSIGN: return "%=";
case AND_ASSIGN: return "&=";
case OR_ASSIGN: return "|=";
case XOR_ASSIGN: return "^=";
case SHL_ASSIGN: return "<<=";
case SHR_ASSIGN: return ">>=";
case CAND: return "&&";
case COR: return "||";
case BREAK: return "break";
case CASE: return "case";
case CHAN: return "chan";
case CONST: return "const";
case CONTINUE: return "continue";
case DEFAULT: return "default";
case ELSE: return "else";
case EXPORT: return "export";
case FALLTHROUGH: return "fallthrough";
case FALSE: return "false";
case FOR: return "for";
case FUNC: return "func";
case GO: return "go";
case GOTO: return "goto";
case IF: return "if";
case IMPORT: return "import";
case INTERFACE: return "interface";
case MAP: return "map";
case NEW: return "new";
case NIL: return "nil";
case PACKAGE: return "package";
case RANGE: return "range";
case RETURN: return "return";
case SELECT: return "select";
case STRUCT: return "struct";
case SWITCH: return "switch";
case TRUE: return "true";
case TYPE: return "type";
case VAR: return "var";
}
return "???";
}
// is_whitespace reports whether ch is a blank, a tab, a newline or a
// carriage return.
func is_whitespace (ch int) bool {
	switch ch {
	case ' ', '\t', '\n', '\r':
		return true;
	}
	return false;
}
// is_letter reports whether ch may appear in an identifier as a letter:
// an ASCII letter, an underscore, or any char value of 128 and above.
func is_letter (ch int) bool {
	if ch >= 128 || ch == '_' {
		return true;
	}
	lower := 'a' <= ch && ch <= 'z';
	upper := 'A' <= ch && ch <= 'Z';
	return lower || upper;
}
2008-07-03 19:07:03 -06:00
// digit_val returns the numeric value of ch read as a hexadecimal digit
// (0-9, a-f or A-F). For any other char it returns 16, which is larger
// than any legal digit value, so "digit_val(ch) < base" doubles as an
// "is a digit of this base" test.
func digit_val (ch int) int {
	switch {
	case '0' <= ch && ch <= '9':
		return ch - '0';
	case 'a' <= ch && ch <= 'f':
		return ch - 'a' + 10;
	case 'A' <= ch && ch <= 'F':
		return ch - 'A' + 10;
	}
	return 16;
}
2008-07-03 19:07:03 -06:00
// Scanner holds the state needed to scan one source string.
// Open initializes it; Next and the Scan* methods advance it.
export Scanner
type Scanner struct {
src string;  // source text being scanned (set by Open)
pos int;  // byte offset in src at which Next reads the following char
ch int; // one char look-ahead; Next sets it to -1 at end of file
}
/*
2008-07-03 19:07:03 -06:00
export Token
type Token struct {
val int;
beg, end int;
txt string;
}
2008-07-03 19:07:03 -06:00
func (T *Token) Print () {
print TokenName(T.val), " [", T.beg, ", ", T.end, "[ ", T.txt, "\n";
}
*/
2008-07-03 17:51:22 -06:00
// Read the next Unicode char into S.ch.
// S.ch < 0 means end-of-file.
//
func (S *Scanner) Next () {
const (
Bit1 = 7;
Bitx = 6;
Bit2 = 5;
Bit3 = 4;
Bit4 = 3;
2008-07-03 17:51:22 -06:00
// TODO 6g constant evaluation incomplete
T1 = 0x00; // (1 << (Bit1 + 1) - 1) ^ 0xFF; // 0000 0000
Tx = 0x80; // (1 << (Bitx + 1) - 1) ^ 0xFF; // 1000 0000
T2 = 0xC0; // (1 << (Bit2 + 1) - 1) ^ 0xFF; // 1100 0000
T3 = 0xE0; // (1 << (Bit3 + 1) - 1) ^ 0xFF; // 1110 0000
T4 = 0xF0; // (1 << (Bit4 + 1) - 1) ^ 0xFF; // 1111 0000
Rune1 = 1 << (Bit1 + 0*Bitx) - 1; // 0000 0000 0111 1111
Rune2 = 1 << (Bit2 + 1*Bitx) - 1; // 0000 0111 1111 1111
Rune3 = 1 << (Bit3 + 2*Bitx) - 1; // 1111 1111 1111 1111
Maskx = 0x3F; // 1 << Bitx - 1; // 0011 1111
Testx = 0xC0; // Maskx ^ 0xFF; // 1100 0000
Bad = 0xFFFD; // Runeerror
);
src := S.src; // TODO only needed because of 6g bug
lim := len(src);
pos := S.pos;
// 1-byte sequence
// 0000-007F => T1
if pos >= lim {
2008-07-03 17:51:22 -06:00
S.ch = -1; // end of file
return;
}
2008-07-03 17:51:22 -06:00
c0 := int(src[pos]);
pos++;
if c0 < Tx {
S.ch = c0;
2008-07-03 17:51:22 -06:00
S.pos = pos;
return;
}
// 2-byte sequence
// 0080-07FF => T2 Tx
2008-07-03 17:51:22 -06:00
if pos >= lim {
goto bad;
}
2008-07-03 17:51:22 -06:00
c1 := int(src[pos]) ^ Tx;
pos++;
if c1 & Testx != 0 {
goto bad;
}
if c0 < T3 {
if c0 < T2 {
goto bad;
}
r := (c0 << Bitx | c1) & Rune2;
if r <= Rune1 {
goto bad;
}
S.ch = r;
2008-07-03 17:51:22 -06:00
S.pos = pos;
return;
}
2008-07-03 17:51:22 -06:00
// 3-byte sequence
// 0800-FFFF => T3 Tx Tx
2008-07-03 17:51:22 -06:00
if pos >= lim {
goto bad;
}
2008-07-03 17:51:22 -06:00
c2 := int(src[pos]) ^ Tx;
pos++;
if c2 & Testx != 0 {
goto bad;
}
if c0 < T4 {
r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3;
if r <= Rune2 {
goto bad;
}
S.ch = r;
2008-07-03 17:51:22 -06:00
S.pos = pos;
return;
}
// bad encoding
bad:
S.ch = Bad;
S.pos += 1;
return;
}
// Init allocates the Keywords table and enters every keyword into it,
// keyed by the spelling that TokenName reports for the keyword's token.
func Init () {
	Keywords = new(map [string] int);

	// KEYWORDS_BEG and KEYWORDS_END only mark the ends of the keyword range.
	// TokenName has no spelling for them (it returns "???"), so they are
	// left out instead of being entered under that bogus key.
	for i := KEYWORDS_BEG + 1; i < KEYWORDS_END; i++ {
		Keywords[TokenName(i)] = i;
	}
}
// Open prepares S for scanning src from its first byte and loads the first
// char into the look-ahead. The keyword table is built on the first call,
// while Keywords is still nil.
func (S *Scanner) Open (src string) {
	S.pos = 0;
	S.src = src;

	if Keywords == nil {
		Init();
	}

	S.Next();
}
2008-07-03 17:51:22 -06:00
// Expect consumes the look-ahead char when it equals ch; on any other
// char it panics with a message naming both the expected and the found char.
func (S *Scanner) Expect (ch int) {
	if S.ch == ch {
		S.Next();
		return;
	}
	panic "expected ", string(ch), " found ", string(S.ch);
}
// SkipWhitespace advances the scanner until the look-ahead char is not
// whitespace (as defined by is_whitespace); end of file stops it as well.
func (S *Scanner) SkipWhitespace () {
	for ch := S.ch; is_whitespace(ch); ch = S.ch {
		S.Next();
	}
}
func (S *Scanner) SkipComment () {
if S.ch == '/' {
// comment
2008-07-03 17:51:22 -06:00
S.Next();
for S.ch != '\n' && S.ch >= 0 {
S.Next();
}
} else {
/* comment */
2008-07-03 17:51:22 -06:00
S.Next();
for S.ch >= 0 {
ch := S.ch;
S.Next();
2008-07-03 17:51:22 -06:00
if ch == '*' && S.ch == '/' {
S.Next();
return;
}
}
panic "comment not terminated";
}
}
func (S *Scanner) ScanIdentifier () int {
beg := S.pos - 1;
2008-07-03 19:07:03 -06:00
for is_letter(S.ch) || digit_val(S.ch) < 10 {
S.Next();
}
end := S.pos - 1;
var tok int;
var present bool;
tok, present = Keywords[S.src[beg : end]];
if !present {
tok = IDENT;
}
return tok;
}
2008-07-03 17:51:22 -06:00
func (S *Scanner) ScanMantissa (base int) {
2008-07-03 19:07:03 -06:00
for digit_val(S.ch) < base {
S.Next();
2008-07-03 00:19:31 -06:00
}
}
2008-07-03 17:51:22 -06:00
func (S *Scanner) ScanNumber (seen_decimal_point bool) int {
if seen_decimal_point {
S.ScanMantissa(10);
goto exponent;
}
if S.ch == '0' {
2008-07-03 19:07:03 -06:00
// TODO bug: doesn't accept 09.0 !
2008-07-03 17:51:22 -06:00
// int
S.Next();
if S.ch == 'x' || S.ch == 'X' {
// hexadecimal int
S.Next();
S.ScanMantissa(16);
} else {
// octal int
S.ScanMantissa(8);
}
return NUMBER;
}
// decimal int or float
S.ScanMantissa(10);
2008-07-03 00:19:31 -06:00
if S.ch == '.' {
2008-07-03 17:51:22 -06:00
// float
S.Next();
2008-07-03 17:51:22 -06:00
S.ScanMantissa(10)
2008-07-03 00:19:31 -06:00
}
2008-07-03 17:51:22 -06:00
exponent:
2008-07-03 00:19:31 -06:00
if S.ch == 'e' || S.ch == 'E' {
2008-07-03 17:51:22 -06:00
// float
S.Next();
2008-07-03 00:19:31 -06:00
if S.ch == '-' || S.ch == '+' {
S.Next();
2008-07-03 00:19:31 -06:00
}
2008-07-03 17:51:22 -06:00
S.ScanMantissa(10);
}
2008-07-03 00:19:31 -06:00
return NUMBER;
}
2008-07-03 19:07:03 -06:00
func (S *Scanner) ScanDigits(n int, base int) {
for digit_val(S.ch) < base {
S.Next();
2008-07-03 19:07:03 -06:00
n--;
}
2008-07-03 19:07:03 -06:00
if n > 0 {
panic "illegal char escape";
}
}
2008-07-03 17:51:22 -06:00
func (S *Scanner) ScanEscape () string {
// TODO: fix this routine
2008-07-03 17:51:22 -06:00
ch := S.ch;
S.Next();
switch (ch) {
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
return string(ch);
case '0', '1', '2', '3', '4', '5', '6', '7':
2008-07-03 19:07:03 -06:00
S.ScanDigits(3 - 1, 8); // 1 char already read
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
case 'x':
2008-07-03 19:07:03 -06:00
S.ScanDigits(2, 16);
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
case 'u':
2008-07-03 19:07:03 -06:00
S.ScanDigits(4, 16);
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
case 'U':
2008-07-03 19:07:03 -06:00
S.ScanDigits(8, 16);
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
default:
panic "illegal char escape";
}
}
2008-07-03 00:19:31 -06:00
func (S *Scanner) ScanChar () int {
2008-07-03 17:51:22 -06:00
// '\'' already consumed
ch := S.ch;
S.Next();
if ch == '\\' {
S.ScanEscape();
}
2008-07-03 19:07:03 -06:00
S.Expect('\'');
2008-07-03 00:19:31 -06:00
return NUMBER;
}
2008-07-03 00:19:31 -06:00
func (S *Scanner) ScanString () int {
2008-07-03 17:51:22 -06:00
// '"' already consumed
for S.ch != '"' {
ch := S.ch;
S.Next();
if ch == '\n' || ch < 0 {
panic "string not terminated";
}
2008-07-03 17:51:22 -06:00
if ch == '\\' {
S.ScanEscape();
}
}
2008-07-03 17:51:22 -06:00
S.Next();
2008-07-03 00:19:31 -06:00
return STRING;
}
2008-07-03 00:19:31 -06:00
func (S *Scanner) ScanRawString () int {
2008-07-03 17:51:22 -06:00
// '`' already consumed
for S.ch != '`' {
ch := S.ch;
S.Next();
if ch == '\n' || ch < 0 {
panic "string not terminated";
}
}
2008-07-03 17:51:22 -06:00
S.Next();
2008-07-03 00:19:31 -06:00
return STRING;
}
func (S *Scanner) Select2 (tok0, tok1 int) int {
if S.ch == '=' {
S.Next();
return tok1;
2008-07-03 00:19:31 -06:00
}
return tok0;
}
func (S *Scanner) Select3 (tok0, tok1, ch2, tok2 int) int {
if S.ch == '=' {
S.Next();
2008-07-03 00:19:31 -06:00
return tok1;
}
if S.ch == ch2 {
S.Next();
2008-07-03 00:19:31 -06:00
return tok2;
}
return tok0;
}
func (S *Scanner) Select4 (tok0, tok1, ch2, tok2, tok3 int) int {
if S.ch == '=' {
S.Next();
2008-07-03 00:19:31 -06:00
return tok1;
}
if S.ch == ch2 {
S.Next();
2008-07-03 00:19:31 -06:00
if S.ch == '=' {
S.Next();
2008-07-03 00:19:31 -06:00
return tok3;
}
return tok2;
}
return tok0;
}
func (S *Scanner) Scan () (tok, beg, end int) {
S.SkipWhitespace();
tok = ILLEGAL;
beg = S.pos - 1;
end = beg;
2008-07-03 19:07:03 -06:00
ch := S.ch;
switch {
2008-07-03 00:19:31 -06:00
case is_letter(ch): tok = S.ScanIdentifier();
2008-07-03 19:07:03 -06:00
case digit_val(ch) < 10: tok = S.ScanNumber(false);
2008-07-03 00:19:31 -06:00
default:
S.Next();
switch ch {
case -1: tok = EOF;
case '"': tok = S.ScanString();
case '\'': tok = S.ScanChar();
case '`': tok = S.ScanRawString();
case ':': tok = S.Select2(COLON, DEFINE);
case '.':
2008-07-03 19:07:03 -06:00
if digit_val(S.ch) < 10 {
2008-07-03 17:51:22 -06:00
tok = S.ScanNumber(true);
} else {
tok = PERIOD;
}
case ',': tok = COMMA;
case ';': tok = SEMICOLON;
case '(': tok = LPAREN;
case ')': tok = RPAREN;
case '[': tok = LBRACK;
case ']': tok = RBRACK;
case '{': tok = LBRACE;
case '}': tok = RBRACE;
case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC);
case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC);
case '*': tok = S.Select2(MUL, MUL_ASSIGN);
case '/':
if S.ch == '/' || S.ch == '*' {
S.SkipComment();
// cannot simply return because of 6g bug
tok, beg, end = S.Scan();
return tok, beg, end;
}
tok = S.Select2(QUO, QUO_ASSIGN);
case '%': tok = S.Select2(REM, REM_ASSIGN);
case '^': tok = S.Select2(XOR, XOR_ASSIGN);
case '<': tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN);
case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN);
case '=': tok = S.Select2(ASSIGN, EQL);
case '!': tok = S.Select2(NOT, NEQ);
case '&': tok = S.Select3(AND, AND_ASSIGN, '&', CAND);
case '|': tok = S.Select3(OR, OR_ASSIGN, '|', COR);
default: tok = ILLEGAL;
}
}
end = S.pos - 1;
2008-07-03 19:07:03 -06:00
/*
2008-07-03 19:07:03 -06:00
t.val = tok;
t.beg = beg;
t.end = end;
t.txt = S.src[beg : end];
*/
2008-07-03 19:07:03 -06:00
return tok, beg, end;
}