1
0
mirror of https://github.com/golang/go synced 2024-11-19 05:54:44 -07:00
go/usr/gri/src/scanner.go

669 lines
11 KiB
Go
Raw Normal View History

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package Scanner
// Export declaration for the EOF token constant declared in the const block below.
export EOF;
// Token values. They are numbered consecutively via iota, so the order
// of the entries below determines each token's numeric value and must
// not be changed casually.
const (
	ILLEGAL = iota;
	EOF;

	// identifiers and literals
	IDENT;
	STRING;
	NUMBER;

	// punctuation
	COMMA;
	COLON;
	SEMICOLON;
	PERIOD;

	LPAREN;
	RPAREN;
	LBRACK;
	RBRACK;
	LBRACE;
	RBRACE;

	ASSIGN;
	DEFINE;

	// operators
	INC;
	DEC;
	NOT;

	AND;
	OR;
	XOR;

	ADD;
	SUB;
	MUL;
	QUO;
	REM;

	EQL;
	NEQ;
	LSS;
	LEQ;
	GTR;
	GEQ;

	SHL;
	SHR;

	ADD_ASSIGN;
	SUB_ASSIGN;
	MUL_ASSIGN;
	QUO_ASSIGN;
	REM_ASSIGN;

	AND_ASSIGN;
	OR_ASSIGN;
	XOR_ASSIGN;

	SHL_ASSIGN;
	SHR_ASSIGN;

	CAND;
	COR;

	// keywords
	// KEYWORDS_BEG and KEYWORDS_END are sentinels that bracket the
	// keyword tokens; they are not tokens themselves.
	KEYWORDS_BEG;
	BREAK;
	CASE;
	CONST;
	CONTINUE;
	DEFAULT;
	ELSE;
	EXPORT;
	FALLTHROUGH;
	FALSE;
	FOR;
	FUNC;
	GO;
	GOTO;
	IF;
	IMPORT;
	INTERFACE;
	MAP;
	NEW;
	NIL;
	PACKAGE;
	RANGE;
	RETURN;
	SELECT;
	STRUCT;
	SWITCH;
	TRUE;
	TYPE;
	VAR;
	KEYWORDS_END;
)
2008-07-03 17:51:22 -06:00
// Keywords maps keyword text to its token value. It is nil until Init
// has been called; Open calls Init when it finds the table still nil.
var Keywords *map [string] int;
// TokenName returns a printable name for the token value tok: a
// descriptive word for ILLEGAL, EOF and the literal classes, and the
// source spelling for punctuation, operators and keywords.
// Any other value (including the KEYWORDS_BEG and KEYWORDS_END
// sentinels) yields "???".
export TokenName
func TokenName(tok int) string {
	switch (tok) {
	case ILLEGAL: return "illegal";
	case EOF: return "eof";
	case IDENT: return "ident";
	case STRING: return "string";
	case NUMBER: return "number";

	case COMMA: return ",";
	case COLON: return ":";
	case SEMICOLON: return ";";
	case PERIOD: return ".";

	case LPAREN: return "(";
	case RPAREN: return ")";
	case LBRACK: return "[";
	case RBRACK: return "]";
	case LBRACE: return "{";
	case RBRACE: return "}";

	case ASSIGN: return "=";
	case DEFINE: return ":=";

	case INC: return "++";
	case DEC: return "--";
	case NOT: return "!";

	case AND: return "&";
	case OR: return "|";
	case XOR: return "^";

	case ADD: return "+";
	case SUB: return "-";
	case MUL: return "*";
	case QUO: return "/";
	case REM: return "%";

	case EQL: return "==";
	case NEQ: return "!=";
	case LSS: return "<";
	case LEQ: return "<=";
	case GTR: return ">";
	case GEQ: return ">=";

	case SHL: return "<<";
	case SHR: return ">>";

	case ADD_ASSIGN: return "+=";
	case SUB_ASSIGN: return "-=";
	case MUL_ASSIGN: return "*=";  // was "+=", which duplicated ADD_ASSIGN
	case QUO_ASSIGN: return "/=";
	case REM_ASSIGN: return "%=";

	case AND_ASSIGN: return "&=";
	case OR_ASSIGN: return "|=";
	case XOR_ASSIGN: return "^=";

	case SHL_ASSIGN: return "<<=";
	case SHR_ASSIGN: return ">>=";

	case CAND: return "&&";
	case COR: return "||";

	case BREAK: return "break";
	case CASE: return "case";
	case CONST: return "const";
	case CONTINUE: return "continue";
	case DEFAULT: return "default";
	case ELSE: return "else";
	case EXPORT: return "export";
	case FALLTHROUGH: return "fallthrough";
	case FALSE: return "false";
	case FOR: return "for";
	case FUNC: return "func";
	case GO: return "go";
	case GOTO: return "goto";
	case IF: return "if";
	case IMPORT: return "import";
	case INTERFACE: return "interface";
	case MAP: return "map";
	case NEW: return "new";
	case NIL: return "nil";
	case PACKAGE: return "package";
	case RANGE: return "range";
	case RETURN: return "return";
	case SELECT: return "select";
	case STRUCT: return "struct";
	case SWITCH: return "switch";
	case TRUE: return "true";
	case TYPE: return "type";
	case VAR: return "var";
	}

	return "???";
}
// is_whitespace reports whether ch is a blank, a tab, a carriage
// return or a newline.
func is_whitespace (ch int) bool {
	switch ch {
	case ' ', '\t', '\r', '\n':
		return true;
	}
	return false;
}
// is_letter reports whether ch may appear in an identifier as a
// letter: an ASCII letter, an underscore, or any character value of
// 128 and above.
func is_letter (ch int) bool {
	if ch >= 128 {
		return true;
	}
	if ch == '_' {
		return true;
	}
	if 'a' <= ch && ch <= 'z' {
		return true;
	}
	return 'A' <= ch && ch <= 'Z';
}
2008-07-03 19:07:03 -06:00
// digit_val returns the numeric value of ch interpreted as a digit in
// bases up to 16 ('0'-'9', 'a'-'f', 'A'-'F'). For any other character
// it returns 16, which is larger than every legal digit value, so
// callers can test digit_val(ch) < base.
func digit_val (ch int) int {
	switch {
	case '0' <= ch && ch <= '9':
		return ch - '0';
	case 'a' <= ch && ch <= 'f':
		return ch - 'a' + 10;
	case 'A' <= ch && ch <= 'F':
		return ch - 'A' + 10;
	}
	return 16;
}
2008-07-03 19:07:03 -06:00
// Scanner holds the scanning state for one source string.
// Open initializes it; Next advances it by one character.
export Scanner
type Scanner struct {
src string; // source text being scanned (set by Open)
pos int; // offset in src of the next byte that Next will read
ch int; // one char look-ahead; Next sets it to a negative value at end of file
}
2008-07-03 19:07:03 -06:00
// Token is the record filled in by Scanner.Scan for each token.
export Token
type Token struct {
val int; // token value (one of the token constants)
beg, end int; // source offsets of the token as computed by Scan
txt string; // source text, src[beg : end]
}
2008-07-03 19:07:03 -06:00
// Print writes the token's name, its source range in the form
// "[beg, end[" and its text, followed by a newline.
func (T *Token) Print () {
print TokenName(T.val), " [", T.beg, ", ", T.end, "[ ", T.txt, "\n";
}
2008-07-03 17:51:22 -06:00
// Read the next Unicode char into S.ch.
// S.ch < 0 means end-of-file.
//
func (S *Scanner) Next () {
const (
Bit1 = 7;
Bitx = 6;
Bit2 = 5;
Bit3 = 4;
Bit4 = 3;
2008-07-03 17:51:22 -06:00
// TODO 6g constant evaluation incomplete
T1 = 0x00; // (1 << (Bit1 + 1) - 1) ^ 0xFF; // 0000 0000
Tx = 0x80; // (1 << (Bitx + 1) - 1) ^ 0xFF; // 1000 0000
T2 = 0xC0; // (1 << (Bit2 + 1) - 1) ^ 0xFF; // 1100 0000
T3 = 0xE0; // (1 << (Bit3 + 1) - 1) ^ 0xFF; // 1110 0000
T4 = 0xF0; // (1 << (Bit4 + 1) - 1) ^ 0xFF; // 1111 0000
Rune1 = 1 << (Bit1 + 0*Bitx) - 1; // 0000 0000 0111 1111
Rune2 = 1 << (Bit2 + 1*Bitx) - 1; // 0000 0111 1111 1111
Rune3 = 1 << (Bit3 + 2*Bitx) - 1; // 1111 1111 1111 1111
Maskx = 0x3F; // 1 << Bitx - 1; // 0011 1111
Testx = 0xC0; // Maskx ^ 0xFF; // 1100 0000
Bad = 0xFFFD; // Runeerror
);
src := S.src; // TODO only needed because of 6g bug
lim := len(src);
pos := S.pos;
// 1-byte sequence
// 0000-007F => T1
if pos >= lim {
2008-07-03 17:51:22 -06:00
S.ch = -1; // end of file
return;
}
2008-07-03 17:51:22 -06:00
c0 := int(src[pos]);
pos++;
if c0 < Tx {
S.ch = c0;
2008-07-03 17:51:22 -06:00
S.pos = pos;
return;
}
// 2-byte sequence
// 0080-07FF => T2 Tx
2008-07-03 17:51:22 -06:00
if pos >= lim {
goto bad;
}
2008-07-03 17:51:22 -06:00
c1 := int(src[pos]) ^ Tx;
pos++;
if c1 & Testx != 0 {
goto bad;
}
if c0 < T3 {
if c0 < T2 {
goto bad;
}
r := (c0 << Bitx | c1) & Rune2;
if r <= Rune1 {
goto bad;
}
S.ch = r;
2008-07-03 17:51:22 -06:00
S.pos = pos;
return;
}
2008-07-03 17:51:22 -06:00
// 3-byte sequence
// 0800-FFFF => T3 Tx Tx
2008-07-03 17:51:22 -06:00
if pos >= lim {
goto bad;
}
2008-07-03 17:51:22 -06:00
c2 := int(src[pos]) ^ Tx;
pos++;
if c2 & Testx != 0 {
goto bad;
}
if c0 < T4 {
r := (((c0 << Bitx | c1) << Bitx) | c2) & Rune3;
if r <= Rune2 {
goto bad;
}
S.ch = r;
2008-07-03 17:51:22 -06:00
S.pos = pos;
return;
}
// bad encoding
bad:
S.ch = Bad;
S.pos += 1;
return;
}
// Init allocates the Keywords table and enters every keyword token,
// keyed by the spelling TokenName reports for it.
func Init () {
	Keywords = new(map [string] int);

	// KEYWORDS_BEG and KEYWORDS_END are sentinels, not keywords:
	// TokenName maps both to "???", so including them would add a
	// bogus "???" entry to the table. Iterate strictly between them.
	for i := KEYWORDS_BEG + 1; i < KEYWORDS_END; i++ {
		Keywords[TokenName(i)] = i;
	}
}
// Open prepares S for scanning src from its first byte and reads the
// first character into S.ch. The keyword table is built on first use.
func (S *Scanner) Open (src string) {
	S.src = src;
	S.pos = 0;

	if Keywords == nil {
		Init();
	}

	S.Next();
}
2008-07-03 17:51:22 -06:00
// Expect consumes the current character if it equals ch and panics
// with an "expected ... found ..." message otherwise.
func (S *Scanner) Expect (ch int) {
	if S.ch == ch {
		S.Next();
		return;
	}
	panic "expected ", string(ch), " found ", string(S.ch);
}
// SkipWhitespace advances past any run of whitespace characters,
// leaving S.ch at the first non-whitespace character (or end of file).
func (S *Scanner) SkipWhitespace () {
	for ch := S.ch; is_whitespace(ch); ch = S.ch {
		S.Next();
	}
}
func (S *Scanner) SkipComment () {
if S.ch == '/' {
// comment
2008-07-03 17:51:22 -06:00
S.Next();
for S.ch != '\n' && S.ch >= 0 {
S.Next();
}
} else {
/* comment */
2008-07-03 17:51:22 -06:00
S.Next();
for S.ch >= 0 {
ch := S.ch;
S.Next();
2008-07-03 17:51:22 -06:00
if ch == '*' && S.ch == '/' {
S.Next();
return;
}
}
panic "comment not terminated";
}
}
func (S *Scanner) ScanIdentifier () int {
beg := S.pos - 1;
2008-07-03 19:07:03 -06:00
for is_letter(S.ch) || digit_val(S.ch) < 10 {
S.Next();
}
end := S.pos - 1;
var tok int;
var present bool;
tok, present = Keywords[S.src[beg : end]];
if !present {
tok = IDENT;
}
return tok;
}
2008-07-03 17:51:22 -06:00
func (S *Scanner) ScanMantissa (base int) {
2008-07-03 19:07:03 -06:00
for digit_val(S.ch) < base {
S.Next();
2008-07-03 00:19:31 -06:00
}
}
2008-07-03 17:51:22 -06:00
func (S *Scanner) ScanNumber (seen_decimal_point bool) int {
if seen_decimal_point {
S.ScanMantissa(10);
goto exponent;
}
if S.ch == '0' {
2008-07-03 19:07:03 -06:00
// TODO bug: doesn't accept 09.0 !
2008-07-03 17:51:22 -06:00
// int
S.Next();
if S.ch == 'x' || S.ch == 'X' {
// hexadecimal int
S.Next();
S.ScanMantissa(16);
} else {
// octal int
S.ScanMantissa(8);
}
return NUMBER;
}
// decimal int or float
S.ScanMantissa(10);
2008-07-03 00:19:31 -06:00
if S.ch == '.' {
2008-07-03 17:51:22 -06:00
// float
S.Next();
2008-07-03 17:51:22 -06:00
S.ScanMantissa(10)
2008-07-03 00:19:31 -06:00
}
2008-07-03 17:51:22 -06:00
exponent:
2008-07-03 00:19:31 -06:00
if S.ch == 'e' || S.ch == 'E' {
2008-07-03 17:51:22 -06:00
// float
S.Next();
2008-07-03 00:19:31 -06:00
if S.ch == '-' || S.ch == '+' {
S.Next();
2008-07-03 00:19:31 -06:00
}
2008-07-03 17:51:22 -06:00
S.ScanMantissa(10);
}
2008-07-03 00:19:31 -06:00
return NUMBER;
}
2008-07-03 19:07:03 -06:00
func (S *Scanner) ScanDigits(n int, base int) {
for digit_val(S.ch) < base {
S.Next();
2008-07-03 19:07:03 -06:00
n--;
}
2008-07-03 19:07:03 -06:00
if n > 0 {
panic "illegal char escape";
}
}
2008-07-03 17:51:22 -06:00
func (S *Scanner) ScanEscape () string {
// TODO: fix this routine
2008-07-03 17:51:22 -06:00
ch := S.ch;
S.Next();
switch (ch) {
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
return string(ch);
case '0', '1', '2', '3', '4', '5', '6', '7':
2008-07-03 19:07:03 -06:00
S.ScanDigits(3 - 1, 8); // 1 char already read
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
case 'x':
2008-07-03 19:07:03 -06:00
S.ScanDigits(2, 16);
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
case 'u':
2008-07-03 19:07:03 -06:00
S.ScanDigits(4, 16);
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
case 'U':
2008-07-03 19:07:03 -06:00
S.ScanDigits(8, 16);
2008-07-03 17:51:22 -06:00
return ""; // TODO fix this
default:
panic "illegal char escape";
}
}
2008-07-03 00:19:31 -06:00
func (S *Scanner) ScanChar () int {
2008-07-03 17:51:22 -06:00
// '\'' already consumed
ch := S.ch;
S.Next();
if ch == '\\' {
S.ScanEscape();
}
2008-07-03 19:07:03 -06:00
S.Expect('\'');
2008-07-03 00:19:31 -06:00
return NUMBER;
}
2008-07-03 00:19:31 -06:00
func (S *Scanner) ScanString () int {
2008-07-03 17:51:22 -06:00
// '"' already consumed
for S.ch != '"' {
ch := S.ch;
S.Next();
if ch == '\n' || ch < 0 {
panic "string not terminated";
}
2008-07-03 17:51:22 -06:00
if ch == '\\' {
S.ScanEscape();
}
}
2008-07-03 17:51:22 -06:00
S.Next();
2008-07-03 00:19:31 -06:00
return STRING;
}
2008-07-03 00:19:31 -06:00
func (S *Scanner) ScanRawString () int {
2008-07-03 17:51:22 -06:00
// '`' already consumed
for S.ch != '`' {
ch := S.ch;
S.Next();
if ch == '\n' || ch < 0 {
panic "string not terminated";
}
}
2008-07-03 17:51:22 -06:00
S.Next();
2008-07-03 00:19:31 -06:00
return STRING;
}
func (S *Scanner) Select2 (tok0, tok1 int) int {
if S.ch == '=' {
S.Next();
return tok1;
2008-07-03 00:19:31 -06:00
}
return tok0;
}
func (S *Scanner) Select3 (tok0, tok1, ch2, tok2 int) int {
if S.ch == '=' {
S.Next();
2008-07-03 00:19:31 -06:00
return tok1;
}
if S.ch == ch2 {
S.Next();
2008-07-03 00:19:31 -06:00
return tok2;
}
return tok0;
}
func (S *Scanner) Select4 (tok0, tok1, ch2, tok2, tok3 int) int {
if S.ch == '=' {
S.Next();
2008-07-03 00:19:31 -06:00
return tok1;
}
if S.ch == ch2 {
S.Next();
2008-07-03 00:19:31 -06:00
if S.ch == '=' {
S.Next();
2008-07-03 00:19:31 -06:00
return tok3;
}
return tok2;
}
return tok0;
}
2008-07-03 19:07:03 -06:00
func (S *Scanner) Scan (t *Token) (tok, beg, end int) {
S.SkipWhitespace();
var tok int = ILLEGAL;
var beg int = S.pos - 1;
var end int = beg;
2008-07-03 19:07:03 -06:00
ch := S.ch;
switch {
2008-07-03 00:19:31 -06:00
case is_letter(ch): tok = S.ScanIdentifier();
2008-07-03 19:07:03 -06:00
case digit_val(ch) < 10: tok = S.ScanNumber(false);
2008-07-03 00:19:31 -06:00
default:
S.Next();
switch ch {
case -1: tok = EOF;
case '"': tok = S.ScanString();
case '\'': tok = S.ScanChar();
case '`': tok = S.ScanRawString();
case ':': tok = S.Select2(COLON, DEFINE);
case '.':
2008-07-03 19:07:03 -06:00
if digit_val(S.ch) < 10 {
2008-07-03 17:51:22 -06:00
tok = S.ScanNumber(true);
} else {
tok = PERIOD;
}
case ',': tok = COMMA;
case ';': tok = SEMICOLON;
case '(': tok = LPAREN;
case ')': tok = RPAREN;
case '[': tok = LBRACK;
case ']': tok = RBRACK;
case '{': tok = LBRACE;
case '}': tok = RBRACE;
case '+': tok = S.Select3(ADD, ADD_ASSIGN, '+', INC);
case '-': tok = S.Select3(SUB, SUB_ASSIGN, '-', DEC);
case '*': tok = S.Select2(MUL, MUL_ASSIGN);
case '/':
if S.ch == '/' || S.ch == '*' {
S.SkipComment();
// cannot simply return because of 6g bug
2008-07-03 19:07:03 -06:00
tok, beg, end = S.Scan(t);
return tok, beg, end;
}
tok = S.Select2(QUO, QUO_ASSIGN);
case '%': tok = S.Select2(REM, REM_ASSIGN);
case '^': tok = S.Select2(XOR, XOR_ASSIGN);
case '<': tok = S.Select4(LSS, LEQ, '<', SHL, SHL_ASSIGN);
case '>': tok = S.Select4(GTR, GEQ, '>', SHR, SHR_ASSIGN);
case '=': tok = S.Select2(ASSIGN, EQL);
case '!': tok = S.Select2(NOT, NEQ);
case '&': tok = S.Select3(AND, AND_ASSIGN, '&', CAND);
case '|': tok = S.Select3(OR, OR_ASSIGN, '|', COR);
default: tok = ILLEGAL;
}
}
end = S.pos - 1;
2008-07-03 19:07:03 -06:00
t.val = tok;
t.beg = beg;
t.end = end;
t.txt = S.src[beg : end];
return tok, beg, end;
}