Basic EBNF package:

- parsing of EBNF grammars - basic consistency checks R=rsc DELTA=695 (695 added, 0 deleted, 0 changed) OCL=31479 CL=31516
2024-11-20 05:04:43 -07:00 · 2009-07-13 10:10:56 -07:00 · 2009-07-13 10:10:56 -07:00 · ef4347f19e
commit ef4347f19e
parent 092d6290d0
6 changed files with 702 additions and 0 deletions
--- a/src/pkg/Make.deps
+++ b/src/pkg/Make.deps
@ -13,6 +13,7 @@ crypto/hmac.install: crypto/md5.install crypto/sha1.install hash.install os.inst
 crypto/md5.install: hash.install os.install
 crypto/sha1.install: hash.install os.install
 datafmt.install: bytes.install container/vector.install fmt.install go/scanner.install go/token.install io.install os.install reflect.install runtime.install strconv.install strings.install
 ebnf.install: container/vector.install fmt.install go/scanner.install go/token.install os.install strconv.install strings.install unicode.install utf8.install
 exec.install: os.install strings.install
 exvar.install: bytes.install fmt.install http.install io.install log.install strconv.install sync.install
 flag.install: fmt.install os.install strconv.install
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@ -27,6 +27,7 @@ DIRS=\
 	crypto/md5\
 	crypto/sha1\
 	datafmt\
 	ebnf\
 	exec\
 	exvar\
 	flag\
@ -84,6 +85,7 @@ TEST=\
 	crypto/md5\
 	crypto/sha1\
 	datafmt\
 	ebnf\
 	exec\
 	exvar\
 	flag\
--- a/src/pkg/ebnf/Makefile
+++ b/src/pkg/ebnf/Makefile
@ -0,0 +1,69 @@
 # Copyright 2009 The Go Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 # DO NOT EDIT.  Automatically generated by gobuild.
 # gobuild -m ebnf.go parser.go >Makefile
 D=
 include $(GOROOT)/src/Make.$(GOARCH)
 AR=gopack
 default: packages
 clean:
 	rm -rf *.[$(OS)] *.a [$(OS)].out _obj
 test: packages
 	gotest
 coverage: packages
 	gotest
 	6cov -g $$(pwd) | grep -v '_test\.go:'
 %.$O: %.go
 	$(GC) -I_obj $*.go
 %.$O: %.c
 	$(CC) $*.c
 %.$O: %.s
 	$(AS) $*.s
 O1=\
 	ebnf.$O\
 O2=\
 	parser.$O\
 phases: a1 a2
 _obj$D/ebnf.a: phases
 a1: $(O1)
 	$(AR) grc _obj$D/ebnf.a ebnf.$O
 	rm -f $(O1)
 a2: $(O2)
 	$(AR) grc _obj$D/ebnf.a parser.$O
 	rm -f $(O2)
 newpkg: clean
 	mkdir -p _obj$D
 	$(AR) grc _obj$D/ebnf.a
 $(O1): newpkg
 $(O2): a1
 $(O3): a2
 nuke: clean
 	rm -f $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a
 packages: _obj$D/ebnf.a
 install: packages
 	test -d $(GOROOT)/pkg && mkdir -p $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D
 	cp _obj$D/ebnf.a $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a
--- a/src/pkg/ebnf/ebnf.go
+++ b/src/pkg/ebnf/ebnf.go
@ -0,0 +1,315 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // A library for EBNF grammars. The input is text ([]byte) satisfying
 // the following grammar (represented itself in EBNF):
 //
 //	Production  = name "=" Expression "." .
 //	Expression  = Alternative { "|" Alternative } .
 //	Alternative = Term { Term } .
 //	Term        = name | token [ "..." token ] | Group | Option | Repetition .
 //	Group       = "(" Expression ")" .
 //	Option      = "[" Expression "]" .
 //	Repetition  = "{" Expression "}" .
 //
 // A name is a Go identifier, a token is a Go string, and comments
 // and white space follow the same rules as for the Go language.
 // Production names starting with an uppercase Unicode letter denote
 // non-terminal productions (i.e., productions which allow white-space
 // and comments between tokens); all other production names denote
 // lexical productions.
 //
 package ebnf
 import (
 	"container/vector";
 	"fmt";
 	"go/scanner";
 	"go/token";
 	"os";
 	"strconv";
 	"strings";
 	"unicode";
 	"utf8";
 )
 // ----------------------------------------------------------------------------
 // Internal representation
 // The node types below make up the parsed form of a grammar. Every
 // expression node provides Pos, either through its own method or through
 // an embedded token.Position.
 type (
 	// An Expression node represents a production expression.
 	Expression interface {
 		// Pos is the position of the first character of the syntactic construct
 		Pos() token.Position;
 	};
 	// An Alternative node represents a non-empty list of alternative expressions.
 	Alternative []Expression;  // x | y | z
 	// A Sequence node represents a non-empty list of sequential expressions.
 	Sequence []Expression;  // x y z
 	// A Name node represents a production name.
 	Name struct {
 		token.Position;
 		String string;
 	};
 	// A Token node represents a literal.
 	Token struct {
 		token.Position;
 		String string;
 	};
 	// A Range node represents a range of characters.
 	Range struct {
 		Begin, End *Token;  // begin ... end
 	};
 	// A Group node represents a grouped expression.
 	Group struct {
 		token.Position;
 		Body Expression;  // (body)
 	};
 	// An Option node represents an optional expression.
 	Option struct {
 		token.Position;
 		Body Expression;  // [body]
 	};
 	// A Repetition node represents a repeated expression.
 	Repetition struct {
 		token.Position;
 		Body Expression;  // {body}
 	};
 	// A Production node represents an EBNF production.
 	Production struct {
 		Name *Name;
 		Expr Expression;
 	};
 	// A Grammar is a set of EBNF productions. The map
 	// is indexed by production name.
 	//
 	Grammar map [string] *Production;
 )
 // Pos returns the position of the first alternative in x.
 func (x Alternative) Pos() token.Position {
 	first := x[0];  // the parser always generates non-empty Alternative
 	return first.Pos();
 }
 // Pos returns the position of the first expression in x.
 func (x Sequence) Pos() token.Position {
 	first := x[0];  // the parser always generates non-empty Sequences
 	return first.Pos();
 }
 // Pos returns the position of the token that begins the range.
 func (x Range) Pos() token.Position {
 	begin := x.Begin;
 	return begin.Pos();
 }
 // Pos returns the position of the production's name.
 func (p *Production) Pos() token.Position {
 	name := p.Name;
 	return name.Pos();
 }
 // ----------------------------------------------------------------------------
 // Error handling
 // TODO(gri) This is the same code as in datafmt and go/parser.
 //           Should factor this out as part of some parsing framework
 //           that could also deal with reading various input sources.
 // Error describes an individual error. The position Pos, if valid,
 // indicates the grammar source position the error relates to. The
 // error is specified with the Msg string.
 //
 type Error struct {
 	Pos token.Position;
 	Msg string;
 }
 // String returns the error message. If the error contains (line, column)
 // position information, the message is prefixed with "line:column: ";
 // otherwise it is prefixed with a single blank " ".
 //
 func (e *Error) String() string {
 	if !e.Pos.IsValid() {
 		// no usable position: keep the single-blank prefix
 		return " " + e.Msg;
 	}
 	return fmt.Sprintf("%d:%d: ", e.Pos.Line, e.Pos.Column) + e.Msg;
 }
 // An ErrorList is a list of errors encountered during parsing.
 type ErrorList []*Error
 // Len, Swap and Less order the list by source offset; together with
 // String they let an ErrorList be sorted and be returned as an os.Error.
 func (p ErrorList) Len() int {
 	return len(p);
 }
 func (p ErrorList) Swap(i, j int) {
 	p[j], p[i] = p[i], p[j];
 }
 func (p ErrorList) Less(i, j int) bool {
 	return p[j].Pos.Offset > p[i].Pos.Offset;
 }
 // String returns the first error's message, followed by a count of the
 // remaining errors if there are any. An empty list yields a fixed
 // placeholder message.
 func (p ErrorList) String() string {
 	n := len(p);
 	if n == 0 {
 		return "unspecified error";
 	}
 	first := p[0].String();
 	if n == 1 {
 		return first;
 	}
 	return fmt.Sprintf("%s (and %d more errors)", first, n - 1);
 }
 // ----------------------------------------------------------------------------
 // Grammar verification
 // isLexical reports whether name denotes a lexical production, i.e.
 // whether its first rune is not an uppercase Unicode letter. Only the
 // first rune is inspected; the decoded width is not needed.
 func isLexical(name string) bool {
 	ch, _ := utf8.DecodeRuneInString(name);
 	return !unicode.IsUpper(ch);
 }
 // A verifier holds the state of one grammar verification run.
 type verifier struct {
 	errors vector.Vector;  // collected *Error values, in the order reported
 	worklist vector.Vector;  // *Production values reached but not yet verified
 	reached Grammar;  // set of productions reached from (and including) the root production
 	grammar Grammar;  // the grammar being verified
 }
 // error records a verification error with message msg at position pos.
 func (v *verifier) error(pos token.Position, msg string) {
 	e := &Error{pos, msg};
 	v.errors.Push(e);
 }
 // makeErrorList converts a vector of *Error values into an ErrorList.
 // It returns nil if the vector is empty, so that callers see a nil
 // os.Error when nothing went wrong.
 func makeErrorList(v *vector.Vector) os.Error {
 	n := v.Len();
 	if n == 0 {
 		return nil;
 	}
 	list := make(ErrorList, n);
 	for i := 0; i < n; i++ {
 		list[i] = v.At(i).(*Error);
 	}
 	return list;
 }
 // push schedules prod for verification unless it has been reached
 // before, and marks it as reached.
 func (v *verifier) push(prod *Production) {
 	key := prod.Name.String;
 	if _, seen := v.reached[key]; seen {
 		return;
 	}
 	v.worklist.Push(prod);
 	v.reached[key] = prod;
 }
 // verifyChar checks that token x consists of exactly one character and
 // returns that character. Otherwise it reports an error and returns 0.
 func (v *verifier) verifyChar(x *Token) int {
 	text := x.String;
 	if utf8.RuneCountInString(text) == 1 {
 		r, _ := utf8.DecodeRuneInString(text);
 		return r;
 	}
 	v.error(x.Pos(), "single char expected, found " + text);
 	return 0;
 }
 // verifyExpr checks expr and, recursively, all of its sub-expressions.
 // lexical reports whether expr belongs to a lexical production; inside
 // such a production, references to non-lexical productions are errors.
 // Referenced productions are pushed onto the worklist.
 func (v *verifier) verifyExpr(expr Expression, lexical bool) {
 	switch node := expr.(type) {
 	case nil:
 		// empty expression
 	case *Token:
 		// nothing to do for now
 	case *Name:
 		// a production with this name must exist;
 		// add it to the worklist if not yet processed
 		prod, found := v.grammar[node.String];
 		if !found {
 			v.error(node.Pos(), "missing production " + node.String);
 		} else {
 			v.push(prod);
 		}
 		// within a lexical production references
 		// to non-lexical productions are invalid
 		if lexical && !isLexical(node.String) {
 			v.error(node.Pos(), "reference to non-lexical production " + node.String);
 		}
 	case *Range:
 		lo := v.verifyChar(node.Begin);
 		hi := v.verifyChar(node.End);
 		if hi <= lo {
 			v.error(node.Pos(), "decreasing character range");
 		}
 	case Alternative:
 		for i := 0; i < len(node); i++ {
 			v.verifyExpr(node[i], lexical);
 		}
 	case Sequence:
 		for i := 0; i < len(node); i++ {
 			v.verifyExpr(node[i], lexical);
 		}
 	case *Group:
 		v.verifyExpr(node.Body, lexical);
 	case *Option:
 		v.verifyExpr(node.Body, lexical);
 	case *Repetition:
 		v.verifyExpr(node.Body, lexical);
 	default:
 		panic("unreachable");
 	}
 }
 // verify checks grammar, beginning at the production named start, and
 // records every problem found in v.errors. It reports a missing start
 // production, missing or wrongly referenced productions found while
 // walking the expressions reachable from start, and productions that
 // are never reached.
 func (v *verifier) verify(grammar Grammar, start string) {
 	// initialize verifier before anything can be reported, so that the
 	// "no start production" error is recorded in initialized state too
 	v.errors.Init(0);
 	v.worklist.Init(0);
 	v.reached = make(Grammar);
 	v.grammar = grammar;
 	// find root production
 	root, found := grammar[start];
 	if !found {
 		var noPos token.Position;
 		v.error(noPos, "no start production " + start);
 		return;
 	}
 	// work through the worklist; verifyExpr pushes newly reached
 	// productions, so the loop ends once nothing new is referenced
 	v.push(root);
 	for v.worklist.Len() > 0 {
 		prod := v.worklist.Pop().(*Production);
 		v.verifyExpr(prod.Expr, isLexical(prod.Name.String));
 	}
 	// check if all productions were reached
 	if len(v.reached) < len(v.grammar) {
 		for name, prod := range v.grammar {
 			if _, found := v.reached[name]; !found {
 				v.error(prod.Pos(), name + " is unreachable");
 			}
 		}
 	}
 }
 // Verify checks that:
 //	- all productions used are defined
 //	- all productions defined are used when beginning at start
 //	- lexical productions refer only to other lexical productions
 //
 // It returns nil if no problem was found, and the list of errors otherwise.
 //
 func Verify(grammar Grammar, start string) os.Error {
 	v := new(verifier);
 	v.verify(grammar, start);
 	errs := makeErrorList(&v.errors);
 	return errs;
 }
--- a/src/pkg/ebnf/ebnf_test.go
+++ b/src/pkg/ebnf/ebnf_test.go
@ -0,0 +1,75 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package ebnf
 import (
 	"ebnf";
 	"io";
 	"strings";
 	"testing";
 )
 // grammars lists EBNF sources that TestGrammars passes to check; each
 // one is expected to parse and to verify with "Program" as the start
 // production.
 var grammars = []string {
 	// empty production body
 	`Program = .
 	`,
 	// reference to a lexical production
 	`Program = foo .
 	foo = "foo" .
 	`,
 	// alternatives and sequences of tokens
 	`Program = "a" | "b" "c" .
 	`,
 	// character range
 	`Program = "a" ... "z" .
 	`,
 	// several productions using repetition, grouping and alternatives
 	`Program = Song .
 	 Song = { Note } .
 	 Note = Do | (Re | Mi | Fa | So | La) | Ti .
 	 Do = "c" .
 	 Re = "d" .
 	 Mi = "e" .
 	 Fa = "f" .
 	 So = "g" .
 	 La = "a" .
 	 Ti = ti .
 	 ti = "b" .
 	`,
 }
 // check parses src and verifies the resulting grammar with "Program"
 // as the start production. A failure of either step is reported on t;
 // verification is attempted even if parsing reported errors.
 func check(t *testing.T, src []byte) {
 	grammar, perr := Parse(src);
 	if perr != nil {
 		t.Errorf("Parse(%s) failed: %v", src, perr);
 	}
 	verr := Verify(grammar, "Program");
 	if verr != nil {
 		t.Errorf("Verify(%s) failed: %v", src, verr);
 	}
 }
 // TestGrammars runs check on every in-line grammar in grammars.
 func TestGrammars(t *testing.T) {
 	for i := 0; i < len(grammars); i++ {
 		text := grammars[i];
 		check(t, strings.Bytes(text));
 	}
 }
 // files lists the names of grammar files that TestFiles reads and
 // checks. The list is currently empty, so TestFiles does nothing.
 var files = []string {
 	// TODO(gri) add some test files
 }
 // TestFiles reads every file named in files and runs check on its
 // contents. A file that cannot be read aborts the test.
 func TestFiles(t *testing.T) {
 	for i := 0; i < len(files); i++ {
 		data, err := io.ReadFile(files[i]);
 		if err != nil {
 			t.Fatal(err);
 		}
 		check(t, data);
 	}
 }
--- a/src/pkg/ebnf/parser.go
+++ b/src/pkg/ebnf/parser.go
@ -0,0 +1,240 @@
 // Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package ebnf
 import (
 	"container/vector";
 	"ebnf";
 	"fmt";
 	"go/scanner";
 	"go/token";
 	"os";
 	"strconv";
 	"strings";
 	"unicode";
 	"utf8";
 )
 // A parser holds the state for parsing one EBNF source.
 type parser struct {
 	errors vector.Vector;  // collected *Error values
 	scanner scanner.Scanner;  // Go scanner supplying the tokens
 	pos token.Position;  // token position
 	tok token.Token;  // one token look-ahead
 	lit []byte;  // token literal
 }
 // next advances to the next token, updating p.pos, p.tok and p.lit.
 // Go keywords have no special meaning in EBNF and are delivered as
 // plain identifiers.
 func (p *parser) next() {
 	pos, tok, lit := p.scanner.Scan();
 	if tok.IsKeyword() {
 		// TODO Should keyword mapping always happen outside scanner?
 		//      Or should there be a flag to scanner to enable keyword mapping?
 		tok = token.IDENT;
 	}
 	p.pos, p.tok, p.lit = pos, tok, lit;
 }
 // init prepares p for parsing src: it initializes the error vector,
 // initializes the scanner with src (passing p itself as the second
 // argument), and reads the first token. The error vector is set up
 // first because reading a token may already call p.Error.
 func (p *parser) init(src []byte) {
 	p.errors.Init(0);
 	p.scanner.Init(src, p, 0);
 	p.next();  // initializes pos, tok, lit
 }
 // Error records an error with message msg at position pos. The parser
 // passes itself to the scanner, which reports its errors through this
 // method as well.
 func (p *parser) Error(pos token.Position, msg string) {
 	// Do not collect errors that are on the same line as the previous
 	// error to reduce the number of spurious errors due to incorrect
 	// parser synchronization.
 	n := p.errors.Len();
 	if n != 0 && p.errors.Last().(*Error).Pos.Line == pos.Line {
 		return;
 	}
 	p.errors.Push(&Error{pos, msg});
 }
 // errorExpected reports that msg was expected at pos. If pos is the
 // position of the current token, the message also says which token
 // (and, for literals, which literal text) was found instead.
 func (p *parser) errorExpected(pos token.Position, msg string) {
 	text := "expected " + msg;
 	// only at the current position do we know what was found
 	atCurrent := pos.Offset == p.pos.Offset;
 	if atCurrent {
 		text += ", found '" + p.tok.String() + "'";
 	}
 	if atCurrent && p.tok.IsLiteral() {
 		text += " " + string(p.lit);
 	}
 	p.Error(pos, text);
 }
 // expect reports an error if the current token is not tok, then
 // advances to the next token in either case. It returns the position
 // of the token that was current on entry.
 func (p *parser) expect(tok token.Token) token.Position {
 	start := p.pos;
 	mismatch := p.tok != tok;
 	if mismatch {
 		p.errorExpected(start, "'" + tok.String() + "'");
 	}
 	p.next();  // make progress in any case
 	return start;
 }
 // parseIdentifier consumes the current token, which is expected to be
 // an identifier, and returns a Name node built from its position and
 // literal text. The node is built before advancing because expect
 // replaces p.pos and p.lit.
 func (p *parser) parseIdentifier() *Name {
 	ident := &Name{p.pos, string(p.lit)};
 	p.expect(token.IDENT);
 	return ident;
 }
 // parseToken consumes the current token, which is expected to be a
 // string literal, and returns a Token node holding its position and
 // unquoted value. If the current token is not a string, an error is
 // reported and the returned Token has an empty value.
 func (p *parser) parseToken() *Token {
 	pos := p.pos;
 	value := "";
 	if p.tok == token.STRING {
 		// Unquote may fail with an error, but only if the scanner found
 		// an illegal string in the first place. In this case the error
 		// has already been reported, so it is deliberately dropped here.
 		value, _ = strconv.Unquote(string(p.lit));
 		p.next();
 	} else {
 		p.expect(token.STRING);
 	}
 	return &Token{pos, value};
 }
 func (p *parser) parseExpression() Expression
 // parseTerm parses a single term: a name, a token or token range, a
 // group, an option, or a repetition. It returns nil, without consuming
 // anything, if the current token cannot start a term.
 func (p *parser) parseTerm() (x Expression) {
 	start := p.pos;
 	switch p.tok {
 	case token.IDENT:
 		x = p.parseIdentifier();
 	case token.STRING:
 		first := p.parseToken();
 		if p.tok == token.ELLIPSIS {
 			// first ... last
 			p.next();
 			last := p.parseToken();
 			x = &Range{first, last};
 		} else {
 			x = first;
 		}
 	case token.LPAREN:
 		p.next();
 		body := p.parseExpression();
 		x = &Group{start, body};
 		p.expect(token.RPAREN);
 	case token.LBRACK:
 		p.next();
 		body := p.parseExpression();
 		x = &Option{start, body};
 		p.expect(token.RBRACK);
 	case token.LBRACE:
 		p.next();
 		body := p.parseExpression();
 		x = &Repetition{start, body};
 		p.expect(token.RBRACE);
 	}
 	return x;
 }
 // parseSequence parses as many consecutive terms as possible. It
 // returns nil if there is no term, the term itself if there is exactly
 // one, and a Sequence node otherwise.
 func (p *parser) parseSequence() Expression {
 	var terms vector.Vector;
 	terms.Init(0);
 	for {
 		term := p.parseTerm();
 		if term == nil {
 			break;
 		}
 		terms.Push(term);
 	}
 	// no need for a sequence if there are fewer than 2 terms
 	n := terms.Len();
 	if n == 0 {
 		return nil;
 	}
 	if n == 1 {
 		return terms.At(0).(Expression);
 	}
 	// convert the collected terms into a sequence
 	seq := make(Sequence, n);
 	for i := 0; i < n; i++ {
 		seq[i] = terms.At(i).(Expression);
 	}
 	return seq;
 }
 // parseExpression parses sequences separated by "|". Empty sequences
 // are dropped. It returns nil if nothing remains, the single sequence
 // if exactly one remains, and an Alternative node otherwise.
 func (p *parser) parseExpression() Expression {
 	var alts vector.Vector;
 	alts.Init(0);
 	for more := true; more; {
 		if seq := p.parseSequence(); seq != nil {
 			alts.Push(seq);
 		}
 		more = p.tok == token.OR;
 		if more {
 			p.next();  // consume "|"
 		}
 	}
 	// no need for an Alternative node if there are fewer than 2 entries
 	n := alts.Len();
 	if n == 0 {
 		return nil;
 	}
 	if n == 1 {
 		return alts.At(0).(Expression);
 	}
 	// convert the collected sequences into an Alternative node
 	result := make(Alternative, n);
 	for i := 0; i < n; i++ {
 		result[i] = alts.At(i).(Expression);
 	}
 	return result;
 }
 // parseProduction parses one production of the form
 //	name "=" Expression "." .
 // and returns the corresponding Production node.
 func (p *parser) parseProduction() *Production {
 	prod := new(Production);
 	prod.Name = p.parseIdentifier();
 	p.expect(token.ASSIGN);
 	prod.Expr = p.parseExpression();
 	p.expect(token.PERIOD);
 	return prod;
 }
 // parse parses all productions in src and returns them as a Grammar
 // indexed by production name. Syntax errors and repeated declarations
 // are recorded in p.errors; for a repeated name the first declaration
 // is kept.
 func (p *parser) parse(src []byte) Grammar {
 	// initialize parser: error vector, scanner, and the first token
 	p.init(src);
 	grammar := make(Grammar);
 	for p.tok != token.EOF {
 		prod := p.parseProduction();
 		name := prod.Name.String;
 		if _, found := grammar[name]; found {
 			p.Error(prod.Pos(), name + " declared already");
 		} else {
 			grammar[name] = prod;
 		}
 	}
 	return grammar;
 }
 // Parse parses a set of EBNF productions from source src.
 // It returns the productions found, indexed by name, together
 // with an error that is non-nil if the syntax is incorrect or
 // if a production is declared more than once.
 //
 func Parse(src []byte) (Grammar, os.Error) {
 	p := new(parser);
 	grammar := p.parse(src);
 	errs := makeErrorList(&p.errors);
 	return grammar, errs;
 }