From ef4347f19ec70d18abba176e823e5d0d26694360 Mon Sep 17 00:00:00 2001
From: Robert Griesemer <gri@golang.org>
Date: Mon, 13 Jul 2009 10:10:56 -0700
Subject: [PATCH] Basic EBNF package: - parsing of EBNF grammars - basic
 consistency checks

R=rsc
DELTA=695  (695 added, 0 deleted, 0 changed)
OCL=31479
CL=31516
---
 src/pkg/Make.deps         |   1 +
 src/pkg/Makefile          |   2 +
 src/pkg/ebnf/Makefile     |  69 +++++++++
 src/pkg/ebnf/ebnf.go      | 315 ++++++++++++++++++++++++++++++++++++++
 src/pkg/ebnf/ebnf_test.go |  75 +++++++++
 src/pkg/ebnf/parser.go    | 240 +++++++++++++++++++++++++++++
 6 files changed, 702 insertions(+)
 create mode 100644 src/pkg/ebnf/Makefile
 create mode 100644 src/pkg/ebnf/ebnf.go
 create mode 100644 src/pkg/ebnf/ebnf_test.go
 create mode 100644 src/pkg/ebnf/parser.go

diff --git a/src/pkg/Make.deps b/src/pkg/Make.deps
index 9144ad40d9..12f0ef6e5f 100644
--- a/src/pkg/Make.deps
+++ b/src/pkg/Make.deps
@@ -13,6 +13,7 @@ crypto/hmac.install: crypto/md5.install crypto/sha1.install hash.install os.inst
 crypto/md5.install: hash.install os.install
 crypto/sha1.install: hash.install os.install
 datafmt.install: bytes.install container/vector.install fmt.install go/scanner.install go/token.install io.install os.install reflect.install runtime.install strconv.install strings.install
+ebnf.install: container/vector.install fmt.install go/scanner.install go/token.install os.install strconv.install strings.install unicode.install utf8.install
 exec.install: os.install strings.install
 exvar.install: bytes.install fmt.install http.install io.install log.install strconv.install sync.install
 flag.install: fmt.install os.install strconv.install
diff --git a/src/pkg/Makefile b/src/pkg/Makefile
index 3c16395f51..c007511e63 100644
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@@ -27,6 +27,7 @@ DIRS=\
 	crypto/md5\
 	crypto/sha1\
 	datafmt\
+	ebnf\
 	exec\
 	exvar\
 	flag\
@@ -84,6 +85,7 @@ TEST=\
 	crypto/md5\
 	crypto/sha1\
 	datafmt\
+	ebnf\
 	exec\
 	exvar\
 	flag\
diff --git a/src/pkg/ebnf/Makefile b/src/pkg/ebnf/Makefile
new file mode 100644
index 0000000000..06c0fc8322
--- /dev/null
+++ b/src/pkg/ebnf/Makefile
@@ -0,0 +1,69 @@
+# Copyright 2009 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+
+# DO NOT EDIT.  Automatically generated by gobuild.
+# gobuild -m ebnf.go parser.go >Makefile
+
+D=
+
+include $(GOROOT)/src/Make.$(GOARCH)
+AR=gopack
+
+default: packages
+
+clean:
+	rm -rf *.[$(OS)] *.a [$(OS)].out _obj
+
+test: packages
+	gotest
+
+coverage: packages
+	gotest
+	6cov -g $$(pwd) | grep -v '_test\.go:'
+
+%.$O: %.go
+	$(GC) -I_obj $*.go
+
+%.$O: %.c
+	$(CC) $*.c
+
+%.$O: %.s
+	$(AS) $*.s
+
+O1=\
+	ebnf.$O\
+
+O2=\
+	parser.$O\
+
+
+phases: a1 a2
+_obj$D/ebnf.a: phases
+
+a1: $(O1)
+	$(AR) grc _obj$D/ebnf.a ebnf.$O
+	rm -f $(O1)
+
+a2: $(O2)
+	$(AR) grc _obj$D/ebnf.a parser.$O
+	rm -f $(O2)
+
+
+newpkg: clean
+	mkdir -p _obj$D
+	$(AR) grc _obj$D/ebnf.a
+
+$(O1): newpkg
+$(O2): a1
+$(O3): a2
+
+nuke: clean
+	rm -f $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a
+
+packages: _obj$D/ebnf.a
+
+install: packages
+	test -d $(GOROOT)/pkg && mkdir -p $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D
+	cp _obj$D/ebnf.a $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a
diff --git a/src/pkg/ebnf/ebnf.go b/src/pkg/ebnf/ebnf.go
new file mode 100644
index 0000000000..c54f0f8dae
--- /dev/null
+++ b/src/pkg/ebnf/ebnf.go
@@ -0,0 +1,315 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A library for EBNF grammars. The input is text ([]byte) satisfying
+// the following grammar (represented itself in EBNF):
+//
+//	Production  = name "=" [ Expression ] "." .
+//	Expression  = Alternative { "|" Alternative } .
+//	Alternative = Term { Term } .
+//	Term        = name | token [ "..." token ] | Group | Option | Repetition .
+//	Group       = "(" Expression ")" .
+//	Option      = "[" Expression "]" .
+//	Repetition  = "{" Expression "}" .
+//
+// A name is a Go identifier, a token is a Go string, and comments
+// and white space follow the same rules as for the Go language.
+// Production names starting with an uppercase Unicode letter denote
+// non-terminal productions (i.e., productions which allow white-space
+// and comments between tokens); all other production names denote
+// lexical productions.
+//
+package ebnf
+
+import (
+	"container/vector";
+	"fmt";
+	"go/scanner";
+	"go/token";
+	"os";
+	"strconv";
+	"strings";
+	"unicode";
+	"utf8";
+)
+
+
+// ----------------------------------------------------------------------------
+// Internal representation
+
+type (
+	// An Expression node represents a production expression.
+	Expression interface {
+		// Pos is the position of the first character of the syntactic construct.
+		Pos() token.Position;
+	};
+
+	// An Alternative node represents a non-empty list of alternative expressions.
+	Alternative []Expression;  // x | y | z
+
+	// A Sequence node represents a non-empty list of sequential expressions.
+	Sequence []Expression;  // x y z
+
+	// A Name node represents a production name.
+	Name struct {
+		token.Position;
+		String string;  // the identifier text as found in the source
+	};
+
+	// A Token node represents a literal.
+	Token struct {
+		token.Position;
+		String string;  // the unquoted string value
+	};
+
+	// A Range node represents a range of characters.
+	Range struct {
+		Begin, End *Token;  // begin ... end
+	};
+
+	// A Group node represents a grouped expression.
+	Group struct {
+		token.Position;
+		Body Expression;  // (body)
+	};
+
+	// An Option node represents an optional expression.
+	Option struct {
+		token.Position;
+		Body Expression;  // [body]
+	};
+
+	// A Repetition node represents a repeated expression.
+	Repetition struct {
+		token.Position;
+		Body Expression;  // {body}
+	};
+
+	// A Production node represents an EBNF production.
+	Production struct {
+		Name *Name;
+		Expr Expression;  // nil for an empty production
+	};
+
+	// A Grammar is a set of EBNF productions. The map
+	// is indexed by production name.
+	//
+	Grammar map [string] *Production;
+)
+
+
+func (alt Alternative) Pos() token.Position {
+	return alt[0].Pos();  // never empty: the parser only builds an Alternative from 2+ expressions
+}
+
+
+func (seq Sequence) Pos() token.Position {
+	return seq[0].Pos();  // never empty: the parser only builds a Sequence from 2+ expressions
+}
+
+
+func (r Range) Pos() token.Position {
+	return r.Begin.Pos();  // a range starts where its first token starts
+}
+
+
+func (prod *Production) Pos() token.Position {
+	return prod.Name.Pos();  // a production starts at its name
+}
+
+
+// ----------------------------------------------------------------------------
+// Error handling
+
+// TODO(gri) This is the same code as in datafmt and go/parser.
+//           Should factor this out as part of some parsing framework
+//           that could also deal with reading various input sources.
+
+// Error describes an individual error. The position Pos, if valid,
+// indicates the grammar source position the error relates to. The
+// error is specified with the Msg string.
+//
+type Error struct {
+	Pos token.Position;
+	Msg string;
+}
+
+
+// String returns the error message. If the error contains (line, column)
+// position information, the message is prefixed with "line:column: ";
+// otherwise it is prefixed with a single blank " ".
+//
+func (e *Error) String() string {
+	if !e.Pos.IsValid() {
+		return " " + e.Msg;
+	}
+	prefix := fmt.Sprintf("%d:%d: ", e.Pos.Line, e.Pos.Column);
+	return prefix + e.Msg;
+}
+
+
+// An ErrorList is a list of errors, as collected by the parser or the verifier.
+type ErrorList []*Error
+
+
+// ErrorList implements SortInterface (ordering by source offset) and the os.Error interface.
+
+func (list ErrorList) Len() int  { return len(list); }
+func (list ErrorList) Swap(i, j int)  { list[j], list[i] = list[i], list[j]; }
+func (list ErrorList) Less(i, j int) bool  { return list[j].Pos.Offset > list[i].Pos.Offset; }
+
+
+func (p ErrorList) String() string {
+	n := len(p);  // only the first error is spelled out; the others are counted
+	if n == 0 {
+		return "unspecified error";
+	} else if n == 1 {
+		return p[0].String();
+	}
+	return fmt.Sprintf("%s (and %d more errors)", p[0].String(), n - 1);
+}
+
+
+// ----------------------------------------------------------------------------
+// Grammar verification
+
+func isLexical(name string) bool {  // reports whether name denotes a lexical production
+	ch, _ := utf8.DecodeRuneInString(name);  // only the first rune matters; its width is not needed
+	return !unicode.IsUpper(ch);  // names not starting with an uppercase letter are lexical
+}
+
+
+type verifier struct {
+	errors vector.Vector;  // *Error values, in the order they were reported
+	worklist vector.Vector;  // *Production values reached but not yet verified
+	reached Grammar;  // set of productions reached from (and including) the root production
+	grammar Grammar;  // the grammar being verified
+}
+
+
+func (v *verifier) error(pos token.Position, msg string) {
+	v.errors.Push(&Error{pos, msg});  // record only; the caller carries on with verification
+}
+
+
+func makeErrorList(v *vector.Vector) os.Error {
+	if v.Len() == 0 {
+		return nil;  // no errors: a plain nil, so callers can compare the result against nil
+	}
+	list := make(ErrorList, v.Len());
+	for i := len(list) - 1; i >= 0; i-- {
+		list[i] = v.At(i).(*Error);
+	}
+	return list;
+}
+
+
+func (v *verifier) push(prod *Production) {
+	key := prod.Name.String;
+	if _, seen := v.reached[key]; !seen {
+		v.reached[key] = prod;  // marking it reached ensures it is queued at most once
+		v.worklist.Push(prod);
+	}
+}
+
+
+func (v *verifier) verifyChar(x *Token) int {
+	text := x.String;
+	if utf8.RuneCountInString(text) == 1 {
+		ch, _ := utf8.DecodeRuneInString(text);
+		return ch;
+	}
+	v.error(x.Pos(), "single char expected, found " + text);
+	return 0;  // result after an error has been reported
+}
+
+
+func (v *verifier) verifyExpr(expr Expression, lexical bool) {  // lexical: expr is part of a lexical production
+	switch x := expr.(type) {
+	case nil:
+		// empty expression: nothing to verify
+	case Alternative:
+		for _, e := range x {
+			v.verifyExpr(e, lexical);
+		}
+	case Sequence:
+		for _, e := range x {
+			v.verifyExpr(e, lexical);
+		}
+	case *Name:
+		// a production with this name must exist;
+		// add it to the worklist if not yet processed
+		if prod, found := v.grammar[x.String]; found {
+			v.push(prod);
+		} else {
+			v.error(x.Pos(), "missing production " + x.String);
+		}
+		// within a lexical production references
+		// to non-lexical productions are invalid
+		if lexical && !isLexical(x.String) {
+			v.error(x.Pos(), "reference to non-lexical production " + x.String);
+		}
+	case *Token:
+		// nothing to do for now
+	case *Range:
+		i := v.verifyChar(x.Begin);  // 0 if Begin is not a single character
+		j := v.verifyChar(x.End);  // 0 if End is not a single character
+		if i >= j {  // NOTE: also true for equal ends, and whenever End was rejected (j == 0)
+			v.error(x.Pos(), "decreasing character range");
+		}
+	case *Group:
+		v.verifyExpr(x.Body, lexical);
+	case *Option:
+		v.verifyExpr(x.Body, lexical);
+	case *Repetition:
+		v.verifyExpr(x.Body, lexical);
+	default:
+		panic("unreachable");  // every node type declared in this file is handled above
+	}
+}
+
+
+func (v *verifier) verify(grammar Grammar, start string) {
+	// initialize verifier (first, so that no error is ever recorded in uninitialized state)
+	v.errors.Init(0);
+	v.worklist.Init(0);
+	v.reached = make(Grammar);
+	v.grammar = grammar;
+
+	// find root production
+	root, found := grammar[start];
+	if !found {
+		var noPos token.Position;
+		v.error(noPos, "no start production " + start);
+		return;
+	}
+
+	// work through the worklist
+	v.push(root);
+	for v.worklist.Len() > 0 {
+		prod := v.worklist.Pop().(*Production);
+		v.verifyExpr(prod.Expr, isLexical(prod.Name.String));
+	}
+
+	// check if all productions were reached
+	if len(v.reached) < len(v.grammar) {
+		for name, prod := range v.grammar {
+			if _, found := v.reached[name]; !found {
+				v.error(prod.Pos(), name + " is unreachable");
+			}
+		}
+	}
+}
+
+
+// Verify checks grammar for consistency, beginning at the production named start:
+//	- every production referenced must be defined
+//	- every production defined must be reachable from start
+//	- lexical productions may refer only to other lexical productions
+//
+func Verify(grammar Grammar, start string) os.Error {
+	var ver verifier;
+	ver.verify(grammar, start);
+	return makeErrorList(&ver.errors);  // nil if no error was recorded
+}
diff --git a/src/pkg/ebnf/ebnf_test.go b/src/pkg/ebnf/ebnf_test.go
new file mode 100644
index 0000000000..ab4ea4c955
--- /dev/null
+++ b/src/pkg/ebnf/ebnf_test.go
@@ -0,0 +1,75 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ebnf
+
+import (
+	"ebnf";
+	"io";
+	"strings";
+	"testing";
+)
+
+
+var grammars = []string {
+	`Program = .
+	`,
+
+	`Program = foo .
+	foo = "foo" .
+	`,
+
+	`Program = "a" | "b" "c" .
+	`,
+
+	`Program = "a" ... "z" .
+	`,
+
+	`Program = Song .
+	 Song = { Note } .
+	 Note = Do | (Re | Mi | Fa | So | La) | Ti .
+	 Do = "c" .
+	 Re = "d" .
+	 Mi = "e" .
+	 Fa = "f" .
+	 So = "g" .
+	 La = "a" .
+	 Ti = ti .
+	 ti = "b" .
+	`,
+}
+
+
+func check(t *testing.T, src []byte) {
+	grammar, parseErr := Parse(src);
+	if parseErr != nil {
+		t.Errorf("Parse(%s) failed: %v", src, parseErr);
+	}
+	if verifyErr := Verify(grammar, "Program"); verifyErr != nil {
+		t.Errorf("Verify(%s) failed: %v", src, verifyErr);
+	}
+}
+
+
+func TestGrammars(t *testing.T) {
+	for i := 0; i < len(grammars); i++ {
+		check(t, strings.Bytes(grammars[i]));
+	}
+}
+
+
+var files = []string {
+	// TODO(gri) add some test files
+}
+
+
+func TestFiles(t *testing.T) {
+	for i := 0; i < len(files); i++ {
+		data, readErr := io.ReadFile(files[i]);
+		if readErr != nil {
+			t.Fatal(readErr);
+		}
+		check(t, data);
+	}
+}
diff --git a/src/pkg/ebnf/parser.go b/src/pkg/ebnf/parser.go
new file mode 100644
index 0000000000..84905d5fe5
--- /dev/null
+++ b/src/pkg/ebnf/parser.go
@@ -0,0 +1,240 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ebnf
+
+import (
+	"container/vector";
+	"ebnf";
+	"fmt";
+	"go/scanner";
+	"go/token";
+	"os";
+	"strconv";
+	"strings";
+	"unicode";
+	"utf8";
+)
+
+
+type parser struct {
+	errors vector.Vector;  // *Error values collected by Error
+	scanner scanner.Scanner;  // tokenizes the source; its Init is passed the parser itself
+	pos token.Position;  // token position
+	tok token.Token;  // one token look-ahead
+	lit []byte;  // token literal
+}
+
+
+func (p *parser) next() {
+	p.pos, p.tok, p.lit = p.scanner.Scan();  // advance the one-token look-ahead
+	if p.tok.IsKeyword() {
+		// TODO Should keyword mapping always happen outside scanner?
+		//      Or should there be a flag to scanner to enable keyword mapping?
+		p.tok = token.IDENT;  // Go keywords are treated as identifiers; p.lit is left unchanged
+	}
+}
+
+
+func (p *parser) init(src []byte) {
+	p.errors.Init(0);  // (re)initialize the error list with length 0
+	p.scanner.Init(src, p, 0);  // the parser itself is handed to the scanner (it has an Error method)
+	p.next();  // initializes pos, tok, lit
+}
+
+
+// The parser implements scanner.Error.
+func (p *parser) Error(pos token.Position, msg string) {
+	// Drop an error on the same line as the last recorded one; it is
+	// most likely a spurious result of incorrect parser synchronization.
+	if p.errors.Len() > 0 && p.errors.Last().(*Error).Pos.Line == pos.Line {
+		return;
+	}
+	p.errors.Push(&Error{pos, msg});
+}
+
+
+func (p *parser) errorExpected(pos token.Position, msg string) {
+	text := "expected " + msg;
+	// an error at the current token position can be made more
+	// specific by also naming the token that was found instead
+	if p.pos.Offset == pos.Offset {
+		text += ", found '" + p.tok.String() + "'";
+		if p.tok.IsLiteral() {
+			text += " " + string(p.lit);
+		}
+	}
+	p.Error(pos, text);
+}
+
+
+func (p *parser) expect(tok token.Token) token.Position {
+	start := p.pos;
+	if found := p.tok; found != tok {
+		p.errorExpected(start, "'" + tok.String() + "'");
+	}
+	p.next();  // consume the token even if it is the wrong one, so parsing always advances
+	return start;
+}
+
+
+func (p *parser) parseIdentifier() *Name {
+	// capture position and spelling before expect consumes the token
+	id := &Name{p.pos, string(p.lit)};
+	p.expect(token.IDENT);
+	return id;
+}
+
+
+func (p *parser) parseToken() *Token {
+	pos := p.pos;
+	value := "";
+	if p.tok == token.STRING {
+		// Unquote may fail with an error, but only if the scanner found
+		// an illegal string in the first place. In this case the error
+		// has already been reported, so it is deliberately discarded here
+		// (value is then whatever Unquote returned alongside the error).
+		value, _ = strconv.Unquote(string(p.lit));
+		p.next();
+	} else {
+		p.expect(token.STRING);
+	}
+	return &Token{pos, value};
+}
+
+
+func (p *parser) parseExpression() Expression
+
+func (p *parser) parseTerm() (x Expression) {
+	pos := p.pos;  // start of the term; becomes the position of Group, Option, and Repetition nodes
+
+	switch p.tok {
+	case token.IDENT:
+		x = p.parseIdentifier();
+
+	case token.STRING:
+		tok := p.parseToken();
+		x = tok;
+		if p.tok == token.ELLIPSIS {
+			p.next();
+			x = &Range{tok, p.parseToken()};  // token "..." token
+		}
+
+	case token.LPAREN:
+		p.next();
+		x = &Group{pos, p.parseExpression()};
+		p.expect(token.RPAREN);
+
+	case token.LBRACK:
+		p.next();
+		x = &Option{pos, p.parseExpression()};
+		p.expect(token.RBRACK);
+
+	case token.LBRACE:
+		p.next();
+		x = &Repetition{pos, p.parseExpression()};
+		p.expect(token.RBRACE);
+	}
+
+	return x;  // still nil if the current token cannot start a term; no token was consumed then
+}
+
+
+func (p *parser) parseSequence() Expression {
+	var terms vector.Vector;
+	terms.Init(0);
+
+	for term := p.parseTerm(); term != nil; term = p.parseTerm() {
+		terms.Push(term);
+	}
+
+	// a Sequence node is only needed for two or more terms
+	n := terms.Len();
+	if n == 0 {
+		return nil;
+	} else if n == 1 {
+		return terms.At(0).(Expression);
+	}
+
+	// copy the collected terms into a Sequence
+	seq := make(Sequence, n);
+	for i := 0; i < n; i++ {
+		seq[i] = terms.At(i).(Expression);
+	}
+	return seq;
+}
+
+
+func (p *parser) parseExpression() Expression {
+	var alts vector.Vector;
+	alts.Init(0);
+
+	for {
+		// an empty sequence (nil) is not recorded as an alternative
+		if seq := p.parseSequence(); seq != nil {
+			alts.Push(seq);
+		}
+		if p.tok != token.OR {
+			break;
+		}
+		p.next();
+	}
+
+	// an Alternative node is only needed for two or more alternatives
+	n := alts.Len();
+	if n == 0 {
+		return nil;
+	} else if n == 1 {
+		return alts.At(0).(Expression);
+	}
+
+	// copy the collected alternatives into an Alternative node
+	alt := make(Alternative, n);
+	for i := 0; i < n; i++ {
+		alt[i] = alts.At(i).(Expression);
+	}
+	return alt;
+}
+
+
+func (p *parser) parseProduction() *Production {
+	prod := &Production{p.parseIdentifier(), nil};  // name first; the expression is filled in below
+	p.expect(token.ASSIGN);
+	prod.Expr = p.parseExpression();
+	p.expect(token.PERIOD);
+	return prod;
+}
+
+
+func (p *parser) parse(src []byte) Grammar {
+	// initialize parser; init resets the error list, sets up the
+	// scanner for src, and reads the first token, so that the
+	// set-up code exists in one place only
+	p.init(src);
+
+	grammar := make(Grammar);
+	for p.tok != token.EOF {
+		prod := p.parseProduction();
+		name := prod.Name.String;
+		if _, found := grammar[name]; !found {
+			grammar[name] = prod;
+		} else {
+			p.Error(prod.Pos(), name + " declared already");  // the first declaration is kept
+		}
+	}
+
+	return grammar;
+}
+
+
+// Parse parses a set of EBNF productions from source src and
+// returns them as a Grammar. Errors are reported for incorrect
+// syntax and for productions declared more than once; the
+// grammar parsed so far is returned even then.
+//
+func Parse(src []byte) (Grammar, os.Error) {
+	var ps parser;
+	productions := ps.parse(src);
+	return productions, makeErrorList(&ps.errors);
+}