From ef4347f19ec70d18abba176e823e5d0d26694360 Mon Sep 17 00:00:00 2001 From: Robert Griesemer Date: Mon, 13 Jul 2009 10:10:56 -0700 Subject: [PATCH] Basic EBNF package: - parsing of EBNF grammars - basic consistency checks R=rsc DELTA=695 (695 added, 0 deleted, 0 changed) OCL=31479 CL=31516 --- src/pkg/Make.deps | 1 + src/pkg/Makefile | 2 + src/pkg/ebnf/Makefile | 69 +++++++++ src/pkg/ebnf/ebnf.go | 315 ++++++++++++++++++++++++++++++++++++++ src/pkg/ebnf/ebnf_test.go | 75 +++++++++ src/pkg/ebnf/parser.go | 240 +++++++++++++++++++++++++++++ 6 files changed, 702 insertions(+) create mode 100644 src/pkg/ebnf/Makefile create mode 100644 src/pkg/ebnf/ebnf.go create mode 100644 src/pkg/ebnf/ebnf_test.go create mode 100644 src/pkg/ebnf/parser.go diff --git a/src/pkg/Make.deps b/src/pkg/Make.deps index 9144ad40d9..12f0ef6e5f 100644 --- a/src/pkg/Make.deps +++ b/src/pkg/Make.deps @@ -13,6 +13,7 @@ crypto/hmac.install: crypto/md5.install crypto/sha1.install hash.install os.inst crypto/md5.install: hash.install os.install crypto/sha1.install: hash.install os.install datafmt.install: bytes.install container/vector.install fmt.install go/scanner.install go/token.install io.install os.install reflect.install runtime.install strconv.install strings.install +ebnf.install: container/vector.install fmt.install go/scanner.install go/token.install os.install strconv.install strings.install unicode.install utf8.install exec.install: os.install strings.install exvar.install: bytes.install fmt.install http.install io.install log.install strconv.install sync.install flag.install: fmt.install os.install strconv.install diff --git a/src/pkg/Makefile b/src/pkg/Makefile index 3c16395f51..c007511e63 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -27,6 +27,7 @@ DIRS=\ crypto/md5\ crypto/sha1\ datafmt\ + ebnf\ exec\ exvar\ flag\ @@ -84,6 +85,7 @@ TEST=\ crypto/md5\ crypto/sha1\ datafmt\ + ebnf\ exec\ exvar\ flag\ diff --git a/src/pkg/ebnf/Makefile b/src/pkg/ebnf/Makefile new file mode 
100644 index 0000000000..06c0fc8322 --- /dev/null +++ b/src/pkg/ebnf/Makefile @@ -0,0 +1,69 @@ +# Copyright 2009 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + + +# DO NOT EDIT. Automatically generated by gobuild. +# gobuild -m ebnf.go parser.go >Makefile + +D= + +include $(GOROOT)/src/Make.$(GOARCH) +AR=gopack + +default: packages + +clean: + rm -rf *.[$(OS)] *.a [$(OS)].out _obj + +test: packages + gotest + +coverage: packages + gotest + 6cov -g $$(pwd) | grep -v '_test\.go:' + +%.$O: %.go + $(GC) -I_obj $*.go + +%.$O: %.c + $(CC) $*.c + +%.$O: %.s + $(AS) $*.s + +O1=\ + ebnf.$O\ + +O2=\ + parser.$O\ + + +phases: a1 a2 +_obj$D/ebnf.a: phases + +a1: $(O1) + $(AR) grc _obj$D/ebnf.a ebnf.$O + rm -f $(O1) + +a2: $(O2) + $(AR) grc _obj$D/ebnf.a parser.$O + rm -f $(O2) + + +newpkg: clean + mkdir -p _obj$D + $(AR) grc _obj$D/ebnf.a + +$(O1): newpkg +$(O2): a1 +$(O3): a2 + +nuke: clean + rm -f $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a + +packages: _obj$D/ebnf.a + +install: packages + test -d $(GOROOT)/pkg && mkdir -p $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D + cp _obj$D/ebnf.a $(GOROOT)/pkg/$(GOOS)_$(GOARCH)$D/ebnf.a diff --git a/src/pkg/ebnf/ebnf.go b/src/pkg/ebnf/ebnf.go new file mode 100644 index 0000000000..c54f0f8dae --- /dev/null +++ b/src/pkg/ebnf/ebnf.go @@ -0,0 +1,315 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// A library for EBNF grammars. The input is text ([]byte) satisfying +// the following grammar (represented itself in EBNF): +// +// Production = name "=" Expression "." . +// Expression = Alternative { "|" Alternative } . +// Alternative = Term { Term } . +// Term = name | token [ "..." token ] | Group | Option | Repetition . +// Group = "(" Expression ")" . +// Option = "[" Expression "]" . +// Repetition = "{" Expression "}" . 
+// +// A name is a Go identifier, a token is a Go string, and comments +// and white space follow the same rules as for the Go language. +// Production names starting with an uppercase Unicode letter denote +// non-terminal productions (i.e., productions which allow white-space +// and comments between tokens); all other production names denote +// lexical productions. +// +package ebnf + +import ( + "container/vector"; + "fmt"; + "go/scanner"; + "go/token"; + "os"; + "strconv"; + "strings"; + "unicode"; + "utf8"; +) + + +// ---------------------------------------------------------------------------- +// Internal representation + +type ( + // An Expression node represents a production expression. + Expression interface { + // Pos is the position of the first character of the syntactic construct + Pos() token.Position; + }; + + // An Alternative node represents a non-empty list of alternative expressions. + Alternative []Expression; // x | y | z + + // A Sequence node represents a non-empty list of sequential expressions. + Sequence []Expression; // x y z + + // A Name node represents a production name. + Name struct { + token.Position; + String string; + }; + + // A Token node represents a literal. + Token struct { + token.Position; + String string; + }; + + // A Range node represents a range of characters. + Range struct { + Begin, End *Token; // begin ... end + }; + + // A Group node represents a grouped expression. + Group struct { + token.Position; + Body Expression; // (body) + }; + + // An Option node represents an optional expression. + Option struct { + token.Position; + Body Expression; // [body] + }; + + // A Repetition node represents a repeated expression. + Repetition struct { + token.Position; + Body Expression; // {body} + }; + + // A Production node represents an EBNF production. + Production struct { + Name *Name; + Expr Expression; + }; + + // A Grammar is a set of EBNF productions. The map + // is indexed by production name. 
+ // + Grammar map [string] *Production; +) + + +func (x Alternative) Pos() token.Position { + return x[0].Pos(); // the parser always generates non-empty Alternatives +} + + +func (x Sequence) Pos() token.Position { + return x[0].Pos(); // the parser always generates non-empty Sequences +} + + +func (x Range) Pos() token.Position { + return x.Begin.Pos(); +} + + +func (p *Production) Pos() token.Position { + return p.Name.Pos(); +} + + +// ---------------------------------------------------------------------------- +// Error handling + +// TODO(gri) This is the same code as in datafmt and go/parser. +// Should factor this out as part of some parsing framework +// that could also deal with reading various input sources. + +// Error describes an individual error. The position Pos, if valid, +// indicates the grammar source position the error relates to. The +// error is specified with the Msg string. +// +type Error struct { + Pos token.Position; + Msg string; +} + + +// String returns the error message. If the error contains (line, column) +// position information, it starts with "line:column: ", otherwise it +// starts with a blank " ". +// +func (e *Error) String() string { + pos := " "; + if e.Pos.IsValid() { + pos = fmt.Sprintf("%d:%d: ", e.Pos.Line, e.Pos.Column); + } + return pos + e.Msg; +} + + +// An ErrorList is a list of errors encountered during parsing or verification. +type ErrorList []*Error + + +// ErrorList implements SortInterface and the os.Error interface. 
+ +func (p ErrorList) Len() int { return len(p); } +func (p ErrorList) Swap(i, j int) { p[i], p[j] = p[j], p[i]; } +func (p ErrorList) Less(i, j int) bool { return p[i].Pos.Offset < p[j].Pos.Offset; } + + +func (p ErrorList) String() string { + switch len(p) { + case 0: + return "unspecified error"; + case 1: + return p[0].String(); + } + return fmt.Sprintf("%s (and %d more errors)", p[0].String(), len(p) - 1); +} + + +// ---------------------------------------------------------------------------- +// Grammar verification + +func isLexical(name string) bool { // reports whether name does not start with an uppercase letter + ch, _ := utf8.DecodeRuneInString(name); // only the first rune matters; its byte width is not needed + return !unicode.IsUpper(ch); +} + + +type verifier struct { + errors vector.Vector; + worklist vector.Vector; + reached Grammar; // set of productions reached from (and including) the root production + grammar Grammar; +} + + +func (v *verifier) error(pos token.Position, msg string) { + v.errors.Push(&Error{pos, msg}); +} + + +func makeErrorList(v *vector.Vector) os.Error { + if v.Len() > 0 { + errors := make(ErrorList, v.Len()); + for i := 0; i < v.Len(); i++ { + errors[i] = v.At(i).(*Error); + } + return errors; + } + return nil; +} + + +func (v *verifier) push(prod *Production) { + name := prod.Name.String; + if _, found := v.reached[name]; !found { + v.worklist.Push(prod); + v.reached[name] = prod; + } +} + + +func (v *verifier) verifyChar(x *Token) int { + s := x.String; + if utf8.RuneCountInString(s) != 1 { + v.error(x.Pos(), "single char expected, found " + s); + return 0; // 0 stands in for the invalid character + } + ch, _ := utf8.DecodeRuneInString(s); + return ch; +} + + +func (v *verifier) verifyExpr(expr Expression, lexical bool) { + switch x := expr.(type) { + case nil: + // empty expression + case Alternative: + for _, e := range x { + v.verifyExpr(e, lexical); + } + case Sequence: + for _, e := range x { + v.verifyExpr(e, lexical); + } + case *Name: + // a production with this name must exist; + // add it to the worklist if not yet processed + if prod, found := v.grammar[x.String]; found { 
+ v.push(prod); + } else { + v.error(x.Pos(), "missing production " + x.String); + } + // within a lexical production references + // to non-lexical productions are invalid + if lexical && !isLexical(x.String) { + v.error(x.Pos(), "reference to non-lexical production " + x.String); + } + case *Token: + // nothing to do for now + case *Range: + i := v.verifyChar(x.Begin); + j := v.verifyChar(x.End); + if i >= j { // also rejects a range whose two ends are the same character + v.error(x.Pos(), "decreasing character range"); + } + case *Group: + v.verifyExpr(x.Body, lexical); + case *Option: + v.verifyExpr(x.Body, lexical); + case *Repetition: + v.verifyExpr(x.Body, lexical); + default: + panic("unreachable"); + } +} + + +func (v *verifier) verify(grammar Grammar, start string) { + // initialize verifier (before any error can be recorded) + v.errors.Init(0); + v.worklist.Init(0); + v.reached = make(Grammar); + v.grammar = grammar; + + // find root production + root, found := grammar[start]; + if !found { + var noPos token.Position; + v.error(noPos, "no start production " + start); + return; + } + + // work through the worklist + v.push(root); + for v.worklist.Len() > 0 { + prod := v.worklist.Pop().(*Production); + v.verifyExpr(prod.Expr, isLexical(prod.Name.String)); + } + + // check if all productions were reached + if len(v.reached) < len(v.grammar) { + for name, prod := range v.grammar { + if _, found := v.reached[name]; !found { + v.error(prod.Pos(), name + " is unreachable"); + } + } + } +} + + +// Verify checks that: +// - all productions used are defined +// - all productions defined are used when beginning at start +// - lexical productions refer only to other lexical productions +// +func Verify(grammar Grammar, start string) os.Error { + var v verifier; + v.verify(grammar, start); + return makeErrorList(&v.errors); +} diff --git a/src/pkg/ebnf/ebnf_test.go b/src/pkg/ebnf/ebnf_test.go new file mode 100644 index 0000000000..ab4ea4c955 --- /dev/null +++ b/src/pkg/ebnf/ebnf_test.go @@ -0,0 +1,75 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ebnf + +import ( + "ebnf"; + "io"; + "strings"; + "testing"; +) + + +var grammars = []string { + `Program = . + `, + + `Program = foo . + foo = "foo" . + `, + + `Program = "a" | "b" "c" . + `, + + `Program = "a" ... "z" . + `, + + `Program = Song . + Song = { Note } . + Note = Do | (Re | Mi | Fa | So | La) | Ti . + Do = "c" . + Re = "d" . + Mi = "e" . + Fa = "f" . + So = "g" . + La = "a" . + Ti = ti . + ti = "b" . + `, +} + + +func check(t *testing.T, src []byte) { + grammar, err := Parse(src); + if err != nil { + t.Errorf("Parse(%s) failed: %v", src, err); // not fatal: Verify below still runs on the returned grammar + } + if err = Verify(grammar, "Program"); err != nil { + t.Errorf("Verify(%s) failed: %v", src, err); + } +} + + +func TestGrammars(t *testing.T) { + for _, src := range grammars { + check(t, strings.Bytes(src)); + } +} + + +var files = []string { + // TODO(gri) add some test files +} + + +func TestFiles(t *testing.T) { + for _, filename := range files { // files is empty for now, so this loop does not run + src, err := io.ReadFile(filename); + if err != nil { + t.Fatal(err); + } + check(t, src); + } +} diff --git a/src/pkg/ebnf/parser.go b/src/pkg/ebnf/parser.go new file mode 100644 index 0000000000..84905d5fe5 --- /dev/null +++ b/src/pkg/ebnf/parser.go @@ -0,0 +1,240 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ebnf + +import ( + "container/vector"; + "ebnf"; + "fmt"; + "go/scanner"; + "go/token"; + "os"; + "strconv"; + "strings"; + "unicode"; + "utf8"; +) + + +type parser struct { + errors vector.Vector; + scanner scanner.Scanner; + pos token.Position; // token position + tok token.Token; // one token look-ahead + lit []byte; // token literal +} + + +func (p *parser) next() { + p.pos, p.tok, p.lit = p.scanner.Scan(); + if p.tok.IsKeyword() { + // TODO Should keyword mapping always happen outside scanner? 
+ // Or should there be a flag to scanner to enable keyword mapping? + p.tok = token.IDENT; + } +} + + +func (p *parser) init(src []byte) { // resets the error list, hands src to the scanner, and reads the first token + p.errors.Init(0); + p.scanner.Init(src, p, 0); + p.next(); // initializes pos, tok, lit +} + + +// The parser implements scanner.Error. +func (p *parser) Error(pos token.Position, msg string) { + // Do not collect errors that are on the same line as the previous + // error to reduce the number of spurious errors due to incorrect + // parser synchronization. + if p.errors.Len() == 0 || p.errors.Last().(*Error).Pos.Line != pos.Line { + p.errors.Push(&Error{pos, msg}); + } +} + + +func (p *parser) errorExpected(pos token.Position, msg string) { + msg = "expected " + msg; + if pos.Offset == p.pos.Offset { + // the error happened at the current position; + // make the error message more specific + msg += ", found '" + p.tok.String() + "'"; + if p.tok.IsLiteral() { + msg += " " + string(p.lit); + } + } + p.Error(pos, msg); +} + + +func (p *parser) expect(tok token.Token) token.Position { + pos := p.pos; + if p.tok != tok { + p.errorExpected(pos, "'" + tok.String() + "'"); + } + p.next(); // make progress in any case + return pos; +} + + +func (p *parser) parseIdentifier() *Name { + pos := p.pos; + name := string(p.lit); // taken before expect advances to the next token + p.expect(token.IDENT); + return &Name{pos, name}; +} + + +func (p *parser) parseToken() *Token { + pos := p.pos; + value := ""; + if p.tok == token.STRING { + var err os.Error; + value, err = strconv.Unquote(string(p.lit)); + // Unquote may fail with an error, but only if the scanner found + // an illegal string in the first place. In this case the error + // has already been reported. 
+ p.next(); + } else { + p.expect(token.STRING); + } + return &Token{pos, value}; +} + + +func (p *parser) parseExpression() Expression // forward declaration; the body follows parseSequence + +func (p *parser) parseTerm() (x Expression) { + pos := p.pos; + + switch p.tok { + case token.IDENT: + x = p.parseIdentifier(); + + case token.STRING: + tok := p.parseToken(); + x = tok; + if p.tok == token.ELLIPSIS { + p.next(); + x = &Range{tok, p.parseToken()}; + } + + case token.LPAREN: + p.next(); + x = &Group{pos, p.parseExpression()}; + p.expect(token.RPAREN); + + case token.LBRACK: + p.next(); + x = &Option{pos, p.parseExpression()}; + p.expect(token.RBRACK); + + case token.LBRACE: + p.next(); + x = &Repetition{pos, p.parseExpression()}; + p.expect(token.RBRACE); + } + + return x; // still nil when the current token cannot start a term +} + + +func (p *parser) parseSequence() Expression { + var list vector.Vector; + list.Init(0); + + for x := p.parseTerm(); x != nil; x = p.parseTerm() { + list.Push(x); + } + + // no need for a sequence if list.Len() < 2 + switch list.Len() { + case 0: + return nil; + case 1: + return list.At(0).(Expression); + } + + // convert list into a sequence + seq := make(Sequence, list.Len()); + for i := 0; i < list.Len(); i++ { + seq[i] = list.At(i).(Expression); + } + return seq; +} + + +func (p *parser) parseExpression() Expression { + var list vector.Vector; + list.Init(0); + + for { + x := p.parseSequence(); + if x != nil { // an empty alternative contributes nothing + list.Push(x); + } + if p.tok != token.OR { + break; + } + p.next(); + } + + // no need for an Alternative node if list.Len() < 2 + switch list.Len() { + case 0: + return nil; + case 1: + return list.At(0).(Expression); + } + + // convert list into an Alternative node + alt := make(Alternative, list.Len()); + for i := 0; i < list.Len(); i++ { + alt[i] = list.At(i).(Expression); + } + return alt; +} + + +func (p *parser) parseProduction() *Production { + name := p.parseIdentifier(); + p.expect(token.ASSIGN); + expr := p.parseExpression(); + p.expect(token.PERIOD); + return &Production{name, expr}; +} + + +func (p *parser) 
parse(src []byte) Grammar { + // initialize parser: init resets the error list, + // hands src to the scanner, and reads the first + // token into pos, tok, lit + p.init(src); + + grammar := make(Grammar); + for p.tok != token.EOF { + prod := p.parseProduction(); + name := prod.Name.String; + if _, found := grammar[name]; !found { + grammar[name] = prod; + } else { + p.Error(prod.Pos(), name + " declared already"); // the first declaration is kept + } + } + + return grammar; +} + + +// Parse parses a set of EBNF productions from source src. +// It returns a set of productions. Errors are reported +// for incorrect syntax and if a production is declared +// more than once. +// +func Parse(src []byte) (Grammar, os.Error) { + var p parser; + grammar := p.parse(src); + return grammar, makeErrorList(&p.errors); +}