2009-07-13 11:10:56 -06:00
|
|
|
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// A library for EBNF grammars. The input is text ([]byte) satisfying
// the following grammar (represented itself in EBNF):
//
//	Production  = name "=" Expression "." .
//	Expression  = Alternative { "|" Alternative } .
//	Alternative = Term { Term } .
//	Term        = name | token [ "..." token ] | Group | Option | Repetition .
//	Group       = "(" Expression ")" .
//	Option      = "[" Expression "]" .
//	Repetition  = "{" Expression "}" .
//
// A name is a Go identifier, a token is a Go string, and comments
// and white space follow the same rules as for the Go language.
// Production names starting with an uppercase Unicode letter denote
// non-terminal productions (i.e., productions which allow white-space
// and comments between tokens); all other production names denote
// lexical productions.
//
|
|
|
|
package ebnf
|
|
|
|
|
|
|
|
import (
|
|
|
|
"container/vector";
|
|
|
|
"go/scanner";
|
|
|
|
"go/token";
|
|
|
|
"os";
|
|
|
|
"unicode";
|
|
|
|
"utf8";
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Internal representation
|
|
|
|
|
|
|
|
// Expression is the interface satisfied by the nodes of a production
// expression.
type Expression interface {
	// Pos is the position of the first character of the syntactic construct
	Pos() token.Position;
}

// Alternative is a non-empty list of alternative expressions.
type Alternative []Expression	// x | y | z

// Sequence is a non-empty list of sequential expressions.
type Sequence []Expression	// x y z

// Name is a production name; the embedded Position is its source position.
type Name struct {
	token.Position;
	String	string;
}

// Token is a literal; the embedded Position is its source position.
type Token struct {
	token.Position;
	String	string;
}

// Range is a range of characters delimited by two tokens.
type Range struct {
	Begin, End *Token;	// begin ... end
}

// Group is a grouped expression.
type Group struct {
	token.Position;
	Body	Expression;	// (body)
}

// Option is an optional expression.
type Option struct {
	token.Position;
	Body	Expression;	// [body]
}

// Repetition is a repeated expression.
type Repetition struct {
	token.Position;
	Body	Expression;	// {body}
}

// Production is a single EBNF production: a name and the expression
// it stands for.
type Production struct {
	Name	*Name;
	Expr	Expression;
}

// Grammar is a set of EBNF productions. The map
// is indexed by production name.
//
type Grammar map[string]*Production
|
|
|
|
|
|
|
|
|
|
|
|
func (x Alternative) Pos() token.Position {
|
2009-11-09 13:07:39 -07:00
|
|
|
return x[0].Pos() // the parser always generates non-empty Alternative
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func (x Sequence) Pos() token.Position {
|
2009-11-09 13:07:39 -07:00
|
|
|
return x[0].Pos() // the parser always generates non-empty Sequences
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-11-06 15:24:38 -07:00
|
|
|
// Pos returns the position of the token that begins the range.
func (x Range) Pos() token.Position {
	begin := x.Begin;
	return begin.Pos();
}
|
2009-07-13 11:10:56 -06:00
|
|
|
|
|
|
|
|
2009-11-06 15:24:38 -07:00
|
|
|
// Pos returns the position of the production's name.
func (p *Production) Pos() token.Position {
	name := p.Name;
	return name.Pos();
}
|
2009-07-13 11:10:56 -06:00
|
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// Grammar verification
|
|
|
|
|
|
|
|
// isLexical reports whether name denotes a lexical production, i.e.
// whether its first character is anything other than an uppercase
// Unicode letter.
func isLexical(name string) bool {
	first, _ := utf8.DecodeRuneInString(name);
	if unicode.IsUpper(first) {
		return false;
	}
	return true;
}
|
|
|
|
|
|
|
|
|
|
|
|
type verifier struct {
|
2009-07-14 11:45:43 -06:00
|
|
|
scanner.ErrorVector;
|
2009-11-04 18:05:01 -07:00
|
|
|
worklist vector.Vector;
|
|
|
|
reached Grammar; // set of productions reached from (and including) the root production
|
|
|
|
grammar Grammar;
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func (v *verifier) push(prod *Production) {
|
|
|
|
name := prod.Name.String;
|
|
|
|
if _, found := v.reached[name]; !found {
|
|
|
|
v.worklist.Push(prod);
|
|
|
|
v.reached[name] = prod;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func (v *verifier) verifyChar(x *Token) int {
|
|
|
|
s := x.String;
|
|
|
|
if utf8.RuneCountInString(s) != 1 {
|
2009-11-09 22:13:17 -07:00
|
|
|
v.Error(x.Pos(), "single char expected, found "+s);
|
2009-07-13 11:10:56 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
ch, _ := utf8.DecodeRuneInString(s);
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func (v *verifier) verifyExpr(expr Expression, lexical bool) {
|
|
|
|
switch x := expr.(type) {
|
|
|
|
case nil:
|
|
|
|
// empty expression
|
|
|
|
case Alternative:
|
|
|
|
for _, e := range x {
|
2009-11-09 13:07:39 -07:00
|
|
|
v.verifyExpr(e, lexical)
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
case Sequence:
|
|
|
|
for _, e := range x {
|
2009-11-09 13:07:39 -07:00
|
|
|
v.verifyExpr(e, lexical)
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
case *Name:
|
|
|
|
// a production with this name must exist;
|
|
|
|
// add it to the worklist if not yet processed
|
|
|
|
if prod, found := v.grammar[x.String]; found {
|
2009-11-09 13:07:39 -07:00
|
|
|
v.push(prod)
|
2009-07-13 11:10:56 -06:00
|
|
|
} else {
|
2009-11-09 22:13:17 -07:00
|
|
|
v.Error(x.Pos(), "missing production "+x.String)
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
// within a lexical production references
|
|
|
|
// to non-lexical productions are invalid
|
|
|
|
if lexical && !isLexical(x.String) {
|
2009-11-09 22:13:17 -07:00
|
|
|
v.Error(x.Pos(), "reference to non-lexical production "+x.String)
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
case *Token:
|
|
|
|
// nothing to do for now
|
|
|
|
case *Range:
|
|
|
|
i := v.verifyChar(x.Begin);
|
|
|
|
j := v.verifyChar(x.End);
|
|
|
|
if i >= j {
|
2009-11-09 13:07:39 -07:00
|
|
|
v.Error(x.Pos(), "decreasing character range")
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
case *Group:
|
2009-11-09 13:07:39 -07:00
|
|
|
v.verifyExpr(x.Body, lexical)
|
2009-07-13 11:10:56 -06:00
|
|
|
case *Option:
|
2009-11-09 13:07:39 -07:00
|
|
|
v.verifyExpr(x.Body, lexical)
|
2009-07-13 11:10:56 -06:00
|
|
|
case *Repetition:
|
2009-11-09 13:07:39 -07:00
|
|
|
v.verifyExpr(x.Body, lexical)
|
2009-07-13 11:10:56 -06:00
|
|
|
default:
|
2009-11-09 13:07:39 -07:00
|
|
|
panic("unreachable")
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func (v *verifier) verify(grammar Grammar, start string) {
|
|
|
|
// find root production
|
|
|
|
root, found := grammar[start];
|
|
|
|
if !found {
|
|
|
|
var noPos token.Position;
|
2009-11-09 22:13:17 -07:00
|
|
|
v.Error(noPos, "no start production "+start);
|
2009-07-13 11:10:56 -06:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// initialize verifier
|
2009-11-24 14:43:18 -07:00
|
|
|
v.ErrorVector.Reset();
|
|
|
|
v.worklist.Resize(0, 0);
|
2009-07-13 11:10:56 -06:00
|
|
|
v.reached = make(Grammar);
|
|
|
|
v.grammar = grammar;
|
|
|
|
|
|
|
|
// work through the worklist
|
|
|
|
v.push(root);
|
|
|
|
for v.worklist.Len() > 0 {
|
|
|
|
prod := v.worklist.Pop().(*Production);
|
|
|
|
v.verifyExpr(prod.Expr, isLexical(prod.Name.String));
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if all productions were reached
|
|
|
|
if len(v.reached) < len(v.grammar) {
|
|
|
|
for name, prod := range v.grammar {
|
|
|
|
if _, found := v.reached[name]; !found {
|
2009-11-09 22:13:17 -07:00
|
|
|
v.Error(prod.Pos(), name+" is unreachable")
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Verify checks that:
|
|
|
|
// - all productions used are defined
|
|
|
|
// - all productions defined are used when beginning at start
|
|
|
|
// - lexical productions refer only to other lexical productions
|
|
|
|
//
|
|
|
|
func Verify(grammar Grammar, start string) os.Error {
|
|
|
|
var v verifier;
|
|
|
|
v.verify(grammar, start);
|
2009-07-14 11:45:43 -06:00
|
|
|
return v.GetError(scanner.Sorted);
|
2009-07-13 11:10:56 -06:00
|
|
|
}
|