2009-02-16 21:14:21 -07:00
|
|
|
// Copyright 2009 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Package xml implements a simple XML 1.0 parser that
|
|
|
|
// understands XML name spaces.
|
2009-02-16 21:14:21 -07:00
|
|
|
package xml
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// TODO(rsc):
|
|
|
|
// Test error handling.
|
|
|
|
// Expose parser line number in errors.
|
|
|
|
|
2009-02-16 21:14:21 -07:00
|
|
|
import (
|
2009-10-05 16:00:50 -06:00
|
|
|
"bufio";
|
|
|
|
"bytes";
|
2009-02-16 21:14:21 -07:00
|
|
|
"io";
|
|
|
|
"os";
|
2009-10-05 16:00:50 -06:00
|
|
|
"strconv";
|
|
|
|
"strings";
|
|
|
|
"unicode";
|
|
|
|
"utf8";
|
2009-02-16 21:14:21 -07:00
|
|
|
)
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// SyntaxError is the error value the parser records when it finds
// malformed XML in its input. The underlying string is the bare
// description of the problem.
type SyntaxError string

// String returns the description prefixed with "XML syntax error: ".
func (e SyntaxError) String() string {
	msg := string(e);
	return "XML syntax error: " + msg;
}
|
|
|
|
|
|
|
|
// A Name is an XML name split into its two parts: Local is the
// name proper and Space identifies the name space it belongs to.
// In tokens returned by Parser.Token, Space holds the canonical URL
// of the name space, not the short prefix written in the document
// being parsed.
type Name struct {
	Space string;
	Local string;
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// An Attr represents an attribute in an XML element (Name=Value).
|
2009-02-16 21:14:21 -07:00
|
|
|
type Attr struct {
|
2009-10-05 16:00:50 -06:00
|
|
|
Name Name;
|
|
|
|
Value string;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// A Token is an empty interface used to hold any one of the token
// types: StartElement, EndElement, CharData, Comment, ProcInst,
// or Directive.
type Token interface {
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// A StartElement represents an XML start element.
|
|
|
|
type StartElement struct {
|
|
|
|
Name Name;
|
|
|
|
Attr []Attr;
|
|
|
|
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// An EndElement represents an XML end element.
|
|
|
|
type EndElement struct {
|
|
|
|
Name Name;
|
|
|
|
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// CharData holds XML character data (raw text). XML escape
// sequences in the input have already been replaced by the
// characters they stand for.
type CharData []byte
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// copy returns a newly allocated slice of the same length as b
// holding the same bytes, so the result does not alias b.
func copy(b []byte) []byte {
	dup := make([]byte, len(b));
	for i, c := range b {
		dup[i] = c;
	}
	return dup;
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
func (c CharData) Copy() CharData {
|
|
|
|
return CharData(copy(c));
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// A Comment represents an XML comment of the form <!--comment-->.
|
|
|
|
// The bytes do not include the <!-- and --> comment markers.
|
|
|
|
type Comment []byte
|
|
|
|
|
|
|
|
func (c Comment) Copy() Comment {
|
|
|
|
return Comment(copy(c));
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// A ProcInst represents an XML processing instruction of the form <?target inst?>
|
|
|
|
type ProcInst struct {
|
|
|
|
Target string;
|
|
|
|
Inst []byte;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
func (p ProcInst) Copy() ProcInst {
|
|
|
|
p.Inst = copy(p.Inst);
|
|
|
|
return p;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// A Directive represents an XML directive of the form <!text>.
|
|
|
|
// The bytes do not include the <! and > markers.
|
|
|
|
type Directive []byte
|
|
|
|
|
|
|
|
func (d Directive) Copy() Directive {
|
|
|
|
return Directive(copy(d));
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
type readByter interface {
|
|
|
|
ReadByte() (b byte, err os.Error)
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// A Parser represents an XML parser reading a particular input stream.
|
|
|
|
// The parser assumes that its input is encoded in UTF-8.
|
|
|
|
type Parser struct {
|
|
|
|
r readByter;
|
|
|
|
buf bytes.Buffer;
|
|
|
|
stk *stack;
|
|
|
|
free *stack;
|
|
|
|
needClose bool;
|
|
|
|
toClose Name;
|
|
|
|
nextByte int;
|
|
|
|
ns map[string]string;
|
|
|
|
err os.Error;
|
|
|
|
line int;
|
|
|
|
tmp [32]byte;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// NewParser creates a new XML parser reading from r.
|
|
|
|
func NewParser(r io.Reader) *Parser {
|
|
|
|
p := &Parser{
|
|
|
|
ns: make(map[string]string),
|
|
|
|
nextByte: -1,
|
|
|
|
line: 1,
|
|
|
|
};
|
|
|
|
|
|
|
|
// Get efficient byte at a time reader.
|
|
|
|
// Assume that if reader has its own
|
|
|
|
// ReadByte, it's efficient enough.
|
|
|
|
// Otherwise, use bufio.
|
|
|
|
if rb, ok := r.(readByter); ok {
|
|
|
|
p.r = rb;
|
|
|
|
} else {
|
|
|
|
p.r = bufio.NewReader(r);
|
|
|
|
}
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Token returns the next XML token in the input stream.
|
|
|
|
// At the end of the input stream, Token returns nil, os.EOF.
|
|
|
|
//
|
|
|
|
// Slices of bytes in the returned token data refer to the
|
|
|
|
// parser's internal buffer and remain valid only until the next
|
|
|
|
// call to Token. To acquire a copy of the bytes, call the token's
|
|
|
|
// Copy method.
|
|
|
|
//
|
|
|
|
// Token expands self-closing elements such as <br/>
|
|
|
|
// into separate start and end elements returned by successive calls.
|
|
|
|
//
|
|
|
|
// Token guarantees that the StartElement and EndElement
|
|
|
|
// tokens it returns are properly nested and matched:
|
|
|
|
// if Token encounters an unexpected end element,
|
|
|
|
// it will return an error.
|
|
|
|
//
|
|
|
|
// Token implements XML name spaces as described by
|
|
|
|
// http://www.w3.org/TR/REC-xml-names/. Each of the
|
|
|
|
// Name structures contained in the Token has the Space
|
|
|
|
// set to the URL identifying its name space when known.
|
|
|
|
// If Token encounters an unrecognized name space prefix,
|
|
|
|
// it uses the prefix as the Space rather than report an error.
|
|
|
|
//
|
|
|
|
func (p *Parser) Token() (t Token, err os.Error) {
|
|
|
|
if t, err = p.RawToken(); err != nil {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
switch t1 := t.(type) {
|
|
|
|
case StartElement:
|
|
|
|
// In XML name spaces, the translations listed in the
|
|
|
|
// attributes apply to the element name and
|
|
|
|
// to the other attribute names, so process
|
|
|
|
// the translations first.
|
|
|
|
for _, a := range t1.Attr {
|
|
|
|
if a.Name.Space == "xmlns" {
|
|
|
|
v, ok := p.ns[a.Name.Local];
|
|
|
|
p.pushNs(a.Name.Local, v, ok);
|
|
|
|
p.ns[a.Name.Local] = a.Value;
|
|
|
|
}
|
|
|
|
if a.Name.Space == "" && a.Name.Local == "xmlns" {
|
|
|
|
// Default space for untagged names
|
|
|
|
v, ok := p.ns[""];
|
|
|
|
p.pushNs("", v, ok);
|
|
|
|
p.ns[""] = a.Value;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
p.translate(&t1.Name, true);
|
|
|
|
for i := range t1.Attr {
|
|
|
|
p.translate(&t1.Attr[i].Name, false);
|
|
|
|
}
|
|
|
|
p.pushElement(t1.Name);
|
|
|
|
t = t1;
|
|
|
|
|
|
|
|
case EndElement:
|
|
|
|
p.translate(&t1.Name, true);
|
|
|
|
if !p.popElement(t1.Name) {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
t = t1;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Apply name space translation to name n.
|
|
|
|
// The default name space (for Space=="")
|
|
|
|
// applies only to element names, not to attribute names.
|
|
|
|
func (p *Parser) translate(n *Name, isElementName bool) {
|
|
|
|
switch {
|
|
|
|
case n.Space == "xmlns":
|
|
|
|
return;
|
|
|
|
case n.Space == "" && !isElementName:
|
|
|
|
return;
|
|
|
|
case n.Space == "" && n.Local == "xmlns":
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if v, ok := p.ns[n.Space]; ok {
|
|
|
|
n.Space = v;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parsing state - stack holds old name space translations
|
|
|
|
// and the current set of open elements. The translations to pop when
|
|
|
|
// ending a given tag are *below* it on the stack, which is
|
|
|
|
// more work but forced on us by XML.
|
|
|
|
type stack struct {
|
|
|
|
next *stack;
|
|
|
|
kind int;
|
|
|
|
name Name;
|
|
|
|
ok bool;
|
|
|
|
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
|
|
|
// Kinds of stack record.
const (
	stkStart = iota;	// an open element, pushed by pushElement
	stkNs;			// a saved name space translation, pushed by pushNs
)
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
func (p *Parser) push(kind int) *stack {
|
|
|
|
s := p.free;
|
|
|
|
if s != nil {
|
|
|
|
p.free = s.next;
|
|
|
|
} else {
|
|
|
|
s = new(stack);
|
|
|
|
}
|
|
|
|
s.next = p.stk;
|
|
|
|
s.kind = kind;
|
|
|
|
p.stk = s;
|
|
|
|
return s;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
func (p *Parser) pop() *stack {
|
|
|
|
s := p.stk;
|
|
|
|
if s != nil {
|
|
|
|
p.stk = s.next;
|
|
|
|
s.next = p.free;
|
|
|
|
p.free = s;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Record that we are starting an element with the given name.
|
|
|
|
func (p *Parser) pushElement(name Name) {
|
|
|
|
s := p.push(stkStart);
|
|
|
|
s.name = name;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Record that we are changing the value of ns[local].
|
|
|
|
// The old value is url, ok.
|
|
|
|
func (p *Parser) pushNs(local string, url string, ok bool) {
|
|
|
|
s := p.push(stkNs);
|
|
|
|
s.name.Local = local;
|
|
|
|
s.name.Space = url;
|
|
|
|
s.ok = ok;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Record that we are ending an element with the given name.
|
|
|
|
// The name must match the record at the top of the stack,
|
|
|
|
// which must be a pushElement record.
|
|
|
|
// After popping the element, apply any undo records from
|
|
|
|
// the stack to restore the name translations that existed
|
|
|
|
// before we saw this element.
|
|
|
|
func (p *Parser) popElement(name Name) bool {
|
|
|
|
s := p.pop();
|
|
|
|
switch {
|
|
|
|
case s == nil || s.kind != stkStart:
|
|
|
|
p.err = SyntaxError("unexpected end element </" + name.Local + ">");
|
|
|
|
return false;
|
|
|
|
case s.name.Local != name.Local:
|
|
|
|
p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
|
|
|
|
return false;
|
|
|
|
case s.name.Space != name.Space:
|
|
|
|
p.err = SyntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
|
|
|
|
"closed by </" + name.Local + "> in space " + name.Space);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pop stack until a Start is on the top, undoing the
|
|
|
|
// translations that were associated with the element we just closed.
|
|
|
|
for p.stk != nil && p.stk.kind != stkStart {
|
|
|
|
s := p.pop();
|
|
|
|
p.ns[s.name.Local] = s.name.Space, s.ok;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// RawToken is like Token but does not verify that
|
|
|
|
// start and end elements match and does not translate
|
|
|
|
// name space prefixes to their corresponding URLs.
|
|
|
|
func (p *Parser) RawToken() (Token, os.Error) {
|
|
|
|
if p.err != nil {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if p.needClose {
|
|
|
|
// The last element we read was self-closing and
|
|
|
|
// we returned just the StartElement half.
|
|
|
|
// Return the EndElement half now.
|
|
|
|
p.needClose = false;
|
|
|
|
return EndElement{p.toClose}, nil;
|
|
|
|
}
|
|
|
|
|
|
|
|
b, ok := p.getc();
|
|
|
|
if !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if b != '<' {
|
|
|
|
// Text section.
|
|
|
|
p.ungetc(b);
|
|
|
|
data := p.text(-1, false);
|
|
|
|
if data == nil {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
return CharData(data), nil;
|
|
|
|
}
|
|
|
|
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
switch b {
|
|
|
|
case '/':
|
|
|
|
// </: End element
|
|
|
|
var name Name;
|
|
|
|
if name, ok = p.nsname(); !ok {
|
|
|
|
if p.err == nil {
|
|
|
|
p.err = SyntaxError("expected element name after </");
|
|
|
|
}
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
p.space();
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b != '>' {
|
|
|
|
p.err = SyntaxError("invalid characters between </" + name.Local + " and >");
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
return EndElement{name}, nil;
|
|
|
|
|
|
|
|
case '?':
|
|
|
|
// <?: Processing instruction.
|
|
|
|
// TODO(rsc): Should parse the <?xml declaration to make sure
|
|
|
|
// the version is 1.0 and the encoding is UTF-8.
|
|
|
|
var target string;
|
|
|
|
if target, ok = p.name(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
p.space();
|
|
|
|
p.buf.Reset();
|
|
|
|
var b0 byte;
|
|
|
|
for {
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
if p.err == os.EOF {
|
|
|
|
p.err = SyntaxError("unterminated <? directive");
|
|
|
|
}
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
if b0 == '?' && b == '>' {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
b0 = b;
|
|
|
|
}
|
|
|
|
data := p.buf.Bytes();
|
|
|
|
data = data[0:len(data)-2]; // chop ?>
|
|
|
|
return ProcInst{target, data}, nil;
|
|
|
|
|
|
|
|
case '!':
|
|
|
|
// <!: Maybe comment, maybe CDATA.
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
switch b {
|
|
|
|
case '-': // <!-
|
|
|
|
// Probably <!-- for a comment.
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b != '-' {
|
|
|
|
p.err = SyntaxError("invalid sequence <!- not part of <!--");
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
// Look for terminator.
|
|
|
|
p.buf.Reset();
|
|
|
|
var b0, b1 byte;
|
|
|
|
for {
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
if p.err == os.EOF {
|
|
|
|
p.err = SyntaxError("unterminated <!-- comment");
|
|
|
|
}
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
if b0 == '-' && b1 == '-' && b == '>' {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
b0, b1 = b1, b;
|
|
|
|
}
|
|
|
|
data := p.buf.Bytes();
|
|
|
|
data = data[0:len(data)-3]; // chop -->
|
|
|
|
return Comment(data), nil;
|
|
|
|
|
|
|
|
case '[': // <![
|
|
|
|
// Probably <![CDATA[.
|
|
|
|
for i := 0; i < 7; i++ {
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b != "[CDATA["[i] {
|
|
|
|
p.err = SyntaxError("invalid <![ sequence");
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Have <![CDATA[. Read text until ]]>.
|
|
|
|
data := p.text(-1, true);
|
|
|
|
if data == nil {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
return CharData(data), nil;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
|
|
|
|
// We don't care, but accumulate for caller.
|
|
|
|
p.buf.Reset();
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
for {
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b == '>' {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
}
|
|
|
|
return Directive(p.buf.Bytes()), nil;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Must be an open element like <a href="foo">
|
|
|
|
p.ungetc(b);
|
|
|
|
|
|
|
|
var (
|
|
|
|
name Name;
|
|
|
|
empty bool;
|
|
|
|
attr []Attr;
|
|
|
|
)
|
|
|
|
if name, ok = p.nsname(); !ok {
|
|
|
|
if p.err == nil {
|
|
|
|
p.err = SyntaxError("expected element name after <");
|
|
|
|
}
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
|
|
|
|
attr = make([]Attr, 0, 4);
|
|
|
|
for {
|
|
|
|
p.space();
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b == '/' {
|
|
|
|
empty = true;
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b != '>' {
|
|
|
|
p.err = SyntaxError("expected /> in element");
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if b == '>' {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
p.ungetc(b);
|
|
|
|
|
|
|
|
n := len(attr);
|
|
|
|
if n >= cap(attr) {
|
|
|
|
nattr := make([]Attr, n, 2*cap(attr));
|
|
|
|
for i, a := range attr {
|
|
|
|
nattr[i] = a;
|
|
|
|
}
|
|
|
|
attr = nattr;
|
|
|
|
}
|
|
|
|
attr = attr[0:n+1];
|
|
|
|
a := &attr[n];
|
|
|
|
if a.Name, ok = p.nsname(); !ok {
|
|
|
|
if p.err == nil {
|
|
|
|
p.err = SyntaxError("expected attribute name in element");
|
|
|
|
}
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
p.space();
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b != '=' {
|
|
|
|
p.err = SyntaxError("attribute name without = in element");
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
p.space();
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
if b != '"' && b != '\'' {
|
|
|
|
p.err = SyntaxError("unquoted or missing attribute value in element");
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
data := p.text(int(b), false);
|
|
|
|
if data == nil {
|
|
|
|
return nil, p.err;
|
|
|
|
}
|
|
|
|
a.Value = string(data);
|
|
|
|
}
|
|
|
|
|
|
|
|
if empty {
|
|
|
|
p.needClose = true;
|
|
|
|
p.toClose = name;
|
|
|
|
}
|
|
|
|
return StartElement{name, attr}, nil;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Skip spaces if any
|
|
|
|
func (p *Parser) space() {
|
|
|
|
for {
|
|
|
|
b, ok := p.getc();
|
|
|
|
if !ok {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
switch b {
|
|
|
|
case ' ', '\r', '\n', '\t':
|
|
|
|
default:
|
|
|
|
p.ungetc(b);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Read a single byte.
|
|
|
|
// If there is no byte to read, return ok==false
|
|
|
|
// and leave the error in p.err.
|
|
|
|
// Maintain line number.
|
|
|
|
func (p *Parser) getc() (b byte, ok bool) {
|
|
|
|
if p.err != nil {
|
|
|
|
return 0, false;
|
|
|
|
}
|
|
|
|
if p.nextByte >= 0 {
|
|
|
|
b = byte(p.nextByte);
|
|
|
|
p.nextByte = -1;
|
|
|
|
} else {
|
|
|
|
b, p.err = p.r.ReadByte();
|
|
|
|
if p.err != nil {
|
|
|
|
return 0, false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if b == '\n' {
|
|
|
|
p.line++;
|
|
|
|
}
|
|
|
|
return b, true;
|
2009-02-16 21:14:21 -07:00
|
|
|
}
|
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Unread a single byte.
|
|
|
|
func (p *Parser) ungetc(b byte) {
|
|
|
|
if b == '\n' {
|
|
|
|
p.line--;
|
|
|
|
}
|
|
|
|
p.nextByte = int(b);
|
|
|
|
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// entity maps the names of the character entities the parser
// accepts (written &name; in the input) to the characters they
// stand for.
var entity = map[string]int{
	"amp": '&',
	"apos": '\'',
	"gt": '>',
	"lt": '<',
	"quot": '"',
}
|
2009-02-16 21:14:21 -07:00
|
|
|
|
2009-10-05 16:00:50 -06:00
|
|
|
// Read plain text section (XML calls it character data).
|
|
|
|
// If quote >= 0, we are in a quoted string and need to find the matching quote.
|
|
|
|
// If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
|
|
|
|
// On failure return nil and leave the error in p.err.
|
|
|
|
func (p *Parser) text(quote int, cdata bool) []byte {
|
|
|
|
var b0, b1 byte;
|
|
|
|
var trunc int;
|
|
|
|
p.buf.Reset();
|
|
|
|
Input:
|
|
|
|
for {
|
|
|
|
b, ok := p.getc();
|
|
|
|
if !ok {
|
|
|
|
return nil;
|
|
|
|
}
|
|
|
|
|
|
|
|
// <![CDATA[ section ends with ]]>.
|
|
|
|
// It is an error for ]]> to appear in ordinary text.
|
|
|
|
if b0 == ']' && b1 == ']' && b == '>' {
|
|
|
|
if cdata {
|
|
|
|
trunc = 2;
|
|
|
|
break Input;
|
|
|
|
}
|
|
|
|
p.err = SyntaxError("unescaped ]]> not in CDATA section");
|
|
|
|
return nil;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop reading text if we see a <.
|
|
|
|
if b == '<' && !cdata {
|
|
|
|
if quote >= 0 {
|
|
|
|
p.err = SyntaxError("unescaped < inside quoted string");
|
|
|
|
return nil;
|
|
|
|
}
|
|
|
|
p.ungetc('<');
|
|
|
|
break Input;
|
|
|
|
}
|
|
|
|
if quote >= 0 && b == byte(quote) {
|
|
|
|
break Input;
|
|
|
|
}
|
|
|
|
if b == '&' {
|
|
|
|
// Read escaped character expression up to semicolon.
|
|
|
|
// XML in all its glory allows a document to define and use
|
|
|
|
// its own character names with <!ENTITY ...> directives.
|
|
|
|
// Parsers are required to recognize lt, gt, amp, apos, and quot
|
|
|
|
// even if they have not been declared. That's all we allow.
|
|
|
|
var i int;
|
|
|
|
for i = 0; i < len(p.tmp); i++ {
|
|
|
|
p.tmp[i], p.err = p.r.ReadByte();
|
|
|
|
if p.err != nil {
|
|
|
|
return nil;
|
|
|
|
}
|
|
|
|
if p.tmp[i] == ';' {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
s := string(p.tmp[0:i]);
|
|
|
|
if i >= len(p.tmp) {
|
|
|
|
p.err = SyntaxError("character entity expression &" + s + "... too long");
|
|
|
|
return nil;
|
|
|
|
}
|
|
|
|
rune := -1;
|
|
|
|
if i >= 2 && s[0] == '#' {
|
|
|
|
var n uint64;
|
|
|
|
var err os.Error;
|
|
|
|
if i >= 3 && s[1] == 'x' {
|
|
|
|
n, err = strconv.Btoui64(s[2:len(s)], 16);
|
|
|
|
} else {
|
|
|
|
n, err = strconv.Btoui64(s[1:len(s)], 10);
|
|
|
|
}
|
|
|
|
if err == nil && n <= unicode.MaxRune {
|
|
|
|
rune = int(n);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if r, ok := entity[s]; ok {
|
|
|
|
rune = r;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if rune < 0 {
|
|
|
|
p.err = SyntaxError("invalid character entity &" + s + ";");
|
|
|
|
return nil;
|
|
|
|
}
|
|
|
|
i = utf8.EncodeRune(rune, &p.tmp);
|
|
|
|
p.buf.Write(p.tmp[0:i]);
|
|
|
|
b0, b1 = 0, 0;
|
|
|
|
continue Input;
|
|
|
|
}
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
b0, b1 = b1, b;
|
|
|
|
}
|
|
|
|
data := p.buf.Bytes();
|
|
|
|
data = data[0:len(data)-trunc];
|
|
|
|
|
|
|
|
// Must rewrite \r and \r\n into \n.
|
|
|
|
w := 0;
|
|
|
|
for r := 0; r < len(data); r++ {
|
|
|
|
b := data[r];
|
|
|
|
if b == '\r' {
|
|
|
|
if r+1 < len(data) && data[r+1] == '\n' {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
b = '\n';
|
|
|
|
}
|
|
|
|
data[w] = b;
|
|
|
|
w++;
|
|
|
|
}
|
|
|
|
return data[0:w];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get name space name: name with a : stuck in the middle.
|
|
|
|
// The part before the : is the name space identifier.
|
|
|
|
func (p *Parser) nsname() (name Name, ok bool) {
|
|
|
|
s, ok := p.name();
|
|
|
|
if !ok {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
i := strings.Index(s, ":");
|
|
|
|
if i < 0 {
|
|
|
|
name.Local = s;
|
|
|
|
} else {
|
|
|
|
name.Space = s[0:i];
|
|
|
|
name.Local = s[i+1:len(s)];
|
|
|
|
}
|
|
|
|
return name, true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get name: /first(first|second)*/
|
|
|
|
// Unlike most routines, do not set p.err if the name is
|
|
|
|
// merely malformed. Let the caller provide better context.
|
|
|
|
func (p *Parser) name() (s string, ok bool) {
|
|
|
|
var b byte;
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if b < utf8.RuneSelf && !isFirst(b) {
|
|
|
|
p.ungetc(b);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
p.buf.Reset();
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
for {
|
|
|
|
if b, ok = p.getc(); !ok {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if b < utf8.RuneSelf && !isFirst(b) && !isSecond(b) {
|
|
|
|
p.ungetc(b);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
p.buf.WriteByte(b);
|
|
|
|
}
|
|
|
|
return p.buf.String(), true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We allow any Unicode char >= 0x80, but the XML spec is pickier:
// the exact character sets are listed in the comment at the end of the file.

// isFirst reports whether c is an ASCII byte that may begin a name:
// a letter, an underscore or a colon.
func isFirst(c byte) bool {
	switch {
	case 'A' <= c && c <= 'Z', 'a' <= c && c <= 'z':
		return true;
	case c == '_', c == ':':
		return true;
	}
	return false;
}
|
|
|
|
|
|
|
|
// isSecond reports whether c is an ASCII byte that may appear in
// a name after its first character without being allowed to start
// one: a digit, a period or a hyphen. These are the 0030-0039,
// 002E and 002D entries of the table at the end of the file.
func isSecond(c byte) bool {
	return '0' <= c && c <= '9' ||
		c == '.' ||
		c == '-';
}
|
|
|
|
|
|
|
|
// The precise form of an XML name is /first(first|second)*/, where
|
|
|
|
// first is one of these characters:
|
2009-02-16 21:14:21 -07:00
|
|
|
//
|
|
|
|
// 003A 04D0-04EB 0A59-0A5C 0C35-0C39 0F49-0F69 1E00-1E9B
|
|
|
|
// 0041-005A 04EE-04F5 0A5E 0C60-0C61 10A0-10C5 1EA0-1EF9
|
|
|
|
// 005F 04F8-04F9 0A72-0A74 0C85-0C8C 10D0-10F6 1F00-1F15
|
|
|
|
// 0061-007A 0531-0556 0A85-0A8B 0C8E-0C90 1100 1F18-1F1D
|
|
|
|
// 00C0-00D6 0559 0A8D 0C92-0CA8 1102-1103 1F20-1F45
|
|
|
|
// 00D8-00F6 0561-0586 0A8F-0A91 0CAA-0CB3 1105-1107 1F48-1F4D
|
|
|
|
// 00F8-00FF 05D0-05EA 0A93-0AA8 0CB5-0CB9 1109 1F50-1F57
|
|
|
|
// 0100-0131 05F0-05F2 0AAA-0AB0 0CDE 110B-110C 1F59
|
|
|
|
// 0134-013E 0621-063A 0AB2-0AB3 0CE0-0CE1 110E-1112 1F5B
|
|
|
|
// 0141-0148 0641-064A 0AB5-0AB9 0D05-0D0C 113C 1F5D
|
|
|
|
// 014A-017E 0671-06B7 0ABD 0D0E-0D10 113E 1F5F-1F7D
|
|
|
|
// 0180-01C3 06BA-06BE 0AE0 0D12-0D28 1140 1F80-1FB4
|
|
|
|
// 01CD-01F0 06C0-06CE 0B05-0B0C 0D2A-0D39 114C 1FB6-1FBC
|
|
|
|
// 01F4-01F5 06D0-06D3 0B0F-0B10 0D60-0D61 114E 1FBE
|
|
|
|
// 01FA-0217 06D5 0B13-0B28 0E01-0E2E 1150 1FC2-1FC4
|
|
|
|
// 0250-02A8 06E5-06E6 0B2A-0B30 0E30 1154-1155 1FC6-1FCC
|
|
|
|
// 02BB-02C1 0905-0939 0B32-0B33 0E32-0E33 1159 1FD0-1FD3
|
|
|
|
// 0386 093D 0B36-0B39 0E40-0E45 115F-1161 1FD6-1FDB
|
|
|
|
// 0388-038A 0958-0961 0B3D 0E81-0E82 1163 1FE0-1FEC
|
|
|
|
// 038C 0985-098C 0B5C-0B5D 0E84 1165 1FF2-1FF4
|
|
|
|
// 038E-03A1 098F-0990 0B5F-0B61 0E87-0E88 1167 1FF6-1FFC
|
|
|
|
// 03A3-03CE 0993-09A8 0B85-0B8A 0E8A 1169 2126
|
|
|
|
// 03D0-03D6 09AA-09B0 0B8E-0B90 0E8D 116D-116E 212A-212B
|
|
|
|
// 03DA 09B2 0B92-0B95 0E94-0E97 1172-1173 212E
|
|
|
|
// 03DC 09B6-09B9 0B99-0B9A 0E99-0E9F 1175 2180-2182
|
|
|
|
// 03DE 09DC-09DD 0B9C 0EA1-0EA3 119E 3007
|
|
|
|
// 03E0 09DF-09E1 0B9E-0B9F 0EA5 11A8 3021-3029
|
|
|
|
// 03E2-03F3 09F0-09F1 0BA3-0BA4 0EA7 11AB 3041-3094
|
|
|
|
// 0401-040C 0A05-0A0A 0BA8-0BAA 0EAA-0EAB 11AE-11AF 30A1-30FA
|
|
|
|
// 040E-044F 0A0F-0A10 0BAE-0BB5 0EAD-0EAE 11B7-11B8 3105-312C
|
|
|
|
// 0451-045C 0A13-0A28 0BB7-0BB9 0EB0 11BA 4E00-9FA5
|
|
|
|
// 045E-0481 0A2A-0A30 0C05-0C0C 0EB2-0EB3 11BC-11C2 AC00-D7A3
|
|
|
|
// 0490-04C4 0A32-0A33 0C0E-0C10 0EBD 11EB
|
|
|
|
// 04C7-04C8 0A35-0A36 0C12-0C28 0EC0-0EC4 11F0
|
|
|
|
// 04CB-04CC 0A38-0A39 0C2A-0C33 0F40-0F47 11F9
|
|
|
|
//
|
2009-10-05 16:00:50 -06:00
|
|
|
// and a second is one of these:
|
2009-02-16 21:14:21 -07:00
|
|
|
//
|
|
|
|
// 002D 06DD-06DF 09E6-09EF 0B56-0B57 0D3E-0D43 0F3E
|
|
|
|
// 002E 06E0-06E4 0A02 0B66-0B6F 0D46-0D48 0F3F
|
|
|
|
// 0030-0039 06E7-06E8 0A3C 0B82-0B83 0D4A-0D4D 0F71-0F84
|
|
|
|
// 00B7 06EA-06ED 0A3E 0BBE-0BC2 0D57 0F86-0F8B
|
|
|
|
// 02D0 06F0-06F9 0A3F 0BC6-0BC8 0D66-0D6F 0F90-0F95
|
|
|
|
// 02D1 0901-0903 0A40-0A42 0BCA-0BCD 0E31 0F97
|
|
|
|
// 0300-0345 093C 0A47-0A48 0BD7 0E34-0E3A 0F99-0FAD
|
|
|
|
// 0360-0361 093E-094C 0A4B-0A4D 0BE7-0BEF 0E46 0FB1-0FB7
|
|
|
|
// 0387 094D 0A66-0A6F 0C01-0C03 0E47-0E4E 0FB9
|
|
|
|
// 0483-0486 0951-0954 0A70-0A71 0C3E-0C44 0E50-0E59 20D0-20DC
|
|
|
|
// 0591-05A1 0962-0963 0A81-0A83 0C46-0C48 0EB1 20E1
|
|
|
|
// 05A3-05B9 0966-096F 0ABC 0C4A-0C4D 0EB4-0EB9 3005
|
|
|
|
// 05BB-05BD 0981-0983 0ABE-0AC5 0C55-0C56 0EBB-0EBC 302A-302F
|
|
|
|
// 05BF 09BC 0AC7-0AC9 0C66-0C6F 0EC6 3031-3035
|
|
|
|
// 05C1-05C2 09BE 0ACB-0ACD 0C82-0C83 0EC8-0ECD 3099
|
|
|
|
// 05C4 09BF 0AE6-0AEF 0CBE-0CC4 0ED0-0ED9 309A
|
|
|
|
// 0640 09C0-09C4 0B01-0B03 0CC6-0CC8 0F18-0F19 309D-309E
|
|
|
|
// 064B-0652 09C7-09C8 0B3C 0CCA-0CCD 0F20-0F29 30FC-30FE
|
|
|
|
// 0660-0669 09CB-09CD 0B3E-0B43 0CD5-0CD6 0F35
|
|
|
|
// 0670 09D7 0B47-0B48 0CE6-0CEF 0F37
|
|
|
|
// 06D6-06DC 09E2-09E3 0B4B-0B4D 0D02-0D03 0F39
|
|
|
|
|