1
0
mirror of https://github.com/golang/go synced 2024-10-05 00:11:21 -06:00
go/src/pkg/xml/xml.go

825 lines
21 KiB
Go
Raw Normal View History

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package xml implements a simple XML 1.0 parser that
// understands XML name spaces.
package xml
// TODO(rsc):
// Test error handling.
// Expose parser line number in errors.
import (
"bufio";
"bytes";
"io";
"os";
"strconv";
"strings";
"unicode";
"utf8";
)
// A SyntaxError describes a violation of XML syntax found in the
// input stream. The string value is the bare description of the problem.
type SyntaxError string

// String returns the description prefixed with "XML syntax error: ".
func (e SyntaxError) String() string {
	const prefix = "XML syntax error: ";
	return prefix + string(e);
}
// A Name represents an XML name (Local) annotated
// with a name space identifier (Space).
// In tokens returned by Parser.Token, the Space identifier
// is given as a canonical URL, not the short prefix used
// in the document being parsed.
// In tokens returned by Parser.RawToken, Space still holds the
// untranslated prefix, or is empty when the name has no prefix.
type Name struct {
// Space is the name space identifier and Local is the
// part of the name that follows the prefix.
Space, Local string;
}
// An Attr represents an attribute in an XML element (Name=Value).
type Attr struct {
// Name is the attribute name.
Name Name;
// Value is the attribute text without the surrounding quotes,
// with XML escape sequences replaced by the characters they represent.
Value string;
}
// A Token is an interface holding one of the token types:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
// The interface itself is empty, so callers use a type switch
// to find out which kind of token they were given.
type Token interface{}
// A StartElement represents an XML start element.
// A self-closing element such as <br/> is also reported as a
// StartElement; the matching EndElement follows on the next call.
type StartElement struct {
// Name is the element name.
Name Name;
// Attr holds the element's attributes in the order they were written.
Attr []Attr;
}
// An EndElement represents an XML end element.
type EndElement struct {
// Name is the name of the element being closed.
Name Name;
}
// A CharData represents XML character data (raw text),
// in which XML escape sequences have been replaced by
// the characters they represent.
// The contents of a <![CDATA[ section are reported as CharData too.
// The bytes refer to the parser's internal buffer; call Copy
// to keep them beyond the next call to Token.
type CharData []byte
// copy returns a freshly allocated slice with the same length
// and contents as b. The result never shares storage with b.
func copy(b []byte) []byte {
	dup := make([]byte, len(b));
	bytes.Copy(dup, b);
	return dup;
}
// Copy returns a CharData with its own copy of the bytes, so the
// result stays valid after the parser reuses its internal buffer.
func (c CharData) Copy() CharData {
	dup := copy(c);
	return CharData(dup);
}
// A Comment holds the text of an XML comment of the form <!--comment-->.
// The <!-- and --> markers themselves are not part of the bytes.
type Comment []byte

// Copy returns a Comment with its own copy of the bytes, so the
// result stays valid after the parser reuses its internal buffer.
func (c Comment) Copy() Comment {
	dup := copy(c);
	return Comment(dup);
}
// A ProcInst holds an XML processing instruction of the form <?target inst?>.
// Target is the name following <? and Inst is the text between
// the target and the closing ?>.
type ProcInst struct {
	Target string;
	Inst []byte;
}

// Copy returns a ProcInst with the same Target and its own copy of
// Inst, so the result stays valid after the parser reuses its buffer.
func (p ProcInst) Copy() ProcInst {
	return ProcInst{Target: p.Target, Inst: copy(p.Inst)};
}
// A Directive holds the text of an XML directive of the form <!text>.
// The <! and > markers themselves are not part of the bytes.
type Directive []byte

// Copy returns a Directive with its own copy of the bytes, so the
// result stays valid after the parser reuses its internal buffer.
func (d Directive) Copy() Directive {
	dup := copy(d);
	return Directive(dup);
}
// A readByter is an input source that can deliver one byte at a time.
// NewParser uses a reader's own ReadByte when it has one and
// otherwise wraps the reader in a bufio.Reader, which provides it.
type readByter interface {
ReadByte() (b byte, err os.Error)
}
// A Parser represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Parser struct {
// r is the byte-at-a-time input source.
r readByter;
// buf accumulates the bytes of the name or token data being read;
// it is reset and reused for each one.
buf bytes.Buffer;
// stk is the top of the stack of open elements and saved
// name space translations; free is a list of stack records
// available for reuse.
stk *stack;
free *stack;
// needClose reports that the last StartElement returned was
// self-closing, so the next token must be EndElement{toClose}.
needClose bool;
toClose Name;
// nextByte is the byte pushed back by ungetc, or -1 if there is none.
nextByte int;
// ns maps name space prefixes to their current translations;
// the empty prefix holds the default name space.
ns map[string]string;
// err is the first error encountered; once set, every
// subsequent read fails with it.
err os.Error;
// line is the current input line number, starting at 1.
line int;
// tmp is scratch space for reading and encoding character entities.
tmp [32]byte;
}
// NewParser returns a new XML parser that reads its input from r.
//
// The parser consumes input one byte at a time. A reader that
// already has a ReadByte method is assumed to be efficient enough
// and is used directly; any other reader is wrapped in a bufio.Reader.
func NewParser(r io.Reader) *Parser {
	var src readByter;
	if rb, ok := r.(readByter); ok {
		src = rb;
	} else {
		src = bufio.NewReader(r);
	}
	return &Parser{
		r: src,
		ns: make(map[string]string),
		nextByte: -1,	// nothing pushed back yet
		line: 1,
	};
}
// Token returns the next XML token in the input stream.
// At the end of the input stream, Token returns nil, os.EOF.
//
// Byte slices inside the returned token point into the parser's
// internal buffer and are only valid until the next call to Token.
// Call the token's Copy method to obtain bytes that outlive that call.
//
// A self-closing element such as <br/> is expanded into a
// StartElement and an EndElement returned by successive calls.
//
// Token guarantees that the StartElement and EndElement tokens it
// returns are properly nested and matched: an unexpected end
// element makes Token return an error.
//
// Token implements XML name spaces as described by
// http://www.w3.org/TR/REC-xml-names/. The Space of every Name
// in a returned token is the URL identifying its name space when
// that URL is known. A prefix that was never declared is left
// in Space as is rather than reported as an error.
//
func (p *Parser) Token() (t Token, err os.Error) {
	t, err = p.RawToken();
	if err != nil {
		return t, err;
	}

	switch elem := t.(type) {
	case StartElement:
		// Name space declarations among the attributes govern the
		// element's own name and its other attribute names, so
		// record all of them before translating anything.
		for _, a := range elem.Attr {
			switch {
			case a.Name.Space == "xmlns":
				// xmlns:prefix="url" declares a prefix.
				prefix := a.Name.Local;
				old, had := p.ns[prefix];
				p.pushNs(prefix, old, had);
				p.ns[prefix] = a.Value;
			case a.Name.Space == "" && a.Name.Local == "xmlns":
				// xmlns="url" sets the default space for untagged names.
				old, had := p.ns[""];
				p.pushNs("", old, had);
				p.ns[""] = a.Value;
			}
		}

		p.translate(&elem.Name, true);
		for i := range elem.Attr {
			p.translate(&elem.Attr[i].Name, false);
		}
		// The element record goes on top of the name space undo
		// records pushed above; popElement relies on that order.
		p.pushElement(elem.Name);
		return elem, nil;

	case EndElement:
		// Translate before popping: the end tag is still inside
		// the scope of the declarations made by its start tag.
		p.translate(&elem.Name, true);
		if !p.popElement(elem.Name) {
			return nil, p.err;
		}
		return elem, nil;
	}

	// All other token kinds pass through untouched.
	return t, nil;
}
// translate replaces the prefix in n.Space with the URL currently
// bound to it, if there is one. Names that are themselves name
// space declarations (xmlns and xmlns:prefix) are left alone.
// The default name space (the binding for Space == "") applies
// only to element names, so an unprefixed attribute name is
// left alone as well. An unknown prefix is kept unchanged.
func (p *Parser) translate(n *Name, isElementName bool) {
	unprefixed := n.Space == "";
	if n.Space == "xmlns" || unprefixed && (!isElementName || n.Local == "xmlns") {
		return;
	}
	url, known := p.ns[n.Space];
	if known {
		n.Space = url;
	}
}
// Parsing state - stack holds old name space translations
// and the current set of open elements. The translations to pop when
// ending a given tag are *below* it on the stack, which is
// more work but forced on us by XML.
type stack struct {
// next is the record below this one, or the next free record
// while this one sits on the parser's free list.
next *stack;
// kind is stkStart for an open element or stkNs for a saved translation.
kind int;
// name is the element name in a stkStart record. In a stkNs
// record, name.Local is the prefix and name.Space its previous URL.
name Name;
// ok, in a stkNs record, reports whether the prefix had a
// translation before it was changed.
ok bool;
}
// Kinds of stack record.
const (
// stkStart marks an open element.
stkStart = iota;
// stkNs marks a saved name space translation to restore later.
stkNs;
)
// push places a record of the given kind on top of the parse stack
// and returns it so the caller can fill in the remaining fields.
// A record from the free list is reused when one is available;
// its name and ok fields then still hold their previous contents.
func (p *Parser) push(kind int) *stack {
	rec := p.free;
	if rec == nil {
		rec = new(stack);
	} else {
		p.free = rec.next;
	}
	rec.next, rec.kind = p.stk, kind;
	p.stk = rec;
	return rec;
}
// pop removes the top record from the parse stack and returns it,
// or returns nil if the stack is empty. The record is moved to the
// free list, so its fields are only good until the next push.
func (p *Parser) pop() *stack {
	top := p.stk;
	if top == nil {
		return nil;
	}
	p.stk = top.next;
	top.next = p.free;
	p.free = top;
	return top;
}
// pushElement notes on the parse stack that an element
// called name has been opened.
func (p *Parser) pushElement(name Name) {
	p.push(stkStart).name = name;
}
// pushNs saves, on the parse stack, the translation that ns[local]
// had before it is overwritten: url is the old value and ok reports
// whether there was one. popElement uses the record to undo the change.
func (p *Parser) pushNs(local string, url string, ok bool) {
	rec := p.push(stkNs);
	rec.name = Name{Space: url, Local: local};
	rec.ok = ok;
}
// Record that we are ending an element with the given name.
// The name must match the record at the top of the stack,
// which must be a pushElement record.
// After popping the element, apply any undo records from
// the stack to restore the name translations that existed
// before we saw this element.
// On a mismatch popElement stores a SyntaxError in p.err and
// returns false; otherwise it returns true.
func (p *Parser) popElement(name Name) bool {
	s := p.pop();
	switch {
	case s == nil || s.kind != stkStart:
		p.err = SyntaxError("unexpected end element </" + name.Local + ">");
		return false;
	case s.name.Local != name.Local:
		p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
		return false;
	case s.name.Space != name.Space:
		// The leading space in " closed by" keeps the name space
		// URL from running into the next word of the message.
		p.err = SyntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
			" closed by </" + name.Local + "> in space " + name.Space);
		return false;
	}

	// Pop stack until a Start is on the top, undoing the
	// translations that were associated with the element we just closed.
	for p.stk != nil && p.stk.kind != stkStart {
		s := p.pop();
		// Restore the previous translation, or remove the
		// entry altogether when there was none (s.ok == false).
		p.ns[s.name.Local] = s.name.Space, s.ok;
	}

	return true;
}
// RawToken is like Token but does not verify that
// start and end elements match and does not translate
// name space prefixes to their corresponding URLs.
//
// It returns one of StartElement, EndElement, CharData, Comment,
// ProcInst or Directive. Once an error has been returned, every
// later call returns that same error. Byte slices in the returned
// token refer to the parser's internal buffer and are valid only
// until the next call.
func (p *Parser) RawToken() (Token, os.Error) {
	if p.err != nil {
		return nil, p.err;
	}
	if p.needClose {
		// The last element we read was self-closing and
		// we returned just the StartElement half.
		// Return the EndElement half now.
		p.needClose = false;
		return EndElement{p.toClose}, nil;
	}

	b, ok := p.getc();
	if !ok {
		return nil, p.err;
	}

	if b != '<' {
		// Text section.
		p.ungetc(b);
		data := p.text(-1, false);
		if data == nil {
			return nil, p.err;
		}
		return CharData(data), nil;
	}

	if b, ok = p.getc(); !ok {
		return nil, p.err;
	}
	switch b {
	case '/':
		// </: End element
		var name Name;
		if name, ok = p.nsname(); !ok {
			if p.err == nil {
				p.err = SyntaxError("expected element name after </");
			}
			return nil, p.err;
		}
		p.space();
		if b, ok = p.getc(); !ok {
			return nil, p.err;
		}
		if b != '>' {
			p.err = SyntaxError("invalid characters between </" + name.Local + " and >");
			return nil, p.err;
		}
		return EndElement{name}, nil;

	case '?':
		// <?: Processing instruction.
		// TODO(rsc): Should parse the <?xml declaration to make sure
		// the version is 1.0 and the encoding is UTF-8.
		var target string;
		// A processing instruction must name its target. p.name can
		// report success without having read a name, so check for
		// an empty result as well.
		if target, ok = p.name(); !ok || target == "" {
			if p.err == nil {
				p.err = SyntaxError("expected target name after <?");
			}
			return nil, p.err;
		}
		p.space();
		p.buf.Reset();
		var b0 byte;
		for {
			if b, ok = p.getc(); !ok {
				if p.err == os.EOF {
					p.err = SyntaxError("unterminated <? directive");
				}
				return nil, p.err;
			}
			p.buf.WriteByte(b);
			if b0 == '?' && b == '>' {
				break;
			}
			b0 = b;
		}
		data := p.buf.Bytes();
		data = data[0:len(data)-2];	// chop ?>
		return ProcInst{target, data}, nil;

	case '!':
		// <!: Maybe comment, maybe CDATA.
		if b, ok = p.getc(); !ok {
			return nil, p.err;
		}
		switch b {
		case '-':	// <!-
			// Probably <!-- for a comment.
			if b, ok = p.getc(); !ok {
				return nil, p.err;
			}
			if b != '-' {
				p.err = SyntaxError("invalid sequence <!- not part of <!--");
				return nil, p.err;
			}
			// Look for terminator.
			p.buf.Reset();
			var b0, b1 byte;
			for {
				if b, ok = p.getc(); !ok {
					if p.err == os.EOF {
						p.err = SyntaxError("unterminated <!-- comment");
					}
					return nil, p.err;
				}
				p.buf.WriteByte(b);
				if b0 == '-' && b1 == '-' && b == '>' {
					break;
				}
				b0, b1 = b1, b;
			}
			data := p.buf.Bytes();
			data = data[0:len(data)-3];	// chop -->
			return Comment(data), nil;

		case '[':	// <![
			// Probably <![CDATA[.
			for i := 0; i < 7; i++ {
				if b, ok = p.getc(); !ok {
					return nil, p.err;
				}
				if b != "[CDATA["[i] {
					p.err = SyntaxError("invalid <![ sequence");
					return nil, p.err;
				}
			}
			// Have <![CDATA[. Read text until ]]>.
			data := p.text(-1, true);
			if data == nil {
				return nil, p.err;
			}
			return CharData(data), nil;
		}

		// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
		// We don't care, but accumulate for caller.
		p.buf.Reset();
		p.buf.WriteByte(b);
		for {
			if b, ok = p.getc(); !ok {
				// Report truncated input the same way the
				// <? and <!-- loops above do.
				if p.err == os.EOF {
					p.err = SyntaxError("unterminated <! directive");
				}
				return nil, p.err;
			}
			if b == '>' {
				break;
			}
			p.buf.WriteByte(b);
		}
		return Directive(p.buf.Bytes()), nil;
	}

	// Must be an open element like <a href="foo">
	p.ungetc(b);

	var (
		name Name;
		empty bool;
		attr []Attr;
	)
	if name, ok = p.nsname(); !ok {
		if p.err == nil {
			p.err = SyntaxError("expected element name after <");
		}
		return nil, p.err;
	}

	attr = make([]Attr, 0, 4);
	for {
		p.space();
		if b, ok = p.getc(); !ok {
			return nil, p.err;
		}
		if b == '/' {
			empty = true;
			if b, ok = p.getc(); !ok {
				return nil, p.err;
			}
			if b != '>' {
				p.err = SyntaxError("expected /> in element");
				return nil, p.err;
			}
			break;
		}
		if b == '>' {
			break;
		}
		p.ungetc(b);

		// Make room for one more attribute, doubling the
		// capacity when the slice is full.
		n := len(attr);
		if n >= cap(attr) {
			nattr := make([]Attr, n, 2*cap(attr));
			for i, a := range attr {
				nattr[i] = a;
			}
			attr = nattr;
		}
		attr = attr[0:n+1];
		a := &attr[n];
		if a.Name, ok = p.nsname(); !ok {
			if p.err == nil {
				p.err = SyntaxError("expected attribute name in element");
			}
			return nil, p.err;
		}
		p.space();
		if b, ok = p.getc(); !ok {
			return nil, p.err;
		}
		if b != '=' {
			p.err = SyntaxError("attribute name without = in element");
			return nil, p.err;
		}
		p.space();
		if b, ok = p.getc(); !ok {
			return nil, p.err;
		}
		if b != '"' && b != '\'' {
			p.err = SyntaxError("unquoted or missing attribute value in element");
			return nil, p.err;
		}
		// The value runs up to the matching quote character.
		data := p.text(int(b), false);
		if data == nil {
			return nil, p.err;
		}
		a.Value = string(data);
	}

	if empty {
		// Remember to hand out the EndElement on the next call.
		p.needClose = true;
		p.toClose = name;
	}
	return StartElement{name, attr}, nil;
}
// space consumes any run of spaces, tabs, carriage returns and
// newlines at the current position. The first byte of any other
// kind is pushed back. It stops quietly if no byte can be read.
func (p *Parser) space() {
	for {
		b, ok := p.getc();
		if !ok {
			return;
		}
		if b != ' ' && b != '\r' && b != '\n' && b != '\t' {
			p.ungetc(b);
			return;
		}
	}
}
// getc returns the next input byte, taking the byte pushed back by
// ungetc first when there is one. If no byte is available it
// returns ok == false and the reason is left in p.err. Once p.err
// is set, getc keeps failing. The line count advances on every
// newline returned.
func (p *Parser) getc() (b byte, ok bool) {
	if p.err != nil {
		return 0, false;
	}
	if p.nextByte < 0 {
		b, p.err = p.r.ReadByte();
		if p.err != nil {
			return 0, false;
		}
	} else {
		b = byte(p.nextByte);
		p.nextByte = -1;
	}
	if b == '\n' {
		p.line++;
	}
	return b, true;
}
// ungetc pushes b back so that the next getc returns it again.
// Only one byte is remembered; a second ungetc before the next
// getc replaces the first. Pushing back a newline takes back
// the line count advance that getc made for it.
func (p *Parser) ungetc(b byte) {
	p.nextByte = int(b);
	if b == '\n' {
		p.line--;
	}
}
// entity maps the predefined XML entity names to the characters
// they stand for. These are the only named entities the parser
// accepts in &name; expressions.
var entity = map[string]int {
"lt": '<',
"gt": '>',
"amp": '&',
"apos": '\'',
"quot": '"',
}
// Read plain text section (XML calls it character data).
// If quote >= 0, we are in a quoted string and need to find the matching quote.
// If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
// On failure return nil and leave the error in p.err.
//
// The result has character entities replaced by the characters
// they represent and \r and \r\n rewritten to \n. It refers to
// p.buf and so is valid only until p.buf is next reset.
//
// Plain text (quote < 0 and cdata == false) may run to the end of
// the input: the text read so far is returned and os.EOF is left
// in p.err for the next read. Input that ends inside a quoted
// string, a CDATA section or a character entity is a syntax error.
func (p *Parser) text(quote int, cdata bool) []byte {
	var b0, b1 byte;
	var trunc int;
	p.buf.Reset();
Input:
	for {
		b, ok := p.getc();
		if !ok {
			if p.err != os.EOF {
				return nil;
			}
			if cdata {
				p.err = SyntaxError("unterminated <![CDATA[ section");
				return nil;
			}
			if quote >= 0 {
				p.err = SyntaxError("unterminated quoted string");
				return nil;
			}
			// Plain text simply ends with the input. Keep what
			// was read rather than silently discarding it.
			break Input;
		}

		// <![CDATA[ section ends with ]]>.
		// It is an error for ]]> to appear in ordinary text.
		if b0 == ']' && b1 == ']' && b == '>' {
			if cdata {
				// The two ] bytes are already in the buffer;
				// the > has not been written.
				trunc = 2;
				break Input;
			}
			p.err = SyntaxError("unescaped ]]> not in CDATA section");
			return nil;
		}

		// Stop reading text if we see a <.
		if b == '<' && !cdata {
			if quote >= 0 {
				p.err = SyntaxError("unescaped < inside quoted string");
				return nil;
			}
			p.ungetc('<');
			break Input;
		}
		if quote >= 0 && b == byte(quote) {
			break Input;
		}
		if b == '&' {
			// Read escaped character expression up to semicolon.
			// XML in all its glory allows a document to define and use
			// its own character names with <!ENTITY ...> directives.
			// Parsers are required to recognize lt, gt, amp, apos, and quot
			// even if they have not been declared. That's all we allow.
			// The bytes are read straight from p.r: getc has just
			// consumed the &, so nothing is waiting in p.nextByte.
			var i int;
			for i = 0; i < len(p.tmp); i++ {
				p.tmp[i], p.err = p.r.ReadByte();
				if p.err != nil {
					if p.err == os.EOF {
						p.err = SyntaxError("unterminated character entity");
					}
					return nil;
				}
				if p.tmp[i] == ';' {
					break;
				}
			}
			s := string(p.tmp[0:i]);
			if i >= len(p.tmp) {
				p.err = SyntaxError("character entity expression &" + s + "... too long");
				return nil;
			}
			code := -1;
			if i >= 2 && s[0] == '#' {
				// Numeric reference: &#ddd; or &#xhhh;
				var n uint64;
				var err os.Error;
				if i >= 3 && s[1] == 'x' {
					n, err = strconv.Btoui64(s[2:len(s)], 16);
				} else {
					n, err = strconv.Btoui64(s[1:len(s)], 10);
				}
				if err == nil && n <= unicode.MaxRune {
					code = int(n);
				}
			} else {
				if r, ok := entity[s]; ok {
					code = r;
				}
			}
			if code < 0 {
				p.err = SyntaxError("invalid character entity &" + s + ";");
				return nil;
			}
			i = utf8.EncodeRune(code, &p.tmp);
			p.buf.Write(p.tmp[0:i]);
			// An escaped character never takes part in a ]]> match.
			b0, b1 = 0, 0;
			continue Input;
		}
		p.buf.WriteByte(b);
		b0, b1 = b1, b;
	}
	data := p.buf.Bytes();
	data = data[0:len(data)-trunc];

	// Must rewrite \r and \r\n into \n.
	// The rewrite is done in place: w never gets ahead of r.
	w := 0;
	for r := 0; r < len(data); r++ {
		b := data[r];
		if b == '\r' {
			if r+1 < len(data) && data[r+1] == '\n' {
				// Drop the \r; the \n that follows is kept.
				continue;
			}
			b = '\n';
		}
		data[w] = b;
		w++;
	}
	return data[0:w];
}
// Get name space name: name with a : stuck in the middle.
// The part before the : is the name space identifier.
// A name without a colon is returned with an empty Space.
//
// nsname returns ok == false if no name could be read. When the
// input simply does not start with a name, p.err is left unset
// so that the caller can report the error with better context.
func (p *Parser) nsname() (name Name, ok bool) {
	s, ok := p.name();
	if !ok {
		return;
	}
	if s == "" {
		// p.name reports success even when the next byte cannot
		// start a name, handing back an empty string. That is
		// not a name: report failure instead of an empty Name.
		return name, false;
	}
	i := strings.Index(s, ":");
	if i < 0 {
		name.Local = s;
	} else {
		name.Space = s[0:i];
		name.Local = s[i+1:len(s)];
	}
	return name, true;
}
// name reads an XML name of the form /first(first|second)*/ from
// the input and returns it. Any byte >= utf8.RuneSelf is accepted
// as part of a name. The byte that ends the name is pushed back.
//
// Unlike most routines, name does not set p.err when the name is
// merely malformed; the caller is expected to supply an error with
// better context. p.err is set only when reading the input fails,
// in which case name returns ok == false.
func (p *Parser) name() (s string, ok bool) {
	first, ok := p.getc();
	if !ok {
		return "", false;
	}
	if first < utf8.RuneSelf && !isFirst(first) {
		p.ungetc(first);
		// NOTE(review): ok is still true from the getc above, so a
		// byte that cannot start a name yields an empty name with
		// ok == true, not ok == false. Callers that rely on ok to
		// detect a malformed name will not see it - confirm intent.
		return "", true;
	}

	p.buf.Reset();
	p.buf.WriteByte(first);
	for {
		c, more := p.getc();
		if !more {
			return "", false;
		}
		nameByte := c >= utf8.RuneSelf || isFirst(c) || isSecond(c);
		if !nameByte {
			p.ungetc(c);
			break;
		}
		p.buf.WriteByte(c);
	}
	return p.buf.String(), true;
}
// isFirst reports whether the ASCII byte c may begin an XML name:
// a letter, an underscore or a colon.
// We allow any Unicode char >= 0x80, but the XML spec is pickier:
// the exact character sets are listed in the comment at the end of the file.
// That allowance is made by the caller; isFirst itself returns
// false for every byte >= 0x80.
func isFirst(c byte) bool {
	if 'A' <= c && c <= 'Z' {
		return true;
	}
	if 'a' <= c && c <= 'z' {
		return true;
	}
	return c == '_' || c == ':';
}
// isSecond reports whether the ASCII byte c may appear in an XML
// name after the first character without being allowed to start
// one: a digit, a period or a hyphen. Digits must be included
// here, otherwise a name such as h1 is cut short at the digit.
func isSecond(c byte) bool {
	return '0' <= c && c <= '9' ||
		c == '.' ||
		c == '-';
}
// The precise form of an XML name is /first(first|second)*/, where
// first is one of these characters:
//
// 003A 04D0-04EB 0A59-0A5C 0C35-0C39 0F49-0F69 1E00-1E9B
// 0041-005A 04EE-04F5 0A5E 0C60-0C61 10A0-10C5 1EA0-1EF9
// 005F 04F8-04F9 0A72-0A74 0C85-0C8C 10D0-10F6 1F00-1F15
// 0061-007A 0531-0556 0A85-0A8B 0C8E-0C90 1100 1F18-1F1D
// 00C0-00D6 0559 0A8D 0C92-0CA8 1102-1103 1F20-1F45
// 00D8-00F6 0561-0586 0A8F-0A91 0CAA-0CB3 1105-1107 1F48-1F4D
// 00F8-00FF 05D0-05EA 0A93-0AA8 0CB5-0CB9 1109 1F50-1F57
// 0100-0131 05F0-05F2 0AAA-0AB0 0CDE 110B-110C 1F59
// 0134-013E 0621-063A 0AB2-0AB3 0CE0-0CE1 110E-1112 1F5B
// 0141-0148 0641-064A 0AB5-0AB9 0D05-0D0C 113C 1F5D
// 014A-017E 0671-06B7 0ABD 0D0E-0D10 113E 1F5F-1F7D
// 0180-01C3 06BA-06BE 0AE0 0D12-0D28 1140 1F80-1FB4
// 01CD-01F0 06C0-06CE 0B05-0B0C 0D2A-0D39 114C 1FB6-1FBC
// 01F4-01F5 06D0-06D3 0B0F-0B10 0D60-0D61 114E 1FBE
// 01FA-0217 06D5 0B13-0B28 0E01-0E2E 1150 1FC2-1FC4
// 0250-02A8 06E5-06E6 0B2A-0B30 0E30 1154-1155 1FC6-1FCC
// 02BB-02C1 0905-0939 0B32-0B33 0E32-0E33 1159 1FD0-1FD3
// 0386 093D 0B36-0B39 0E40-0E45 115F-1161 1FD6-1FDB
// 0388-038A 0958-0961 0B3D 0E81-0E82 1163 1FE0-1FEC
// 038C 0985-098C 0B5C-0B5D 0E84 1165 1FF2-1FF4
// 038E-03A1 098F-0990 0B5F-0B61 0E87-0E88 1167 1FF6-1FFC
// 03A3-03CE 0993-09A8 0B85-0B8A 0E8A 1169 2126
// 03D0-03D6 09AA-09B0 0B8E-0B90 0E8D 116D-116E 212A-212B
// 03DA 09B2 0B92-0B95 0E94-0E97 1172-1173 212E
// 03DC 09B6-09B9 0B99-0B9A 0E99-0E9F 1175 2180-2182
// 03DE 09DC-09DD 0B9C 0EA1-0EA3 119E 3007
// 03E0 09DF-09E1 0B9E-0B9F 0EA5 11A8 3021-3029
// 03E2-03F3 09F0-09F1 0BA3-0BA4 0EA7 11AB 3041-3094
// 0401-040C 0A05-0A0A 0BA8-0BAA 0EAA-0EAB 11AE-11AF 30A1-30FA
// 040E-044F 0A0F-0A10 0BAE-0BB5 0EAD-0EAE 11B7-11B8 3105-312C
// 0451-045C 0A13-0A28 0BB7-0BB9 0EB0 11BA 4E00-9FA5
// 045E-0481 0A2A-0A30 0C05-0C0C 0EB2-0EB3 11BC-11C2 AC00-D7A3
// 0490-04C4 0A32-0A33 0C0E-0C10 0EBD 11EB
// 04C7-04C8 0A35-0A36 0C12-0C28 0EC0-0EC4 11F0
// 04CB-04CC 0A38-0A39 0C2A-0C33 0F40-0F47 11F9
//
// and a second is one of these:
//
// 002D 06DD-06DF 09E6-09EF 0B56-0B57 0D3E-0D43 0F3E
// 002E 06E0-06E4 0A02 0B66-0B6F 0D46-0D48 0F3F
// 0030-0039 06E7-06E8 0A3C 0B82-0B83 0D4A-0D4D 0F71-0F84
// 00B7 06EA-06ED 0A3E 0BBE-0BC2 0D57 0F86-0F8B
// 02D0 06F0-06F9 0A3F 0BC6-0BC8 0D66-0D6F 0F90-0F95
// 02D1 0901-0903 0A40-0A42 0BCA-0BCD 0E31 0F97
// 0300-0345 093C 0A47-0A48 0BD7 0E34-0E3A 0F99-0FAD
// 0360-0361 093E-094C 0A4B-0A4D 0BE7-0BEF 0E46 0FB1-0FB7
// 0387 094D 0A66-0A6F 0C01-0C03 0E47-0E4E 0FB9
// 0483-0486 0951-0954 0A70-0A71 0C3E-0C44 0E50-0E59 20D0-20DC
// 0591-05A1 0962-0963 0A81-0A83 0C46-0C48 0EB1 20E1
// 05A3-05B9 0966-096F 0ABC 0C4A-0C4D 0EB4-0EB9 3005
// 05BB-05BD 0981-0983 0ABE-0AC5 0C55-0C56 0EBB-0EBC 302A-302F
// 05BF 09BC 0AC7-0AC9 0C66-0C6F 0EC6 3031-3035
// 05C1-05C2 09BE 0ACB-0ACD 0C82-0C83 0EC8-0ECD 3099
// 05C4 09BF 0AE6-0AEF 0CBE-0CC4 0ED0-0ED9 309A
// 0640 09C0-09C4 0B01-0B03 0CC6-0CC8 0F18-0F19 309D-309E
// 064B-0652 09C7-09C8 0B3C 0CCA-0CCD 0F20-0F29 30FC-30FE
// 0660-0669 09CB-09CD 0B3E-0B43 0CD5-0CD6 0F35
// 0670 09D7 0B47-0B48 0CE6-0CEF 0F37
// 06D6-06DC 09E2-09E3 0B4B-0B4D 0D02-0D03 0F39