// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package xml implements a simple XML 1.0 parser that // understands XML name spaces. package xml // TODO(rsc): // Test error handling. // Expose parser line number in errors. import ( "bufio"; "bytes"; "io"; "os"; "strconv"; "strings"; "unicode"; "utf8"; ) // A SyntaxError represents a syntax error in the XML input stream. type SyntaxError string func (e SyntaxError) String() string { return "XML syntax error: " + string(e); } // A Name represents an XML name (Local) annotated // with a name space identifier (Space). // In tokens returned by Parser.Token, the Space identifier // is given as a canonical URL, not the short prefix used // in the document being parsed. type Name struct { Space, Local string; } // An Attr represents an attribute in an XML element (Name=Value). type Attr struct { Name Name; Value string; } // A Token is an interface holding one of the token types: // StartElement, EndElement, CharData, Comment, ProcInst, or Directive. type Token interface{} // A StartElement represents an XML start element. type StartElement struct { Name Name; Attr []Attr; } // An EndElement represents an XML end element. type EndElement struct { Name Name; } // A CharData represents XML character data (raw text), // in which XML escape sequences have been replaced by // the characters they represent. type CharData []byte func copy(b []byte) []byte { b1 := make([]byte, len(b)); bytes.Copy(b1, b); return b1; } func (c CharData) Copy() CharData { return CharData(copy(c)); } // A Comment represents an XML comment of the form . // The bytes do not include the comment markers. type Comment []byte func (c Comment) Copy() Comment { return Comment(copy(c)); } // A ProcInst represents an XML processing instruction of the form type ProcInst struct { Target string; Inst []byte; } func (p ProcInst) Copy() ProcInst { p.Inst = copy(p.Inst); return p; } // A Directive represents an XML directive of the form . // The bytes do not include the markers. type Directive []byte func (d Directive) Copy() Directive { return Directive(copy(d)); } type readByter interface { ReadByte() (b byte, err os.Error) } // A Parser represents an XML parser reading a particular input stream. // The parser assumes that its input is encoded in UTF-8. type Parser struct { r readByter; buf bytes.Buffer; stk *stack; free *stack; needClose bool; toClose Name; nextByte int; ns map[string]string; err os.Error; line int; tmp [32]byte; } // NewParser creates a new XML parser reading from r. func NewParser(r io.Reader) *Parser { p := &Parser{ ns: make(map[string]string), nextByte: -1, line: 1, }; // Get efficient byte at a time reader. // Assume that if reader has its own // ReadByte, it's efficient enough. // Otherwise, use bufio. if rb, ok := r.(readByter); ok { p.r = rb; } else { p.r = bufio.NewReader(r); } return p; } // Token returns the next XML token in the input stream. // At the end of the input stream, Token returns nil, os.EOF. // // Slices of bytes in the returned token data refer to the // parser's internal buffer and remain valid only until the next // call to Token. To acquire a copy of the bytes, call the token's // Copy method. // // Token expands self-closing elements such as
// into separate start and end elements returned by successive calls. // // Token guarantees that the StartElement and EndElement // tokens it returns are properly nested and matched: // if Token encounters an unexpected end element, // it will return an error. // // Token implements XML name spaces as described by // http://www.w3.org/TR/REC-xml-names/. Each of the // Name structures contained in the Token has the Space // set to the URL identifying its name space when known. // If Token encounters an unrecognized name space prefix, // it uses the prefix as the Space rather than report an error. // func (p *Parser) Token() (t Token, err os.Error) { if t, err = p.RawToken(); err != nil { return; } switch t1 := t.(type) { case StartElement: // In XML name spaces, the translations listed in the // attributes apply to the element name and // to the other attribute names, so process // the translations first. for _, a := range t1.Attr { if a.Name.Space == "xmlns" { v, ok := p.ns[a.Name.Local]; p.pushNs(a.Name.Local, v, ok); p.ns[a.Name.Local] = a.Value; } if a.Name.Space == "" && a.Name.Local == "xmlns" { // Default space for untagged names v, ok := p.ns[""]; p.pushNs("", v, ok); p.ns[""] = a.Value; } } p.translate(&t1.Name, true); for i := range t1.Attr { p.translate(&t1.Attr[i].Name, false); } p.pushElement(t1.Name); t = t1; case EndElement: p.translate(&t1.Name, true); if !p.popElement(t1.Name) { return nil, p.err; } t = t1; } return; } // Apply name space translation to name n. // The default name space (for Space=="") // applies only to element names, not to attribute names. func (p *Parser) translate(n *Name, isElementName bool) { switch { case n.Space == "xmlns": return; case n.Space == "" && !isElementName: return; case n.Space == "" && n.Local == "xmlns": return; } if v, ok := p.ns[n.Space]; ok { n.Space = v; } } // Parsing state - stack holds old name space translations // and the current set of open elements. The translations to pop when // ending a given tag are *below* it on the stack, which is // more work but forced on us by XML. type stack struct { next *stack; kind int; name Name; ok bool; } const ( stkStart = iota; stkNs; ) func (p *Parser) push(kind int) *stack { s := p.free; if s != nil { p.free = s.next; } else { s = new(stack); } s.next = p.stk; s.kind = kind; p.stk = s; return s; } func (p *Parser) pop() *stack { s := p.stk; if s != nil { p.stk = s.next; s.next = p.free; p.free = s; } return s; } // Record that we are starting an element with the given name. func (p *Parser) pushElement(name Name) { s := p.push(stkStart); s.name = name; } // Record that we are changing the value of ns[local]. // The old value is url, ok. func (p *Parser) pushNs(local string, url string, ok bool) { s := p.push(stkNs); s.name.Local = local; s.name.Space = url; s.ok = ok; } // Record that we are ending an element with the given name. // The name must match the record at the top of the stack, // which must be a pushElement record. // After popping the element, apply any undo records from // the stack to restore the name translations that existed // before we saw this element. func (p *Parser) popElement(name Name) bool { s := p.pop(); switch { case s == nil || s.kind != stkStart: p.err = SyntaxError("unexpected end element "); return false; case s.name.Local != name.Local: p.err = SyntaxError("element <" + s.name.Local + "> closed by "); return false; case s.name.Space != name.Space: p.err = SyntaxError("element <" + s.name.Local + "> in space " + s.name.Space + "closed by in space " + name.Space); return false; } // Pop stack until a Start is on the top, undoing the // translations that were associated with the element we just closed. for p.stk != nil && p.stk.kind != stkStart { s := p.pop(); p.ns[s.name.Local] = s.name.Space, s.ok; } return true; } // RawToken is like Token but does not verify that // start and end elements match and does not translate // name space prefixes to their corresponding URLs. func (p *Parser) RawToken() (Token, os.Error) { if p.err != nil { return nil, p.err; } if p.needClose { // The last element we read was self-closing and // we returned just the StartElement half. // Return the EndElement half now. p.needClose = false; return EndElement{p.toClose}, nil; } b, ok := p.getc(); if !ok { return nil, p.err; } if b != '<' { // Text section. p.ungetc(b); data := p.text(-1, false); if data == nil { return nil, p.err; } return CharData(data), nil; } if b, ok = p.getc(); !ok { return nil, p.err; } switch b { case '/': // ' { p.err = SyntaxError("invalid characters between "); return nil, p.err; } return EndElement{name}, nil; case '?': // ' { break; } b0 = b; } data := p.buf.Bytes(); data = data[0:len(data)-2]; // chop ?> return ProcInst{target, data}, nil; case '!': // ' { break; } b0, b1 = b1, b; } data := p.buf.Bytes(); data = data[0:len(data)-3]; // chop --> return Comment(data), nil; case '[': // . data := p.text(-1, true); if data == nil { return nil, p.err; } return CharData(data), nil; } // Probably a directive: , , etc. // We don't care, but accumulate for caller. p.buf.Reset(); p.buf.WriteByte(b); for { if b, ok = p.getc(); !ok { return nil, p.err; } if b == '>' { break; } p.buf.WriteByte(b); } return Directive(p.buf.Bytes()), nil; } // Must be an open element like p.ungetc(b); var ( name Name; empty bool; attr []Attr; ) if name, ok = p.nsname(); !ok { if p.err == nil { p.err = SyntaxError("expected element name after <"); } return nil, p.err; } attr = make([]Attr, 0, 4); for { p.space(); if b, ok = p.getc(); !ok { return nil, p.err; } if b == '/' { empty = true; if b, ok = p.getc(); !ok { return nil, p.err; } if b != '>' { p.err = SyntaxError("expected /> in element"); return nil, p.err; } break; } if b == '>' { break; } p.ungetc(b); n := len(attr); if n >= cap(attr) { nattr := make([]Attr, n, 2*cap(attr)); for i, a := range attr { nattr[i] = a; } attr = nattr; } attr = attr[0:n+1]; a := &attr[n]; if a.Name, ok = p.nsname(); !ok { if p.err == nil { p.err = SyntaxError("expected attribute name in element"); } return nil, p.err; } p.space(); if b, ok = p.getc(); !ok { return nil, p.err; } if b != '=' { p.err = SyntaxError("attribute name without = in element"); return nil, p.err; } p.space(); if b, ok = p.getc(); !ok { return nil, p.err; } if b != '"' && b != '\'' { p.err = SyntaxError("unquoted or missing attribute value in element"); return nil, p.err; } data := p.text(int(b), false); if data == nil { return nil, p.err; } a.Value = string(data); } if empty { p.needClose = true; p.toClose = name; } return StartElement{name, attr}, nil; } // Skip spaces if any func (p *Parser) space() { for { b, ok := p.getc(); if !ok { return; } switch b { case ' ', '\r', '\n', '\t': default: p.ungetc(b); return; } } } // Read a single byte. // If there is no byte to read, return ok==false // and leave the error in p.err. // Maintain line number. func (p *Parser) getc() (b byte, ok bool) { if p.err != nil { return 0, false; } if p.nextByte >= 0 { b = byte(p.nextByte); p.nextByte = -1; } else { b, p.err = p.r.ReadByte(); if p.err != nil { return 0, false; } } if b == '\n' { p.line++; } return b, true; } // Unread a single byte. func (p *Parser) ungetc(b byte) { if b == '\n' { p.line--; } p.nextByte = int(b); } var entity = map[string]int { "lt": '<', "gt": '>', "amp": '&', "apos": '\'', "quot": '"', } // Read plain text section (XML calls it character data). // If quote >= 0, we are in a quoted string and need to find the matching quote. // If cdata == true, we are in a . // On failure return nil and leave the error in p.err. func (p *Parser) text(quote int, cdata bool) []byte { var b0, b1 byte; var trunc int; p.buf.Reset(); Input: for { b, ok := p.getc(); if !ok { return nil; } // . // It is an error for ]]> to appear in ordinary text. if b0 == ']' && b1 == ']' && b == '>' { if cdata { trunc = 2; break Input; } p.err = SyntaxError("unescaped ]]> not in CDATA section"); return nil; } // Stop reading text if we see a <. if b == '<' && !cdata { if quote >= 0 { p.err = SyntaxError("unescaped < inside quoted string"); return nil; } p.ungetc('<'); break Input; } if quote >= 0 && b == byte(quote) { break Input; } if b == '&' { // Read escaped character expression up to semicolon. // XML in all its glory allows a document to define and use // its own character names with directives. // Parsers are required to recognize lt, gt, amp, apos, and quot // even if they have not been declared. That's all we allow. var i int; for i = 0; i < len(p.tmp); i++ { p.tmp[i], p.err = p.r.ReadByte(); if p.err != nil { return nil; } if p.tmp[i] == ';' { break; } } s := string(p.tmp[0:i]); if i >= len(p.tmp) { p.err = SyntaxError("character entity expression &" + s + "... too long"); return nil; } rune := -1; if i >= 2 && s[0] == '#' { var n uint64; var err os.Error; if i >= 3 && s[1] == 'x' { n, err = strconv.Btoui64(s[2:len(s)], 16); } else { n, err = strconv.Btoui64(s[1:len(s)], 10); } if err == nil && n <= unicode.MaxRune { rune = int(n); } } else { if r, ok := entity[s]; ok { rune = r; } } if rune < 0 { p.err = SyntaxError("invalid character entity &" + s + ";"); return nil; } i = utf8.EncodeRune(rune, &p.tmp); p.buf.Write(p.tmp[0:i]); b0, b1 = 0, 0; continue Input; } p.buf.WriteByte(b); b0, b1 = b1, b; } data := p.buf.Bytes(); data = data[0:len(data)-trunc]; // Must rewrite \r and \r\n into \n. w := 0; for r := 0; r < len(data); r++ { b := data[r]; if b == '\r' { if r+1 < len(data) && data[r+1] == '\n' { continue; } b = '\n'; } data[w] = b; w++; } return data[0:w]; } // Get name space name: name with a : stuck in the middle. // The part before the : is the name space identifier. func (p *Parser) nsname() (name Name, ok bool) { s, ok := p.name(); if !ok { return; } i := strings.Index(s, ":"); if i < 0 { name.Local = s; } else { name.Space = s[0:i]; name.Local = s[i+1:len(s)]; } return name, true; } // Get name: /first(first|second)*/ // Unlike most routines, do not set p.err if the name is // merely malformed. Let the caller provide better context. func (p *Parser) name() (s string, ok bool) { var b byte; if b, ok = p.getc(); !ok { return; } if b < utf8.RuneSelf && !isFirst(b) { p.ungetc(b); return; } p.buf.Reset(); p.buf.WriteByte(b); for { if b, ok = p.getc(); !ok { return; } if b < utf8.RuneSelf && !isFirst(b) && !isSecond(b) { p.ungetc(b); break; } p.buf.WriteByte(b); } return p.buf.String(), true; } // We allow any Unicode char >= 0x80, but the XML spec is pickier: // the exact character sets are listed in the comment at the end of the file. func isFirst(c byte) bool { return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || c == '_' || c == ':'; } func isSecond(c byte) bool { return c == '.' || c == '-'; } // The precise form of an XML name is /first(first|second)*/, where // first is one of these characters: // // 003A 04D0-04EB 0A59-0A5C 0C35-0C39 0F49-0F69 1E00-1E9B // 0041-005A 04EE-04F5 0A5E 0C60-0C61 10A0-10C5 1EA0-1EF9 // 005F 04F8-04F9 0A72-0A74 0C85-0C8C 10D0-10F6 1F00-1F15 // 0061-007A 0531-0556 0A85-0A8B 0C8E-0C90 1100 1F18-1F1D // 00C0-00D6 0559 0A8D 0C92-0CA8 1102-1103 1F20-1F45 // 00D8-00F6 0561-0586 0A8F-0A91 0CAA-0CB3 1105-1107 1F48-1F4D // 00F8-00FF 05D0-05EA 0A93-0AA8 0CB5-0CB9 1109 1F50-1F57 // 0100-0131 05F0-05F2 0AAA-0AB0 0CDE 110B-110C 1F59 // 0134-013E 0621-063A 0AB2-0AB3 0CE0-0CE1 110E-1112 1F5B // 0141-0148 0641-064A 0AB5-0AB9 0D05-0D0C 113C 1F5D // 014A-017E 0671-06B7 0ABD 0D0E-0D10 113E 1F5F-1F7D // 0180-01C3 06BA-06BE 0AE0 0D12-0D28 1140 1F80-1FB4 // 01CD-01F0 06C0-06CE 0B05-0B0C 0D2A-0D39 114C 1FB6-1FBC // 01F4-01F5 06D0-06D3 0B0F-0B10 0D60-0D61 114E 1FBE // 01FA-0217 06D5 0B13-0B28 0E01-0E2E 1150 1FC2-1FC4 // 0250-02A8 06E5-06E6 0B2A-0B30 0E30 1154-1155 1FC6-1FCC // 02BB-02C1 0905-0939 0B32-0B33 0E32-0E33 1159 1FD0-1FD3 // 0386 093D 0B36-0B39 0E40-0E45 115F-1161 1FD6-1FDB // 0388-038A 0958-0961 0B3D 0E81-0E82 1163 1FE0-1FEC // 038C 0985-098C 0B5C-0B5D 0E84 1165 1FF2-1FF4 // 038E-03A1 098F-0990 0B5F-0B61 0E87-0E88 1167 1FF6-1FFC // 03A3-03CE 0993-09A8 0B85-0B8A 0E8A 1169 2126 // 03D0-03D6 09AA-09B0 0B8E-0B90 0E8D 116D-116E 212A-212B // 03DA 09B2 0B92-0B95 0E94-0E97 1172-1173 212E // 03DC 09B6-09B9 0B99-0B9A 0E99-0E9F 1175 2180-2182 // 03DE 09DC-09DD 0B9C 0EA1-0EA3 119E 3007 // 03E0 09DF-09E1 0B9E-0B9F 0EA5 11A8 3021-3029 // 03E2-03F3 09F0-09F1 0BA3-0BA4 0EA7 11AB 3041-3094 // 0401-040C 0A05-0A0A 0BA8-0BAA 0EAA-0EAB 11AE-11AF 30A1-30FA // 040E-044F 0A0F-0A10 0BAE-0BB5 0EAD-0EAE 11B7-11B8 3105-312C // 0451-045C 0A13-0A28 0BB7-0BB9 0EB0 11BA 4E00-9FA5 // 045E-0481 0A2A-0A30 0C05-0C0C 0EB2-0EB3 11BC-11C2 AC00-D7A3 // 0490-04C4 0A32-0A33 0C0E-0C10 0EBD 11EB // 04C7-04C8 0A35-0A36 0C12-0C28 0EC0-0EC4 11F0 // 04CB-04CC 0A38-0A39 0C2A-0C33 0F40-0F47 11F9 // // and a second is one of these: // // 002D 06DD-06DF 09E6-09EF 0B56-0B57 0D3E-0D43 0F3E // 002E 06E0-06E4 0A02 0B66-0B6F 0D46-0D48 0F3F // 0030-0039 06E7-06E8 0A3C 0B82-0B83 0D4A-0D4D 0F71-0F84 // 00B7 06EA-06ED 0A3E 0BBE-0BC2 0D57 0F86-0F8B // 02D0 06F0-06F9 0A3F 0BC6-0BC8 0D66-0D6F 0F90-0F95 // 02D1 0901-0903 0A40-0A42 0BCA-0BCD 0E31 0F97 // 0300-0345 093C 0A47-0A48 0BD7 0E34-0E3A 0F99-0FAD // 0360-0361 093E-094C 0A4B-0A4D 0BE7-0BEF 0E46 0FB1-0FB7 // 0387 094D 0A66-0A6F 0C01-0C03 0E47-0E4E 0FB9 // 0483-0486 0951-0954 0A70-0A71 0C3E-0C44 0E50-0E59 20D0-20DC // 0591-05A1 0962-0963 0A81-0A83 0C46-0C48 0EB1 20E1 // 05A3-05B9 0966-096F 0ABC 0C4A-0C4D 0EB4-0EB9 3005 // 05BB-05BD 0981-0983 0ABE-0AC5 0C55-0C56 0EBB-0EBC 302A-302F // 05BF 09BC 0AC7-0AC9 0C66-0C6F 0EC6 3031-3035 // 05C1-05C2 09BE 0ACB-0ACD 0C82-0C83 0EC8-0ECD 3099 // 05C4 09BF 0AE6-0AEF 0CBE-0CC4 0ED0-0ED9 309A // 0640 09C0-09C4 0B01-0B03 0CC6-0CC8 0F18-0F19 309D-309E // 064B-0652 09C7-09C8 0B3C 0CCA-0CCD 0F20-0F29 30FC-30FE // 0660-0669 09CB-09CD 0B3E-0B43 0CD5-0CD6 0F35 // 0670 09D7 0B47-0B48 0CE6-0CEF 0F37 // 06D6-06DC 09E2-09E3 0B4B-0B4D 0D02-0D03 0F39