mirror of
https://github.com/golang/go
synced 2024-11-25 01:57:56 -07:00
XML parser
R=r DELTA=546 (545 added, 0 deleted, 1 changed) OCL=35318 CL=35341
This commit is contained in:
parent
a91b6b74e3
commit
0ddfe70c27
@ -30,7 +30,7 @@ expvar.install: bytes.install fmt.install http.install log.install strconv.insta
|
||||
flag.install: fmt.install os.install strconv.install
|
||||
fmt.install: io.install os.install reflect.install strconv.install utf8.install
|
||||
go/ast.install: go/token.install unicode.install utf8.install
|
||||
go/doc.install: container/vector.install go/ast.install go/token.install io.install once.install regexp.install sort.install strings.install template.install
|
||||
go/doc.install: container/vector.install go/ast.install go/token.install io.install regexp.install sort.install strings.install template.install
|
||||
go/parser.install: bytes.install container/vector.install fmt.install go/ast.install go/scanner.install go/token.install io.install os.install path.install strings.install
|
||||
go/printer.install: bytes.install container/vector.install fmt.install go/ast.install go/token.install io.install os.install reflect.install runtime.install strings.install tabwriter.install
|
||||
go/scanner.install: bytes.install container/vector.install fmt.install go/token.install io.install os.install sort.install strconv.install unicode.install utf8.install
|
||||
@ -68,3 +68,4 @@ testing/iotest.install: bytes.install io.install log.install os.install
|
||||
time.install: io.install once.install os.install syscall.install
|
||||
unicode.install:
|
||||
utf8.install: unicode.install
|
||||
xml.install: bufio.install bytes.install io.install os.install reflect.install strconv.install strings.install unicode.install utf8.install
|
||||
|
@ -82,6 +82,7 @@ DIRS=\
|
||||
time\
|
||||
unicode\
|
||||
utf8\
|
||||
xml\
|
||||
|
||||
NOTEST=\
|
||||
debug/proc\
|
||||
|
13
src/pkg/xml/Makefile
Normal file
13
src/pkg/xml/Makefile
Normal file
@ -0,0 +1,13 @@
|
||||
# Copyright 2009 The Go Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
include $(GOROOT)/src/Make.$(GOARCH)
|
||||
|
||||
TARG=xml
|
||||
|
||||
GOFILES=\
|
||||
read.go\
|
||||
xml.go\
|
||||
|
||||
include $(GOROOT)/src/Make.pkg
|
320
src/pkg/xml/read.go
Normal file
320
src/pkg/xml/read.go
Normal file
@ -0,0 +1,320 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package xml
|
||||
|
||||
import (
|
||||
"bytes";
|
||||
"io";
|
||||
"os";
|
||||
"reflect";
|
||||
"strings";
|
||||
)
|
||||
|
||||
// BUG(rsc): Mapping between XML elements and data structures is inherently flawed:
|
||||
// an XML element is an order-dependent collection of anonymous
|
||||
// values, while a data structure is an order-independent collection
|
||||
// of named values.
|
||||
// See package json for a textual representation more suitable
|
||||
// to data structures.
|
||||
|
||||
// Unmarshal parses an XML element from r and uses the
|
||||
// reflect library to fill in an arbitrary struct, slice, or string
|
||||
// pointed at by val. Well-formed data that does not fit
|
||||
// into val is discarded.
|
||||
//
|
||||
// For example, given these definitions:
|
||||
//
|
||||
// type Email struct {
|
||||
// Where string "attr";
|
||||
// Addr string;
|
||||
// }
|
||||
//
|
||||
// type Result struct {
|
||||
// XMLName xml.Name "result";
|
||||
// Name string;
|
||||
// Phone string;
|
||||
// Email []Email;
|
||||
// }
|
||||
//
|
||||
// var result = Result{ "name", "phone", nil }
|
||||
//
|
||||
// unmarshalling the XML input
|
||||
//
|
||||
// <result>
|
||||
// <email where="home">
|
||||
// <addr>gre@example.com</addr>
|
||||
// </email>
|
||||
// <email where='work'>
|
||||
// <addr>gre@work.com</addr>
|
||||
// </email>
|
||||
// <name>Grace R. Emlin</name>
|
||||
// <address>123 Main Street</address>
|
||||
// </result>
|
||||
//
|
||||
// via Unmarshal(r, &result) is equivalent to assigning
|
||||
//
|
||||
// r = Result{
|
||||
// xml.Name{"", "result"},
|
||||
// "Grace R. Emlin", // name
|
||||
// "phone", // no phone given
|
||||
// []Email{
|
||||
// Email{ "home", "gre@example.com" },
|
||||
// Email{ "work", "gre@work.com" }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// Note that the field r.Phone has not been modified and
|
||||
// that the XML <address> element was discarded.
|
||||
//
|
||||
// Because Unmarshal uses the reflect package, it can only
|
||||
// assign to upper case fields. Unmarshal uses a case-insensitive
|
||||
// comparison to match XML element names to struct field names.
|
||||
//
|
||||
// Unmarshal maps an XML element to a struct using the following rules:
|
||||
//
|
||||
// * If the struct has a field named XMLName of type xml.Name,
|
||||
// Unmarshal records the element name in that field.
|
||||
//
|
||||
// * If the XMLName field has an associated tag string of the form
|
||||
// "tag" or "namespace-URL tag", the XML element must have
|
||||
// the given tag (and, optionally, name space) or else Unmarshal
|
||||
// returns an error.
|
||||
//
|
||||
// * If the XML element has an attribute whose name matches a
|
||||
// struct field of type string with tag "attr", Unmarshal records
|
||||
// the attribute value in that field.
|
||||
//
|
||||
// * If the XML element contains character data, that data is
|
||||
// accumulated in the first struct field that has tag "chardata".
|
||||
// The struct field may have type []byte or string.
|
||||
// If there is no such field, the character data is discarded.
|
||||
//
|
||||
// * If the XML element contains a sub-element whose name
|
||||
// matches a struct field whose tag is neither "attr" nor "chardata",
|
||||
// Unmarshal maps the sub-element to that struct field.
|
||||
//
|
||||
// Unmarshal maps an XML element to a string or []byte by saving the
|
||||
// concatenation of that elements character data in the string or []byte.
|
||||
//
|
||||
// Unmarshal maps an XML element to a slice by extending the length
|
||||
// of the slice and mapping the element to the newly created value.
|
||||
//
|
||||
func Unmarshal(r io.Reader, val interface{}) os.Error {
|
||||
v, ok := reflect.NewValue(val).(*reflect.PtrValue);
|
||||
if !ok {
|
||||
return os.NewError("non-pointer passed to Unmarshal");
|
||||
}
|
||||
p := NewParser(r);
|
||||
elem := v.Elem();
|
||||
for {
|
||||
err := p.unmarshal(elem, nil);
|
||||
if err != nil {
|
||||
if err == os.EOF {
|
||||
break;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
}
|
||||
return nil;
|
||||
}
|
||||
|
||||
// An UnmarshalError represents an error in the unmarshalling process.
|
||||
type UnmarshalError string
|
||||
func (e UnmarshalError) String() string {
|
||||
return string(e);
|
||||
}
|
||||
|
||||
// Unmarshal a single XML element into val.
|
||||
func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
|
||||
// Find start element if we need it.
|
||||
if start == nil {
|
||||
for {
|
||||
tok, err := p.Token();
|
||||
if err != nil {
|
||||
return err;
|
||||
}
|
||||
if t, ok := tok.(StartElement); ok {
|
||||
start = &t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
data []byte;
|
||||
saveData reflect.Value;
|
||||
sv *reflect.StructValue;
|
||||
styp *reflect.StructType;
|
||||
)
|
||||
switch v := val.(type) {
|
||||
case *reflect.SliceValue:
|
||||
typ := v.Type().(*reflect.SliceType);
|
||||
if _, ok := typ.Elem().(*reflect.Uint8Type); ok {
|
||||
// []byte
|
||||
saveData = v;
|
||||
break;
|
||||
}
|
||||
|
||||
// Slice of element values.
|
||||
// Grow slice.
|
||||
n := v.Len();
|
||||
if n >= v.Cap() {
|
||||
ncap := 2*n;
|
||||
if ncap < 4 {
|
||||
ncap = 4;
|
||||
}
|
||||
new := reflect.MakeSlice(typ, n, ncap);
|
||||
reflect.ArrayCopy(new, v);
|
||||
v.Set(new);
|
||||
}
|
||||
v.SetLen(n+1);
|
||||
|
||||
// Recur to read element into slice.
|
||||
if err := p.unmarshal(v.Elem(n), start); err != nil {
|
||||
v.SetLen(n);
|
||||
return err;
|
||||
}
|
||||
return nil;
|
||||
|
||||
case *reflect.StringValue:
|
||||
saveData = v;
|
||||
|
||||
case *reflect.StructValue:
|
||||
sv = v;
|
||||
typ := sv.Type().(*reflect.StructType);
|
||||
styp = typ;
|
||||
// Assign name.
|
||||
if f, ok := typ.FieldByName("XMLName"); ok {
|
||||
// Validate element name.
|
||||
if f.Tag != "" {
|
||||
tag := f.Tag;
|
||||
ns := "";
|
||||
i := strings.LastIndex(tag, " ");
|
||||
if i >= 0 {
|
||||
ns, tag = tag[0:i], tag[i+1:len(tag)];
|
||||
}
|
||||
if tag != start.Name.Local {
|
||||
return UnmarshalError("expected element type <" + tag + "> but have <" + start.Name.Local + ">");
|
||||
}
|
||||
if ns != "" && ns != start.Name.Space {
|
||||
e := "expected element <" + tag + "> in name space " + ns + " but have ";
|
||||
if start.Name.Space == "" {
|
||||
e += "no name space";
|
||||
} else {
|
||||
e += start.Name.Space;
|
||||
}
|
||||
return UnmarshalError(e);
|
||||
}
|
||||
}
|
||||
|
||||
// Save
|
||||
v := sv.FieldByIndex(f.Index);
|
||||
if _, ok := v.Interface().(Name); !ok {
|
||||
return UnmarshalError(sv.Type().String() + " field XMLName does not have type xml.Name");
|
||||
}
|
||||
v.(*reflect.StructValue).Set(reflect.NewValue(start.Name).(*reflect.StructValue));
|
||||
}
|
||||
|
||||
// Assign attributes.
|
||||
// Also, do we need to save character data?
|
||||
for i, n := 0, typ.NumField(); i < n; i++ {
|
||||
f := typ.Field(i);
|
||||
switch f.Tag {
|
||||
case "attr":
|
||||
strv, ok := sv.FieldByIndex(f.Index).(*reflect.StringValue);
|
||||
if !ok {
|
||||
return UnmarshalError(sv.Type().String() + " field " + f.Name + " has attr tag but is not type string");
|
||||
}
|
||||
// Look for attribute.
|
||||
val := "";
|
||||
k := strings.ToLower(f.Name);
|
||||
for _, a := range start.Attr {
|
||||
if strings.ToLower(a.Name.Local) == k {
|
||||
val = a.Value;
|
||||
break;
|
||||
}
|
||||
}
|
||||
strv.Set(val);
|
||||
|
||||
case "chardata":
|
||||
if saveData == nil {
|
||||
saveData = sv.FieldByIndex(f.Index);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find end element.
|
||||
// Process sub-elements along the way.
|
||||
Loop:
|
||||
for {
|
||||
tok, err := p.Token();
|
||||
if err != nil {
|
||||
return err;
|
||||
}
|
||||
switch t := tok.(type) {
|
||||
case StartElement:
|
||||
// Sub-element.
|
||||
if sv != nil {
|
||||
k := strings.ToLower(t.Name.Local);
|
||||
for i, n := 0, styp.NumField(); i < n; i++ {
|
||||
f := styp.Field(i);
|
||||
if strings.ToLower(f.Name) == k {
|
||||
if err := p.unmarshal(sv.FieldByIndex(f.Index), &t); err != nil {
|
||||
return err;
|
||||
}
|
||||
continue Loop;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Not saving sub-element but still have to skip over it.
|
||||
if err := p.skip(); err != nil {
|
||||
return err;
|
||||
}
|
||||
|
||||
case EndElement:
|
||||
break Loop;
|
||||
|
||||
case CharData:
|
||||
if saveData != nil {
|
||||
data = bytes.Add(data, t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save accumulated character data
|
||||
if saveData != nil {
|
||||
switch t := saveData.(type) {
|
||||
case *reflect.StringValue:
|
||||
t.Set(string(data));
|
||||
case *reflect.SliceValue:
|
||||
t.Set(reflect.NewValue(data).(*reflect.SliceValue));
|
||||
}
|
||||
}
|
||||
|
||||
return nil;
|
||||
}
|
||||
|
||||
// Have already read a start element.
|
||||
// Read tokens until we find the end element.
|
||||
// Token is taking care of making sure the
|
||||
// end element matches the start element we saw.
|
||||
func (p *Parser) skip() os.Error {
|
||||
for {
|
||||
tok, err := p.Token();
|
||||
if err != nil {
|
||||
return err;
|
||||
}
|
||||
switch t := tok.(type) {
|
||||
case StartElement:
|
||||
if err := p.skip(); err != nil {
|
||||
return err;
|
||||
}
|
||||
case EndElement:
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
panic("unreachable");
|
||||
}
|
210
src/pkg/xml/read_test.go
Normal file
210
src/pkg/xml/read_test.go
Normal file
@ -0,0 +1,210 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package xml
|
||||
|
||||
import (
|
||||
"reflect";
|
||||
"testing";
|
||||
)
|
||||
|
||||
// Stripped down Atom feed data structures.
|
||||
|
||||
func TestUnmarshalFeed(t *testing.T) {
|
||||
var f Feed;
|
||||
if err := Unmarshal(StringReader(rssFeedString), &f); err != nil {
|
||||
t.Fatalf("Unmarshal: %s", err);
|
||||
}
|
||||
if !reflect.DeepEqual(f, rssFeed) {
|
||||
t.Fatalf("have %#v\nwant %#v\n\n%#v", f);
|
||||
}
|
||||
}
|
||||
|
||||
// hget http://codereview.appspot.com/rss/mine/rsc
|
||||
const rssFeedString = `
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-us"><title>Code Review - My issues</title><link href="http://codereview.appspot.com/" rel="alternate"></link><link href="http://codereview.appspot.com/rss/mine/rsc" rel="self"></link><id>http://codereview.appspot.com/</id><updated>2009-10-04T01:35:58+00:00</updated><author><name>rietveld</name></author><entry><title>rietveld: an attempt at pubsubhubbub
|
||||
</title><link href="http://codereview.appspot.com/126085" rel="alternate"></link><updated>2009-10-04T01:35:58+00:00</updated><author><name>email-address-removed</name></author><id>urn:md5:134d9179c41f806be79b3a5f7877d19a</id><summary type="html">
|
||||
An attempt at adding pubsubhubbub support to Rietveld.
|
||||
http://code.google.com/p/pubsubhubbub
|
||||
http://code.google.com/p/rietveld/issues/detail?id=155
|
||||
|
||||
The server side of the protocol is trivial:
|
||||
1. add a &lt;link rel=&quot;hub&quot; href=&quot;hub-server&quot;&gt; tag to all
|
||||
feeds that will be pubsubhubbubbed.
|
||||
2. every time one of those feeds changes, tell the hub
|
||||
with a simple POST request.
|
||||
|
||||
I have tested this by adding debug prints to a local hub
|
||||
server and checking that the server got the right publish
|
||||
requests.
|
||||
|
||||
I can&#39;t quite get the server to work, but I think the bug
|
||||
is not in my code. I think that the server expects to be
|
||||
able to grab the feed and see the feed&#39;s actual URL in
|
||||
the link rel=&quot;self&quot;, but the default value for that drops
|
||||
the :port from the URL, and I cannot for the life of me
|
||||
figure out how to get the Atom generator deep inside
|
||||
django not to do that, or even where it is doing that,
|
||||
or even what code is running to generate the Atom feed.
|
||||
(I thought I knew but I added some assert False statements
|
||||
and it kept running!)
|
||||
|
||||
Ignoring that particular problem, I would appreciate
|
||||
feedback on the right way to get the two values at
|
||||
the top of feeds.py marked NOTE(rsc).
|
||||
|
||||
|
||||
</summary></entry><entry><title>rietveld: correct tab handling
|
||||
</title><link href="http://codereview.appspot.com/124106" rel="alternate"></link><updated>2009-10-03T23:02:17+00:00</updated><author><name>email-address-removed</name></author><id>urn:md5:0a2a4f19bb815101f0ba2904aed7c35a</id><summary type="html">
|
||||
This fixes the buggy tab rendering that can be seen at
|
||||
http://codereview.appspot.com/116075/diff/1/2
|
||||
|
||||
The fundamental problem was that the tab code was
|
||||
not being told what column the text began in, so it
|
||||
didn&#39;t know where to put the tab stops. Another problem
|
||||
was that some of the code assumed that string byte
|
||||
offsets were the same as column offsets, which is only
|
||||
true if there are no tabs.
|
||||
|
||||
In the process of fixing this, I cleaned up the arguments
|
||||
to Fold and ExpandTabs and renamed them Break and
|
||||
_ExpandTabs so that I could be sure that I found all the
|
||||
call sites. I also wanted to verify that ExpandTabs was
|
||||
not being used from outside intra_region_diff.py.
|
||||
|
||||
|
||||
</summary></entry></feed>`
|
||||
|
||||
type Feed struct {
|
||||
XMLName Name "http://www.w3.org/2005/Atom feed";
|
||||
Title string;
|
||||
Id string;
|
||||
Link []Link;
|
||||
Updated Time;
|
||||
Author Person;
|
||||
Entry []Entry;
|
||||
}
|
||||
|
||||
type Entry struct {
|
||||
Title string;
|
||||
Id string;
|
||||
Link []Link;
|
||||
Updated Time;
|
||||
Author Person;
|
||||
Summary Text;
|
||||
}
|
||||
|
||||
type Link struct {
|
||||
Rel string "attr";
|
||||
Href string "attr";
|
||||
}
|
||||
|
||||
type Person struct {
|
||||
Name string;
|
||||
URI string;
|
||||
Email string;
|
||||
}
|
||||
|
||||
type Text struct {
|
||||
Type string "attr";
|
||||
Body string "chardata";
|
||||
}
|
||||
|
||||
type Time string
|
||||
|
||||
var rssFeed = Feed{
|
||||
XMLName: Name{"http://www.w3.org/2005/Atom", "feed"},
|
||||
Title: "Code Review - My issues",
|
||||
Link: []Link{
|
||||
Link{Rel: "alternate", Href: "http://codereview.appspot.com/"},
|
||||
Link{Rel: "self", Href: "http://codereview.appspot.com/rss/mine/rsc"},
|
||||
},
|
||||
Id: "http://codereview.appspot.com/",
|
||||
Updated: "2009-10-04T01:35:58+00:00",
|
||||
Author: Person{
|
||||
Name: "rietveld"
|
||||
},
|
||||
Entry: []Entry{
|
||||
Entry{
|
||||
Title: "rietveld: an attempt at pubsubhubbub\n",
|
||||
Link: []Link{
|
||||
Link{Rel: "alternate", Href: "http://codereview.appspot.com/126085"},
|
||||
},
|
||||
Updated: "2009-10-04T01:35:58+00:00",
|
||||
Author: Person{
|
||||
Name: "email-address-removed"
|
||||
},
|
||||
Id: "urn:md5:134d9179c41f806be79b3a5f7877d19a",
|
||||
Summary: Text{
|
||||
Type: "html",
|
||||
Body: `
|
||||
An attempt at adding pubsubhubbub support to Rietveld.
|
||||
http://code.google.com/p/pubsubhubbub
|
||||
http://code.google.com/p/rietveld/issues/detail?id=155
|
||||
|
||||
The server side of the protocol is trivial:
|
||||
1. add a <link rel="hub" href="hub-server"> tag to all
|
||||
feeds that will be pubsubhubbubbed.
|
||||
2. every time one of those feeds changes, tell the hub
|
||||
with a simple POST request.
|
||||
|
||||
I have tested this by adding debug prints to a local hub
|
||||
server and checking that the server got the right publish
|
||||
requests.
|
||||
|
||||
I can't quite get the server to work, but I think the bug
|
||||
is not in my code. I think that the server expects to be
|
||||
able to grab the feed and see the feed's actual URL in
|
||||
the link rel="self", but the default value for that drops
|
||||
the :port from the URL, and I cannot for the life of me
|
||||
figure out how to get the Atom generator deep inside
|
||||
django not to do that, or even where it is doing that,
|
||||
or even what code is running to generate the Atom feed.
|
||||
(I thought I knew but I added some assert False statements
|
||||
and it kept running!)
|
||||
|
||||
Ignoring that particular problem, I would appreciate
|
||||
feedback on the right way to get the two values at
|
||||
the top of feeds.py marked NOTE(rsc).
|
||||
|
||||
|
||||
`
|
||||
},
|
||||
},
|
||||
Entry{
|
||||
Title: "rietveld: correct tab handling\n",
|
||||
Link: []Link{
|
||||
Link{Rel: "alternate", Href: "http://codereview.appspot.com/124106"},
|
||||
},
|
||||
Updated: "2009-10-03T23:02:17+00:00",
|
||||
Author: Person{
|
||||
Name: "email-address-removed"
|
||||
},
|
||||
Id: "urn:md5:0a2a4f19bb815101f0ba2904aed7c35a",
|
||||
Summary: Text{
|
||||
Type: "html",
|
||||
Body: `
|
||||
This fixes the buggy tab rendering that can be seen at
|
||||
http://codereview.appspot.com/116075/diff/1/2
|
||||
|
||||
The fundamental problem was that the tab code was
|
||||
not being told what column the text began in, so it
|
||||
didn't know where to put the tab stops. Another problem
|
||||
was that some of the code assumed that string byte
|
||||
offsets were the same as column offsets, which is only
|
||||
true if there are no tabs.
|
||||
|
||||
In the process of fixing this, I cleaned up the arguments
|
||||
to Fold and ExpandTabs and renamed them Break and
|
||||
_ExpandTabs so that I could be sure that I found all the
|
||||
call sites. I also wanted to verify that ExpandTabs was
|
||||
not being used from outside intra_region_diff.py.
|
||||
|
||||
|
||||
`
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user