From 08a47d6f6087ea2baabca741267a82643d289e92 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Tue, 7 Dec 2010 12:02:36 +1100 Subject: [PATCH] html: first cut at a parser. R=gri CC=golang-dev https://golang.org/cl/3355041 --- src/pkg/html/Makefile | 1 + src/pkg/html/doc.go | 31 ++- src/pkg/html/parse.go | 414 +++++++++++++++++++++++++++++++++++++ src/pkg/html/parse_test.go | 153 ++++++++++++++ src/pkg/html/token.go | 82 ++++---- src/pkg/html/token_test.go | 10 +- 6 files changed, 639 insertions(+), 52 deletions(-) create mode 100644 src/pkg/html/parse.go create mode 100644 src/pkg/html/parse_test.go diff --git a/src/pkg/html/Makefile b/src/pkg/html/Makefile index 4bbd98a9368..00e1c05508d 100644 --- a/src/pkg/html/Makefile +++ b/src/pkg/html/Makefile @@ -9,6 +9,7 @@ GOFILES=\ doc.go\ entity.go\ escape.go\ + parse.go\ token.go\ include ../../Make.pkg diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go index 9f5d478b42c..c5338d0781d 100644 --- a/src/pkg/html/doc.go +++ b/src/pkg/html/doc.go @@ -15,7 +15,7 @@ which parses the next token and returns its type, or an error: for { tt := z.Next() - if tt == html.Error { + if tt == html.ErrorToken { // ... return ... } @@ -34,7 +34,7 @@ Entities (such as "<") are unescaped, tag names and attribute keys are lower-cased, and attributes are collected into a []Attribute. For example: for { - if z.Next() == html.Error { + if z.Next() == html.ErrorToken { // Returning os.EOF indicates success. return z.Error() } @@ -49,15 +49,15 @@ call to Next. For example, to extract an HTML page's anchor text: for { tt := z.Next() switch tt { - case Error: + case ErrorToken: return z.Error() - case Text: + case TextToken: if depth > 0 { // emitBytes should copy the []byte it receives, // if it doesn't process it immediately. emitBytes(z.Text()) } - case StartTag, EndTag: + case StartTagToken, EndTagToken: tn, _ := z.TagName() if len(tn) == 1 && tn[0] == 'a' { if tt == StartTag { @@ -69,6 +69,26 @@ call to Next. 
For example, to extract an HTML page's anchor text: } } +Parsing is done by calling Parse with an io.Reader, which returns the root of +the parse tree (the document element) as a *Node. It is the caller's +responsibility to ensure that the Reader provides UTF-8 encoded HTML. For +example, to process each anchor node in depth-first order: + + doc, err := html.Parse(r) + if err != nil { + // ... + } + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "a" { + // Do something with n... + } + for _, c := range n.Child { + f(c) + } + } + f(doc) + The relevant specifications include: http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html @@ -82,6 +102,5 @@ package html // node. Specification compliance is verified by checking expected and actual // outputs over a test suite rather than aiming for algorithmic fidelity. -// TODO(nigeltao): Implement a parser, not just a tokenizer. // TODO(nigeltao): Does a DOM API belong in this package or a separate one? // TODO(nigeltao): How does parsing interact with a JavaScript engine? diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go new file mode 100644 index 00000000000..d3c1f12135f --- /dev/null +++ b/src/pkg/html/parse.go @@ -0,0 +1,414 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "io" + "os" +) + +// A NodeType is the type of a Node. +type NodeType int + +const ( + ErrorNode NodeType = iota + TextNode + DocumentNode + ElementNode + CommentNode +) + +// A Node consists of a NodeType and some Data (tag name for element nodes, +// content for text) and are part of a tree of Nodes. Element nodes may also +// contain a slice of Attributes. Data is unescaped, so that it looks like +// "a are re-interpreted as a two-token sequence: + //
a start tag followed by its matching end tag. hasSelfClosingToken is true if we have just read
+	// the synthetic start tag and the next one due is the matching end tag.
+	hasSelfClosingToken bool
+	// doc is the document root element.
+	doc *Node
+	// The stack of open elements (section 10.2.3.2).
+	stack []*Node
+	// Element pointers (section 10.2.3.4).
+	head, form *Node
+	// Other parsing state flags (section 10.2.3.5).
+	scripting, framesetOK bool
+}
+
+// pop pops the top of the stack of open elements.
+// It will panic if the stack is empty.
+// It returns the node that was removed.
+func (p *parser) pop() *Node {
+	n := len(p.stack)
+	// There is no explicit emptiness check: indexing with n-1 is what panics
+	// when n == 0.
+	ret := p.stack[n-1]
+	p.stack = p.stack[:n-1]
+	return ret
+}
+
+// push pushes onto the stack of open elements.
+// Afterwards n is the node returned by top.
+func (p *parser) push(n *Node) {
+	p.stack = append(p.stack, n)
+}
+
+// top returns the top of the stack of open elements.
+// This is also known as the current node.
+// If the stack is empty, top returns the document root p.doc instead.
+func (p *parser) top() *Node {
+	if n := len(p.stack); n > 0 {
+		return p.stack[n-1]
+	}
+	return p.doc
+}
+
+// addChild adds a child node n to the top element, and pushes n
+// if it is an element node (text nodes do not have children).
+// An element that must not stay open therefore has to be popped by the
+// caller after addChild returns.
+func (p *parser) addChild(n *Node) {
+	m := p.top()
+	m.Child = append(m.Child, n)
+	if n.Type == ElementNode {
+		p.push(n)
+	}
+}
+
+// addText adds text to the current node.
+// Every call appends a new TextNode child holding s; nothing is merged.
+func (p *parser) addText(s string) {
+	// TODO(nigeltao): merge s with previous text, if the preceding node is a text node.
+	// TODO(nigeltao): distinguish whitespace text from others.
+	p.addChild(&Node{
+		Type: TextNode,
+		Data: s,
+	})
+}
+
+// Section 10.2.3.3.
+// addFormattingElement adds n as a child of the current node. At present it
+// does nothing beyond addChild; no list of active formatting elements is
+// recorded yet.
+func (p *parser) addFormattingElement(n *Node) {
+	p.addChild(n)
+	// TODO.
+}
+
+// Section 10.2.3.3.
+// reconstructActiveFormattingElements is currently a no-op placeholder.
+func (p *parser) reconstructActiveFormattingElements() {
+	// TODO.
+}
+
+// read reads the next token. This is usually from the tokenizer, but it may
+// be the synthesized end tag implied by a self-closing tag.
+// It returns the tokenizer's error if no further token could be read.
+func (p *parser) read() os.Error {
+	if p.hasSelfClosingToken {
+		// Deliver the pending synthetic end tag by rewriting p.tok in place:
+		// Data (the tag name) is kept, the type becomes EndTagToken and the
+		// attributes are dropped. The tokenizer is not advanced.
+		p.hasSelfClosingToken = false
+		p.tok.Type = EndTagToken
+		p.tok.Attr = nil
+		return nil
+	}
+	if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
+		// p.tok is left unchanged on error.
+		return p.tokenizer.Error()
+	}
+	p.tok = p.tokenizer.Token()
+	if p.tok.Type == SelfClosingTagToken {
+		// Present a self-closing tag as a start tag now and remember that
+		// the matching end tag is due on the next call.
+		p.hasSelfClosingToken = true
+		p.tok.Type = StartTagToken
+	}
+	return nil
+}
+
+// Section 10.2.4.
+// acknowledgeSelfClosingTag cancels the synthetic end tag that read would
+// otherwise deliver after a self-closing tag.
+func (p *parser) acknowledgeSelfClosingTag() {
+	p.hasSelfClosingToken = false
+}
+
+// Section 10.2.5.4.
+// initialInsertionMode is the state the parser starts in. Like every
+// insertion mode it returns the next mode and whether p.tok was consumed;
+// Parse only reads a new token when the latter is true. Here the token is
+// never consumed, so beforeHTMLInsertionMode sees the same token.
+func initialInsertionMode(p *parser) (insertionMode, bool) {
+	// TODO(nigeltao): check p.tok for DOCTYPE.
+	return beforeHTMLInsertionMode, false
+}
+
+// Section 10.2.5.5.
+// beforeHTMLInsertionMode creates the "html" element. An "html" start tag
+// creates it with the token's attributes and is consumed. Text or any other
+// start tag creates it without attributes and is left unconsumed for the
+// next mode. The next mode is always beforeHeadInsertionMode.
+func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		add     bool        // an explicit "html" start tag was seen
+		attr    []Attribute // attributes for the new element; nil when implied
+		implied bool        // the element is implied by some other token
+	)
+	switch p.tok.Type {
+	case TextToken:
+		// TODO(nigeltao): distinguish whitespace text from others.
+		implied = true
+	case StartTagToken:
+		if p.tok.Data == "html" {
+			add = true
+			attr = p.tok.Attr
+		} else {
+			implied = true
+		}
+	case EndTagToken:
+		// TODO.
+		// NOTE(review): as written, an end tag is consumed and the mode still
+		// advances, but no "html" element is created.
+	}
+	if add || implied {
+		p.addChild(&Node{
+			Type: ElementNode,
+			Data: "html",
+			Attr: attr,
+		})
+	}
+	return beforeHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.6.
+// beforeHeadInsertionMode creates the "head" element. A "head" start tag
+// creates it with the token's attributes and is consumed. Text or any start
+// tag other than "head" and "html" creates it without attributes and is left
+// unconsumed. The next mode is always inHeadInsertionMode.
+//
+// NOTE(review): for an "html" start tag or any end tag no "head" element is
+// pushed, yet the parser still moves to inHeadInsertionMode. That mode pops
+// the stack expecting "head" and panics otherwise, so input such as two
+// consecutive "html" start tags followed by text looks able to trigger the
+// panic. Confirm and handle before exposing Parse to untrusted input.
+func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		add     bool        // an explicit "head" start tag was seen
+		attr    []Attribute // attributes for the new element; nil when implied
+		implied bool        // the element is implied by some other token
+	)
+	switch p.tok.Type {
+	case TextToken:
+		// TODO(nigeltao): distinguish whitespace text from others.
+		implied = true
+	case StartTagToken:
+		switch p.tok.Data {
+		case "head":
+			add = true
+			attr = p.tok.Attr
+		case "html":
+			// TODO.
+		default:
+			implied = true
+		}
+	case EndTagToken:
+		// TODO.
+	}
+	if add || implied {
+		p.addChild(&Node{
+			Type: ElementNode,
+			Data: "head",
+			Attr: attr,
+		})
+	}
+	return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.7.
+// inHeadInsertionMode handles tokens while the "head" element is open.
+// Text, or a start tag other than "meta" and "script", implicitly ends the
+// head: the element is popped and the token is left unconsumed for
+// afterHeadInsertionMode. A "head" end tag pops the element and is consumed.
+// Every other token is consumed and ignored, and the mode does not change.
+// It panics if the popped node's Data is not "head".
+func inHeadInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		pop     bool // an explicit "head" end tag was seen
+		implied bool // the end of the head is implied by some other token
+	)
+	switch p.tok.Type {
+	case TextToken:
+		implied = true
+	case StartTagToken:
+		switch p.tok.Data {
+		case "meta":
+			// TODO.
+		case "script":
+			// TODO.
+		default:
+			implied = true
+		}
+	case EndTagToken:
+		if p.tok.Data == "head" {
+			pop = true
+		}
+		// TODO.
+	}
+	if pop || implied {
+		n := p.pop()
+		if n.Data != "head" {
+			panic("html: bad parser state")
+		}
+		return afterHeadInsertionMode, !implied
+	}
+	return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.9.
+// afterHeadInsertionMode creates the "body" element. A "body" start tag
+// creates it with the token's attributes, is consumed, and sets p.framesetOK
+// to false. Text or a start tag not listed below creates it without
+// attributes, is left unconsumed, and sets p.framesetOK to true. All other
+// tokens are consumed without creating anything. The next mode is always
+// inBodyInsertionMode.
+func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
+	var (
+		add        bool        // an explicit "body" start tag was seen
+		attr       []Attribute // attributes for the new element; nil when implied
+		framesetOK bool        // value stored in p.framesetOK if a body is created
+		implied    bool        // the element is implied by some other token
+	)
+	switch p.tok.Type {
+	case TextToken:
+		implied = true
+		framesetOK = true
+	case StartTagToken:
+		switch p.tok.Data {
+		case "html":
+			// TODO.
+		case "body":
+			add = true
+			attr = p.tok.Attr
+			framesetOK = false
+		case "frameset":
+			// TODO.
+		case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
+			// TODO.
+		case "head":
+			// TODO.
+		default:
+			implied = true
+			framesetOK = true
+		}
+	case EndTagToken:
+		// TODO.
+	}
+	if add || implied {
+		p.addChild(&Node{
+			Type: ElementNode,
+			Data: "body",
+			Attr: attr,
+		})
+		p.framesetOK = framesetOK
+	}
+	return inBodyInsertionMode, !implied
+}
+
+// Section 10.2.5.10.
+// inBodyInsertionMode handles tokens inside the "body" element. It stays in
+// this mode for everything except a "body" end tag, which moves to
+// afterBodyInsertionMode. The token is consumed unless an open "p" element
+// had to be closed first (endP), in which case the same token is handled
+// again on the next call.
+func inBodyInsertionMode(p *parser) (insertionMode, bool) {
+	// endP records that the current node is a "p" which the incoming
+	// block-level start tag implicitly closes.
+	var endP bool
+	switch p.tok.Type {
+	case TextToken:
+		p.addText(p.tok.Data)
+		p.framesetOK = false
+	case StartTagToken:
+		switch p.tok.Data {
+		case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
+			// TODO(nigeltao): Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2.
+			// Only the current node is inspected: if it is a "p", close it
+			// and re-process this token; otherwise open the new element.
+			n := p.top()
+			if n.Type == ElementNode && n.Data == "p" {
+				endP = true
+			} else {
+				p.addChild(&Node{
+					Type: ElementNode,
+					Data: p.tok.Data,
+					Attr: p.tok.Attr,
+				})
+			}
+		case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
+			p.reconstructActiveFormattingElements()
+			p.addFormattingElement(&Node{
+				Type: ElementNode,
+				Data: p.tok.Data,
+				Attr: p.tok.Attr,
+			})
+		case "area", "br", "embed", "img", "input", "keygen", "wbr":
+			p.reconstructActiveFormattingElements()
+			p.addChild(&Node{
+				Type: ElementNode,
+				Data: p.tok.Data,
+				Attr: p.tok.Attr,
+			})
+			// addChild pushed the element; pop it again at once so that it
+			// never receives children, and drop any pending synthetic end
+			// tag from the self-closing form.
+			p.pop()
+			p.acknowledgeSelfClosingTag()
+			p.framesetOK = false
+		case "hr":
+			// TODO(nigeltao): auto-insert </p> if necessary.
+			p.addChild(&Node{
+				Type: ElementNode,
+				Data: p.tok.Data,
+				Attr: p.tok.Attr,
+			})
+			// Same push-then-pop handling as for "br" above.
+			p.pop()
+			p.acknowledgeSelfClosingTag()
+			p.framesetOK = false
+		default:
+			// TODO.
+		}
+	case EndTagToken:
+		switch p.tok.Data {
+		case "body":
+			// TODO(nigeltao): autoclose the stack of open elements.
+			return afterBodyInsertionMode, true
+		case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
+			// TODO(nigeltao): implement the "adoption agency" algorithm:
+			// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
+			// NOTE(review): the current node is popped without checking that
+			// its name matches p.tok.Data, so a stray end tag removes an
+			// unrelated element, and enough stray end tags would empty the
+			// stack and make pop panic. Confirm and guard.
+			p.pop()
+		default:
+			// TODO.
+		}
+	}
+	if endP {
+		// TODO(nigeltao): do the proper algorithm.
+		n := p.pop()
+		// endP is only set after top was seen to be a "p" element, so any
+		// other node here means the stack changed unexpectedly.
+		if n.Type != ElementNode || n.Data != "p" {
+			panic("unreachable")
+		}
+	}
+	return inBodyInsertionMode, !endP
+}
+
+// Section 10.2.5.22.
+// afterBodyInsertionMode consumes every token. Only an "html" end tag changes
+// the mode, to afterAfterBodyInsertionMode; everything else is ignored.
+func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
+	switch p.tok.Type {
+	case TextToken:
+		// TODO.
+	case StartTagToken:
+		// TODO.
+	case EndTagToken:
+		switch p.tok.Data {
+		case "html":
+			// TODO(nigeltao): autoclose the stack of open elements.
+			return afterAfterBodyInsertionMode, true
+		default:
+			// TODO.
+		}
+	}
+	return afterBodyInsertionMode, true
+}
+
+// Section 10.2.5.25.
+// afterAfterBodyInsertionMode hands the token, unconsumed, back to
+// inBodyInsertionMode.
+func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) {
+	return inBodyInsertionMode, false
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+// The input is assumed to be UTF-8 encoded.
+// Reaching os.EOF ends parsing normally; any other read error is returned
+// together with a nil Node.
+func Parse(r io.Reader) (*Node, os.Error) {
+	p := &parser{
+		tokenizer: NewTokenizer(r),
+		doc: &Node{
+			Type: DocumentNode,
+		},
+		scripting:  true,
+		framesetOK: true,
+	}
+	// Run the insertion-mode state machine. A mode that reports the token as
+	// not consumed is followed by the next mode seeing the same token.
+	im, consumed := initialInsertionMode, true
+	for {
+		if consumed {
+			if err := p.read(); err != nil {
+				if err == os.EOF {
+					break
+				}
+				return nil, err
+			}
+		}
+		im, consumed = im(p)
+	}
+	// TODO(nigeltao): clean up, depending on the value of im.
+	// The specification's algorithm does clean up on reading an EOF 'token',
+	// but in Go we represent EOF by an os.Error instead.
+	return p.doc, nil
+}
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
new file mode 100644
index 00000000000..7fa4f427671
--- /dev/null
+++ b/src/pkg/html/parse_test.go
@@ -0,0 +1,153 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+// devNull is a writer that discards everything written to it.
+type devNull struct{}
+
+// Write reports all of p as written without storing it.
+func (devNull) Write(p []byte) (int, os.Error) {
+	return len(p), nil
+}
+
+// pipeErr returns the read end of a pipe whose write end has already been
+// closed with err.
+func pipeErr(err os.Error) io.Reader {
+	pr, pw := io.Pipe()
+	pw.CloseWithError(err)
+	return pr
+}
+
+// readDat reads testdata/webkit/filename and sends one io.Reader on c for
+// each section of the file. TestParser runs it in its own goroutine.
+// If the file cannot be opened, a single Reader carrying that error is sent.
+//
+// NOTE(review): sends on c and writes to the section pipes can block until
+// the receiver catches up. TestParser stops after three test cases, so this
+// goroutine appears to stay blocked and the deferred f.Close never runs.
+// Confirm whether that matters for the test binary.
+func readDat(filename string, c chan io.Reader) {
+	f, err := os.Open("testdata/webkit/"+filename, os.O_RDONLY, 0600)
+	if err != nil {
+		c <- pipeErr(err)
+		return
+	}
+	defer f.Close()
+
+	// Loop through the lines of the file. Each line beginning with "#" denotes
+	// a new section, which is returned as a separate io.Reader.
+	r := bufio.NewReader(f)
+	// pw is the write end of the current section's pipe; nil before the first
+	// section and after a failed write.
+	var pw *io.PipeWriter
+	for {
+		line, err := r.ReadSlice('\n')
+		if err != nil {
+			// Report the read error (including os.EOF) through the open
+			// section, or through a fresh Reader if no section is open.
+			if pw != nil {
+				pw.CloseWithError(err)
+				pw = nil
+			} else {
+				c <- pipeErr(err)
+			}
+			return
+		}
+		if len(line) == 0 {
+			continue
+		}
+		if line[0] == '#' {
+			// Section header: finish the previous section and start a new
+			// one. The header line itself is not forwarded.
+			if pw != nil {
+				pw.Close()
+			}
+			var pr *io.PipeReader
+			pr, pw = io.Pipe()
+			c <- pr
+			continue
+		}
+		if line[0] != '|' {
+			// Strip the trailing '\n'.
+			line = line[:len(line)-1]
+		}
+		if pw != nil {
+			if _, err := pw.Write(line); err != nil {
+				pw.CloseWithError(err)
+				pw = nil
+			}
+		}
+	}
+}
+
+// dumpLevel writes n and, recursively, its children to w, one node per line,
+// each line starting with "| " followed by indentation for level.
+// Element nodes are written as <name> and text nodes as a quoted string;
+// attributes are not written. It returns an error for any other node type.
+// NOTE(review): errors from the writes to w are ignored.
+func dumpLevel(w io.Writer, n *Node, level int) os.Error {
+	io.WriteString(w, "| ")
+	for i := 0; i < level; i++ {
+		// NOTE(review): whitespace in this copy of the patch was collapsed
+		// during extraction, so the width of this indent literal may not be
+		// the committed one. Confirm against the repository.
+		io.WriteString(w, " ")
+	}
+	switch n.Type {
+	case ErrorNode:
+		return os.NewError("unexpected ErrorNode")
+	case DocumentNode:
+		return os.NewError("unexpected DocumentNode")
+	case ElementNode:
+		fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
+	case TextNode:
+		fmt.Fprintf(w, "%q", EscapeString(n.Data))
+	case CommentNode:
+		return os.NewError("COMMENT")
+	default:
+		return os.NewError("unknown node type")
+	}
+	io.WriteString(w, "\n")
+	for _, c := range n.Child {
+		if err := dumpLevel(w, c, level+1); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// dump returns the dumpLevel output for the only child of n.
+// It returns "" if n is nil or has no children. If n has more than one
+// child it returns the literal text "too many children" with a nil error,
+// which then shows up as a mismatch in the caller's comparison.
+func dump(n *Node) (string, os.Error) {
+	if n == nil || len(n.Child) == 0 {
+		return "", nil
+	}
+	if len(n.Child) > 1 {
+		return "too many children", nil
+	}
+	b := bytes.NewBuffer(nil)
+	if err := dumpLevel(b, n.Child[0], 0); err != nil {
+		return "", err
+	}
+	return b.String(), nil
+}
+
+// TestParser parses the first test cases of each listed .dat file and
+// compares the dumped parse tree with the expected tree from the file.
+// Each test case is read as three consecutive sections from readDat:
+// the input, an error section that is discarded, and the expected tree.
+func TestParser(t *testing.T) {
+	// TODO(nigeltao): Process all the .dat files, not just the first one.
+	filenames := []string{
+		"tests1.dat",
+	}
+	for _, filename := range filenames {
+		rc := make(chan io.Reader)
+		go readDat(filename, rc)
+		// TODO(nigeltao): Process all test cases, not just the first three.
+		for i := 0; i < 3; i++ {
+			// Parse the #data section.
+			doc, err := Parse(<-rc)
+			if err != nil {
+				t.Fatal(err)
+			}
+			actual, err := dump(doc)
+			if err != nil {
+				t.Fatal(err)
+			}
+			// Skip the #error section.
+			if _, err := io.Copy(devNull{}, <-rc); err != nil {
+				t.Fatal(err)
+			}
+			// Compare the parsed tree to the #document section.
+			b, err := ioutil.ReadAll(<-rc)
+			if err != nil {
+				t.Fatal(err)
+			}
+			expected := string(b)
+			if actual != expected {
+				t.Errorf("%s test #%d, actual vs expected:\n----\n%s----\n%s----", filename, i, actual, expected)
+			}
+		}
+	}
+}
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index 0d4de254308..dc2a6ec5c31 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -15,30 +15,30 @@ import (
 type TokenType int
 
 const (
-	// Error means that an error occurred during tokenization.
-	Error TokenType = iota
-	// Text means a text node.
-	Text
-	// A StartTag looks like <a>.
-	StartTag
-	// An EndTag looks like </a>.
-	EndTag
-	// A SelfClosingTag tag looks like <br/>
. - SelfClosingTag + // ErrorToken means that an error occurred during tokenization. + ErrorToken TokenType = iota + // TextToken means a text node. + TextToken + // A StartTagToken looks like <a>. + StartTagToken + // An EndTagToken looks like </a>. + EndTagToken + // A SelfClosingTagToken tag looks like <br/>
. + SelfClosingTagToken ) // String returns a string representation of the TokenType. func (t TokenType) String() string { switch t { - case Error: + case ErrorToken: return "Error" - case Text: + case TextToken: return "Text" - case StartTag: + case StartTagToken: return "StartTag" - case EndTag: + case EndTagToken: return "EndTag" - case SelfClosingTag: + case SelfClosingTagToken: return "SelfClosingTag" } return "Invalid(" + strconv.Itoa(int(t)) + ")" @@ -81,15 +81,15 @@ func (t Token) tagString() string { // String returns a string representation of the Token. func (t Token) String() string { switch t.Type { - case Error: + case ErrorToken: return "" - case Text: + case TextToken: return EscapeString(t.Data) - case StartTag: + case StartTagToken: return "<" + t.tagString() + ">" - case EndTag: + case EndTagToken: return "" - case SelfClosingTag: + case SelfClosingTagToken: return "<" + t.tagString() + "/>" } return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" @@ -109,10 +109,10 @@ type Tokenizer struct { buf []byte } -// Error returns the error associated with the most recent Error token. This is -// typically os.EOF, meaning the end of tokenization. +// Error returns the error associated with the most recent ErrorToken token. +// This is typically os.EOF, meaning the end of tokenization. func (z *Tokenizer) Error() os.Error { - if z.tt != Error { + if z.tt != ErrorToken { return nil } return z.err @@ -180,40 +180,40 @@ func (z *Tokenizer) readTo(x uint8) os.Error { func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) { c, err := z.readByte() if err != nil { - return Error, err + return ErrorToken, err } switch { case c == '/': - tt = EndTag + tt = EndTagToken // Lower-cased characters are more common in tag names, so we check for them first. 
case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': - tt = StartTag + tt = StartTagToken case c == '!': - return Error, os.NewError("html: TODO(nigeltao): implement comments") + return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments") case c == '?': - return Error, os.NewError("html: TODO(nigeltao): implement XML processing instructions") + return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions") default: - return Error, os.NewError("html: TODO(nigeltao): handle malformed tags") + return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags") } for { c, err := z.readByte() if err != nil { - return Text, err + return TextToken, err } switch c { case '"': err = z.readTo('"') if err != nil { - return Text, err + return TextToken, err } case '\'': err = z.readTo('\'') if err != nil { - return Text, err + return TextToken, err } case '>': - if z.buf[z.p1-2] == '/' && tt == StartTag { - return SelfClosingTag, nil + if z.buf[z.p1-2] == '/' && tt == StartTagToken { + return SelfClosingTagToken, nil } return tt, nil } @@ -224,13 +224,13 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) { // Next scans the next token and returns its type. 
func (z *Tokenizer) Next() TokenType { if z.err != nil { - z.tt = Error + z.tt = ErrorToken return z.tt } z.p0 = z.p1 c, err := z.readByte() if err != nil { - z.tt, z.err = Error, err + z.tt, z.err = ErrorToken, err return z.tt } if c == '<' { @@ -240,15 +240,15 @@ func (z *Tokenizer) Next() TokenType { for { c, err := z.readByte() if err != nil { - z.tt, z.err = Error, err + z.tt, z.err = ErrorToken, err if err == os.EOF { - z.tt = Text + z.tt = TextToken } return z.tt } if c == '<' { z.p1-- - z.tt = Text + z.tt = TextToken return z.tt } } @@ -371,9 +371,9 @@ loop: func (z *Tokenizer) Token() Token { t := Token{Type: z.tt} switch z.tt { - case Text: + case TextToken: t.Data = string(z.Text()) - case StartTag, EndTag, SelfClosingTag: + case StartTagToken, EndTagToken, SelfClosingTagToken: var attr []Attribute name, remaining := z.TagName() for remaining { diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index 5759476eab4..7dbe13ddfe6 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -88,7 +88,7 @@ loop: for _, tt := range tokenTests { z := NewTokenizer(bytes.NewBuffer([]byte(tt.html))) for i, s := range tt.tokens { - if z.Next() == Error { + if z.Next() == ErrorToken { t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error()) continue loop } @@ -134,19 +134,19 @@ loop: for { tt := z.Next() switch tt { - case Error: + case ErrorToken: if z.Error() != os.EOF { t.Error(z.Error()) } break loop - case Text: + case TextToken: if depth > 0 { result.Write(z.Text()) } - case StartTag, EndTag: + case StartTagToken, EndTagToken: tn, _ := z.TagName() if len(tn) == 1 && tn[0] == 'a' { - if tt == StartTag { + if tt == StartTagToken { depth++ } else { depth--