From 49014c5b12bd00817203c5ae5dbd4ec1c8f6d157 Mon Sep 17 00:00:00 2001 From: Nigel Tao Date: Wed, 8 Dec 2010 08:59:20 +1100 Subject: [PATCH] html: handle unexpected EOF during parsing. This lets us parse HTML like "foo". R=gri CC=golang-dev https://golang.org/cl/3460043 --- src/pkg/html/parse.go | 59 +++++++++++++++++++++++++++----------- src/pkg/html/parse_test.go | 13 ++++----- 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index d3c1f12135f..acc3eccbcc7 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -32,11 +32,6 @@ type Node struct { Attr []Attribute } -// An insertion mode (section 10.2.3.1) is the state transition function from -// a particular state in the HTML5 parser's state machine. In addition to -// returning the next state, it also returns whether the token was consumed. -type insertionMode func(*parser) (insertionMode, bool) - // A parser implements the HTML5 parsing algorithm: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction type parser struct { @@ -121,11 +116,12 @@ func (p *parser) read() os.Error { p.tok.Attr = nil return nil } - if tokenType := p.tokenizer.Next(); tokenType == ErrorToken { - return p.tokenizer.Error() - } + p.tokenizer.Next() p.tok = p.tokenizer.Token() - if p.tok.Type == SelfClosingTagToken { + switch p.tok.Type { + case ErrorToken: + return p.tokenizer.Error() + case SelfClosingTagToken: p.hasSelfClosingToken = true p.tok.Type = StartTagToken } @@ -137,6 +133,13 @@ func (p *parser) acknowledgeSelfClosingTag() { p.hasSelfClosingToken = false } +// An insertion mode (section 10.2.3.1) is the state transition function from +// a particular state in the HTML5 parser's state machine. It updates the +// parser's fields depending on parser.token (where ErrorToken means EOF). In +// addition to returning the next insertionMode state, it also returns whether +// the token was consumed. +type insertionMode func(*parser) (insertionMode, bool) + // Section 10.2.5.4. func initialInsertionMode(p *parser) (insertionMode, bool) { // TODO(nigeltao): check p.tok for DOCTYPE. @@ -151,6 +154,8 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) { implied bool ) switch p.tok.Type { + case ErrorToken: + implied = true case TextToken: // TODO(nigeltao): distinguish whitespace text from others. implied = true @@ -162,7 +167,12 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) { implied = true } case EndTagToken: - // TODO. + switch p.tok.Data { + case "head", "body", "html", "br": + implied = true + default: + // Ignore the token. + } } if add || implied { p.addChild(&Node{ @@ -182,6 +192,8 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { implied bool ) switch p.tok.Type { + case ErrorToken: + implied = true case TextToken: // TODO(nigeltao): distinguish whitespace text from others. implied = true @@ -191,12 +203,17 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { add = true attr = p.tok.Attr case "html": - // TODO. + return inBodyInsertionMode, false default: implied = true } case EndTagToken: - // TODO. + switch p.tok.Data { + case "head", "body", "html", "br": + implied = true + default: + // Ignore the token. + } } if add || implied { p.addChild(&Node{ @@ -215,7 +232,7 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) { implied bool ) switch p.tok.Type { - case TextToken: + case ErrorToken, TextToken: implied = true case StartTagToken: switch p.tok.Data { @@ -251,7 +268,7 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) { implied bool ) switch p.tok.Type { - case TextToken: + case ErrorToken, TextToken: implied = true framesetOK = true case StartTagToken: @@ -290,6 +307,8 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) { func inBodyInsertionMode(p *parser) (insertionMode, bool) { var endP bool switch p.tok.Type { + case ErrorToken: + // No-op. case TextToken: p.addText(p.tok.Data) p.framesetOK = false @@ -363,6 +382,8 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) { // Section 10.2.5.22. func afterBodyInsertionMode(p *parser) (insertionMode, bool) { switch p.tok.Type { + case ErrorToken: + // TODO. case TextToken: // TODO. case StartTagToken: @@ -395,6 +416,7 @@ func Parse(r io.Reader) (*Node, os.Error) { scripting: true, framesetOK: true, } + // Iterate until EOF. Any other error will cause an early return. im, consumed := initialInsertionMode, true for { if consumed { @@ -407,8 +429,11 @@ func Parse(r io.Reader) (*Node, os.Error) { } im, consumed = im(p) } - // TODO(nigeltao): clean up, depending on the value of im. - // The specification's algorithm does clean up on reading an EOF 'token', - // but in go we represent EOF by an os.Error instead. + // Loop until the final token (the ErrorToken signifying EOF) is consumed. + for { + if im, consumed = im(p); consumed { + break + } + } return p.doc, nil } diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go index 7fa4f427671..839a034b7d5 100644 --- a/src/pkg/html/parse_test.go +++ b/src/pkg/html/parse_test.go @@ -106,12 +106,11 @@ func dump(n *Node) (string, os.Error) { if n == nil || len(n.Child) == 0 { return "", nil } - if len(n.Child) > 1 { - return "too many children", nil - } b := bytes.NewBuffer(nil) - if err := dumpLevel(b, n.Child[0], 0); err != nil { - return "", err + for _, child := range n.Child { + if err := dumpLevel(b, child, 0); err != nil { + return "", err + } } return b.String(), nil } @@ -124,8 +123,8 @@ func TestParser(t *testing.T) { for _, filename := range filenames { rc := make(chan io.Reader) go readDat(filename, rc) - // TODO(nigeltao): Process all test cases, not just the first three. - for i := 0; i < 3; i++ { + // TODO(nigeltao): Process all test cases, not just a subset. + for i := 0; i < 19; i++ { // Parse the #data section. doc, err := Parse(<-rc) if err != nil {