1
0
mirror of https://github.com/golang/go synced 2024-11-20 11:04:56 -07:00

html: handle unexpected EOF during parsing.

This lets us parse HTML like "<html>foo".

R=gri
CC=golang-dev
https://golang.org/cl/3460043
This commit is contained in:
Nigel Tao 2010-12-08 08:59:20 +11:00
parent 8d50557979
commit 49014c5b12
2 changed files with 48 additions and 24 deletions

View File

@ -32,11 +32,6 @@ type Node struct {
Attr []Attribute Attr []Attribute
} }
// An insertion mode (section 10.2.3.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. In addition to
// returning the next state, it also returns whether the token was consumed.
type insertionMode func(*parser) (insertionMode, bool)
// A parser implements the HTML5 parsing algorithm: // A parser implements the HTML5 parsing algorithm:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
type parser struct { type parser struct {
@ -121,11 +116,12 @@ func (p *parser) read() os.Error {
p.tok.Attr = nil p.tok.Attr = nil
return nil return nil
} }
if tokenType := p.tokenizer.Next(); tokenType == ErrorToken { p.tokenizer.Next()
return p.tokenizer.Error()
}
p.tok = p.tokenizer.Token() p.tok = p.tokenizer.Token()
if p.tok.Type == SelfClosingTagToken { switch p.tok.Type {
case ErrorToken:
return p.tokenizer.Error()
case SelfClosingTagToken:
p.hasSelfClosingToken = true p.hasSelfClosingToken = true
p.tok.Type = StartTagToken p.tok.Type = StartTagToken
} }
@ -137,6 +133,13 @@ func (p *parser) acknowledgeSelfClosingTag() {
p.hasSelfClosingToken = false p.hasSelfClosingToken = false
} }
// An insertion mode (section 10.2.3.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF). In
// addition to returning the next insertionMode state, it also returns whether
// the token was consumed.
type insertionMode func(*parser) (insertionMode, bool)
// Section 10.2.5.4. // Section 10.2.5.4.
func initialInsertionMode(p *parser) (insertionMode, bool) { func initialInsertionMode(p *parser) (insertionMode, bool) {
// TODO(nigeltao): check p.tok for DOCTYPE. // TODO(nigeltao): check p.tok for DOCTYPE.
@ -151,6 +154,8 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
implied bool implied bool
) )
switch p.tok.Type { switch p.tok.Type {
case ErrorToken:
implied = true
case TextToken: case TextToken:
// TODO(nigeltao): distinguish whitespace text from others. // TODO(nigeltao): distinguish whitespace text from others.
implied = true implied = true
@ -162,7 +167,12 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
implied = true implied = true
} }
case EndTagToken: case EndTagToken:
// TODO. switch p.tok.Data {
case "head", "body", "html", "br":
implied = true
default:
// Ignore the token.
}
} }
if add || implied { if add || implied {
p.addChild(&Node{ p.addChild(&Node{
@ -182,6 +192,8 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
implied bool implied bool
) )
switch p.tok.Type { switch p.tok.Type {
case ErrorToken:
implied = true
case TextToken: case TextToken:
// TODO(nigeltao): distinguish whitespace text from others. // TODO(nigeltao): distinguish whitespace text from others.
implied = true implied = true
@ -191,12 +203,17 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
add = true add = true
attr = p.tok.Attr attr = p.tok.Attr
case "html": case "html":
// TODO. return inBodyInsertionMode, false
default: default:
implied = true implied = true
} }
case EndTagToken: case EndTagToken:
// TODO. switch p.tok.Data {
case "head", "body", "html", "br":
implied = true
default:
// Ignore the token.
}
} }
if add || implied { if add || implied {
p.addChild(&Node{ p.addChild(&Node{
@ -215,7 +232,7 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) {
implied bool implied bool
) )
switch p.tok.Type { switch p.tok.Type {
case TextToken: case ErrorToken, TextToken:
implied = true implied = true
case StartTagToken: case StartTagToken:
switch p.tok.Data { switch p.tok.Data {
@ -251,7 +268,7 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
implied bool implied bool
) )
switch p.tok.Type { switch p.tok.Type {
case TextToken: case ErrorToken, TextToken:
implied = true implied = true
framesetOK = true framesetOK = true
case StartTagToken: case StartTagToken:
@ -290,6 +307,8 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
func inBodyInsertionMode(p *parser) (insertionMode, bool) { func inBodyInsertionMode(p *parser) (insertionMode, bool) {
var endP bool var endP bool
switch p.tok.Type { switch p.tok.Type {
case ErrorToken:
// No-op.
case TextToken: case TextToken:
p.addText(p.tok.Data) p.addText(p.tok.Data)
p.framesetOK = false p.framesetOK = false
@ -363,6 +382,8 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) {
// Section 10.2.5.22. // Section 10.2.5.22.
func afterBodyInsertionMode(p *parser) (insertionMode, bool) { func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
switch p.tok.Type { switch p.tok.Type {
case ErrorToken:
// TODO.
case TextToken: case TextToken:
// TODO. // TODO.
case StartTagToken: case StartTagToken:
@ -395,6 +416,7 @@ func Parse(r io.Reader) (*Node, os.Error) {
scripting: true, scripting: true,
framesetOK: true, framesetOK: true,
} }
// Iterate until EOF. Any other error will cause an early return.
im, consumed := initialInsertionMode, true im, consumed := initialInsertionMode, true
for { for {
if consumed { if consumed {
@ -407,8 +429,11 @@ func Parse(r io.Reader) (*Node, os.Error) {
} }
im, consumed = im(p) im, consumed = im(p)
} }
// TODO(nigeltao): clean up, depending on the value of im. // Loop until the final token (the ErrorToken signifying EOF) is consumed.
// The specification's algorithm does clean up on reading an EOF 'token', for {
// but in go we represent EOF by an os.Error instead. if im, consumed = im(p); consumed {
break
}
}
return p.doc, nil return p.doc, nil
} }

View File

@ -106,12 +106,11 @@ func dump(n *Node) (string, os.Error) {
if n == nil || len(n.Child) == 0 { if n == nil || len(n.Child) == 0 {
return "", nil return "", nil
} }
if len(n.Child) > 1 {
return "too many children", nil
}
b := bytes.NewBuffer(nil) b := bytes.NewBuffer(nil)
if err := dumpLevel(b, n.Child[0], 0); err != nil { for _, child := range n.Child {
return "", err if err := dumpLevel(b, child, 0); err != nil {
return "", err
}
} }
return b.String(), nil return b.String(), nil
} }
@ -124,8 +123,8 @@ func TestParser(t *testing.T) {
for _, filename := range filenames { for _, filename := range filenames {
rc := make(chan io.Reader) rc := make(chan io.Reader)
go readDat(filename, rc) go readDat(filename, rc)
// TODO(nigeltao): Process all test cases, not just the first three. // TODO(nigeltao): Process all test cases, not just a subset.
for i := 0; i < 3; i++ { for i := 0; i < 19; i++ {
// Parse the #data section. // Parse the #data section.
doc, err := Parse(<-rc) doc, err := Parse(<-rc)
if err != nil { if err != nil {