mirror of
https://github.com/golang/go
synced 2024-11-20 11:04:56 -07:00
html: handle unexpected EOF during parsing.
This lets us parse HTML like "<html>foo". R=gri CC=golang-dev https://golang.org/cl/3460043
This commit is contained in:
parent
8d50557979
commit
49014c5b12
@ -32,11 +32,6 @@ type Node struct {
|
|||||||
Attr []Attribute
|
Attr []Attribute
|
||||||
}
|
}
|
||||||
|
|
||||||
// An insertion mode (section 10.2.3.1) is the state transition function from
|
|
||||||
// a particular state in the HTML5 parser's state machine. In addition to
|
|
||||||
// returning the next state, it also returns whether the token was consumed.
|
|
||||||
type insertionMode func(*parser) (insertionMode, bool)
|
|
||||||
|
|
||||||
// A parser implements the HTML5 parsing algorithm:
|
// A parser implements the HTML5 parsing algorithm:
|
||||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
|
||||||
type parser struct {
|
type parser struct {
|
||||||
@ -121,11 +116,12 @@ func (p *parser) read() os.Error {
|
|||||||
p.tok.Attr = nil
|
p.tok.Attr = nil
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
|
p.tokenizer.Next()
|
||||||
return p.tokenizer.Error()
|
|
||||||
}
|
|
||||||
p.tok = p.tokenizer.Token()
|
p.tok = p.tokenizer.Token()
|
||||||
if p.tok.Type == SelfClosingTagToken {
|
switch p.tok.Type {
|
||||||
|
case ErrorToken:
|
||||||
|
return p.tokenizer.Error()
|
||||||
|
case SelfClosingTagToken:
|
||||||
p.hasSelfClosingToken = true
|
p.hasSelfClosingToken = true
|
||||||
p.tok.Type = StartTagToken
|
p.tok.Type = StartTagToken
|
||||||
}
|
}
|
||||||
@ -137,6 +133,13 @@ func (p *parser) acknowledgeSelfClosingTag() {
|
|||||||
p.hasSelfClosingToken = false
|
p.hasSelfClosingToken = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// An insertion mode (section 10.2.3.1) is the state transition function from
|
||||||
|
// a particular state in the HTML5 parser's state machine. It updates the
|
||||||
|
// parser's fields depending on parser.tok (where ErrorToken means EOF). In
|
||||||
|
// addition to returning the next insertionMode state, it also returns whether
|
||||||
|
// the token was consumed.
|
||||||
|
type insertionMode func(*parser) (insertionMode, bool)
|
||||||
|
|
||||||
// Section 10.2.5.4.
|
// Section 10.2.5.4.
|
||||||
func initialInsertionMode(p *parser) (insertionMode, bool) {
|
func initialInsertionMode(p *parser) (insertionMode, bool) {
|
||||||
// TODO(nigeltao): check p.tok for DOCTYPE.
|
// TODO(nigeltao): check p.tok for DOCTYPE.
|
||||||
@ -151,6 +154,8 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
implied bool
|
implied bool
|
||||||
)
|
)
|
||||||
switch p.tok.Type {
|
switch p.tok.Type {
|
||||||
|
case ErrorToken:
|
||||||
|
implied = true
|
||||||
case TextToken:
|
case TextToken:
|
||||||
// TODO(nigeltao): distinguish whitespace text from others.
|
// TODO(nigeltao): distinguish whitespace text from others.
|
||||||
implied = true
|
implied = true
|
||||||
@ -162,7 +167,12 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
implied = true
|
implied = true
|
||||||
}
|
}
|
||||||
case EndTagToken:
|
case EndTagToken:
|
||||||
// TODO.
|
switch p.tok.Data {
|
||||||
|
case "head", "body", "html", "br":
|
||||||
|
implied = true
|
||||||
|
default:
|
||||||
|
// Ignore the token.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if add || implied {
|
if add || implied {
|
||||||
p.addChild(&Node{
|
p.addChild(&Node{
|
||||||
@ -182,6 +192,8 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
implied bool
|
implied bool
|
||||||
)
|
)
|
||||||
switch p.tok.Type {
|
switch p.tok.Type {
|
||||||
|
case ErrorToken:
|
||||||
|
implied = true
|
||||||
case TextToken:
|
case TextToken:
|
||||||
// TODO(nigeltao): distinguish whitespace text from others.
|
// TODO(nigeltao): distinguish whitespace text from others.
|
||||||
implied = true
|
implied = true
|
||||||
@ -191,12 +203,17 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
add = true
|
add = true
|
||||||
attr = p.tok.Attr
|
attr = p.tok.Attr
|
||||||
case "html":
|
case "html":
|
||||||
// TODO.
|
return inBodyInsertionMode, false
|
||||||
default:
|
default:
|
||||||
implied = true
|
implied = true
|
||||||
}
|
}
|
||||||
case EndTagToken:
|
case EndTagToken:
|
||||||
// TODO.
|
switch p.tok.Data {
|
||||||
|
case "head", "body", "html", "br":
|
||||||
|
implied = true
|
||||||
|
default:
|
||||||
|
// Ignore the token.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if add || implied {
|
if add || implied {
|
||||||
p.addChild(&Node{
|
p.addChild(&Node{
|
||||||
@ -215,7 +232,7 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
implied bool
|
implied bool
|
||||||
)
|
)
|
||||||
switch p.tok.Type {
|
switch p.tok.Type {
|
||||||
case TextToken:
|
case ErrorToken, TextToken:
|
||||||
implied = true
|
implied = true
|
||||||
case StartTagToken:
|
case StartTagToken:
|
||||||
switch p.tok.Data {
|
switch p.tok.Data {
|
||||||
@ -251,7 +268,7 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
implied bool
|
implied bool
|
||||||
)
|
)
|
||||||
switch p.tok.Type {
|
switch p.tok.Type {
|
||||||
case TextToken:
|
case ErrorToken, TextToken:
|
||||||
implied = true
|
implied = true
|
||||||
framesetOK = true
|
framesetOK = true
|
||||||
case StartTagToken:
|
case StartTagToken:
|
||||||
@ -290,6 +307,8 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
func inBodyInsertionMode(p *parser) (insertionMode, bool) {
|
func inBodyInsertionMode(p *parser) (insertionMode, bool) {
|
||||||
var endP bool
|
var endP bool
|
||||||
switch p.tok.Type {
|
switch p.tok.Type {
|
||||||
|
case ErrorToken:
|
||||||
|
// No-op.
|
||||||
case TextToken:
|
case TextToken:
|
||||||
p.addText(p.tok.Data)
|
p.addText(p.tok.Data)
|
||||||
p.framesetOK = false
|
p.framesetOK = false
|
||||||
@ -363,6 +382,8 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) {
|
|||||||
// Section 10.2.5.22.
|
// Section 10.2.5.22.
|
||||||
func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
|
func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
|
||||||
switch p.tok.Type {
|
switch p.tok.Type {
|
||||||
|
case ErrorToken:
|
||||||
|
// TODO.
|
||||||
case TextToken:
|
case TextToken:
|
||||||
// TODO.
|
// TODO.
|
||||||
case StartTagToken:
|
case StartTagToken:
|
||||||
@ -395,6 +416,7 @@ func Parse(r io.Reader) (*Node, os.Error) {
|
|||||||
scripting: true,
|
scripting: true,
|
||||||
framesetOK: true,
|
framesetOK: true,
|
||||||
}
|
}
|
||||||
|
// Iterate until EOF. Any other error will cause an early return.
|
||||||
im, consumed := initialInsertionMode, true
|
im, consumed := initialInsertionMode, true
|
||||||
for {
|
for {
|
||||||
if consumed {
|
if consumed {
|
||||||
@ -407,8 +429,11 @@ func Parse(r io.Reader) (*Node, os.Error) {
|
|||||||
}
|
}
|
||||||
im, consumed = im(p)
|
im, consumed = im(p)
|
||||||
}
|
}
|
||||||
// TODO(nigeltao): clean up, depending on the value of im.
|
// Loop until the final token (the ErrorToken signifying EOF) is consumed.
|
||||||
// The specification's algorithm does clean up on reading an EOF 'token',
|
for {
|
||||||
// but in Go we represent EOF by an os.Error instead.
|
if im, consumed = im(p); consumed {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
return p.doc, nil
|
return p.doc, nil
|
||||||
}
|
}
|
||||||
|
@ -106,12 +106,11 @@ func dump(n *Node) (string, os.Error) {
|
|||||||
if n == nil || len(n.Child) == 0 {
|
if n == nil || len(n.Child) == 0 {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
if len(n.Child) > 1 {
|
|
||||||
return "too many children", nil
|
|
||||||
}
|
|
||||||
b := bytes.NewBuffer(nil)
|
b := bytes.NewBuffer(nil)
|
||||||
if err := dumpLevel(b, n.Child[0], 0); err != nil {
|
for _, child := range n.Child {
|
||||||
return "", err
|
if err := dumpLevel(b, child, 0); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return b.String(), nil
|
return b.String(), nil
|
||||||
}
|
}
|
||||||
@ -124,8 +123,8 @@ func TestParser(t *testing.T) {
|
|||||||
for _, filename := range filenames {
|
for _, filename := range filenames {
|
||||||
rc := make(chan io.Reader)
|
rc := make(chan io.Reader)
|
||||||
go readDat(filename, rc)
|
go readDat(filename, rc)
|
||||||
// TODO(nigeltao): Process all test cases, not just the first three.
|
// TODO(nigeltao): Process all test cases, not just a subset.
|
||||||
for i := 0; i < 3; i++ {
|
for i := 0; i < 19; i++ {
|
||||||
// Parse the #data section.
|
// Parse the #data section.
|
||||||
doc, err := Parse(<-rc)
|
doc, err := Parse(<-rc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
Loading…
Reference in New Issue
Block a user