1
0
mirror of https://github.com/golang/go synced 2024-11-24 22:47:58 -07:00

html: implement fragment parsing algorithm

Pass the tests in tests4.dat.

R=nigeltao
CC=golang-dev
https://golang.org/cl/5447055
This commit is contained in:
Andrew Balholm 2011-12-01 12:47:57 +11:00 committed by Nigel Tao
parent 595efd0d20
commit ce27b00f48
2 changed files with 132 additions and 35 deletions

View File

@ -39,6 +39,9 @@ type parser struct {
fosterParenting bool fosterParenting bool
// quirks is whether the parser is operating in "quirks mode." // quirks is whether the parser is operating in "quirks mode."
quirks bool quirks bool
// context is the context element when parsing an HTML fragment
// (section 11.4).
context *Node
} }
func (p *parser) top() *Node { func (p *parser) top() *Node {
@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() {
func (p *parser) resetInsertionMode() { func (p *parser) resetInsertionMode() {
for i := len(p.oe) - 1; i >= 0; i-- { for i := len(p.oe) - 1; i >= 0; i-- {
n := p.oe[i] n := p.oe[i]
if i == 0 { if i == 0 && p.context != nil {
// TODO: set n to the context element, for HTML fragment parsing. n = p.context
} }
switch n.Data { switch n.Data {
case "select": case "select":
p.im = inSelectIM p.im = inSelectIM
@ -1516,6 +1520,29 @@ func afterAfterFramesetIM(p *parser) bool {
return true return true
} }
// parse drives the parser until the input is exhausted.
//
// A new token is read with p.read only when the current insertion mode
// reported the previous one as consumed; otherwise the same token is handed
// to the (possibly changed) insertion mode again. Any error from p.read other
// than io.EOF is returned immediately. On io.EOF the insertion mode keeps
// being invoked until it reports the final token (the ErrorToken signifying
// EOF) as consumed.
func (p *parser) parse() error {
	// Start by fetching a token: nothing has been read yet.
	fetch := true
	for {
		if fetch {
			err := p.read()
			if err == io.EOF {
				// End of input: fall through to the drain loop below.
				break
			}
			if err != nil {
				return err
			}
		}
		fetch = p.im(p)
	}
	// Re-dispatch the EOF token until some insertion mode consumes it.
	for !p.im(p) {
	}
	return nil
}
// Parse returns the parse tree for the HTML from the given Reader. // Parse returns the parse tree for the HTML from the given Reader.
// The input is assumed to be UTF-8 encoded. // The input is assumed to be UTF-8 encoded.
func Parse(r io.Reader) (*Node, error) { func Parse(r io.Reader) (*Node, error) {
@ -1528,24 +1555,62 @@ func Parse(r io.Reader) (*Node, error) {
framesetOK: true, framesetOK: true,
im: initialIM, im: initialIM,
} }
// Iterate until EOF. Any other error will cause an early return. err := p.parse()
consumed := true if err != nil {
for { return nil, err
if consumed {
if err := p.read(); err != nil {
if err == io.EOF {
break
}
return nil, err
}
}
consumed = p.im(p)
}
// Loop until the final token (the ErrorToken signifying EOF) is consumed.
for {
if consumed = p.im(p); consumed {
break
}
} }
return p.doc, nil return p.doc, nil
} }
// ParseFragment parses the HTML fragment read from r and returns the nodes
// that were found. When the fragment is the InnerHTML of an existing element,
// that element should be passed as context; context may be nil.
//
// The returned nodes are detached: their Parent is cleared and they are
// removed from the temporary node that held them during parsing. A non-nil
// error from the underlying parse is returned with a nil slice.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
	p := &parser{
		tokenizer: NewTokenizer(r),
		doc:       &Node{Type: DocumentNode},
		scripting: true,
		context:   context,
	}

	// For these context elements the tokenizer's rawTag is primed with the
	// element name before any input is tokenized.
	if context != nil {
		switch tag := context.Data; tag {
		case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
			p.tokenizer.rawTag = tag
		}
	}

	// A synthetic <html> element is the sole child of the document and the
	// only entry on the stack of open elements; the insertion mode is then
	// derived from that stack (and the context element, if any).
	htmlRoot := &Node{Type: ElementNode, Data: "html"}
	p.doc.Add(htmlRoot)
	p.oe = nodeStack{htmlRoot}
	p.resetInsertionMode()

	// Walk from the context element up through its ancestors and remember
	// the nearest <form> element.
	for anc := context; anc != nil; anc = anc.Parent {
		if anc.Type != ElementNode || anc.Data != "form" {
			continue
		}
		p.form = anc
		break
	}

	if err := p.parse(); err != nil {
		return nil, err
	}

	// With a context element the result is the children of the synthetic
	// root; without one it is the children of the document node.
	holder := p.doc
	if context != nil {
		holder = htmlRoot
	}
	nodes := holder.Child
	holder.Child = nil
	for _, child := range nodes {
		child.Parent = nil
	}
	return nodes, nil
}

View File

@ -16,21 +16,21 @@ import (
) )
// readParseTest reads a single test case from r. // readParseTest reads a single test case from r.
func readParseTest(r *bufio.Reader) (text, want string, err error) { func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
line, err := r.ReadSlice('\n') line, err := r.ReadSlice('\n')
if err != nil { if err != nil {
return "", "", err return "", "", "", err
} }
var b []byte var b []byte
// Read the HTML. // Read the HTML.
if string(line) != "#data\n" { if string(line) != "#data\n" {
return "", "", fmt.Errorf(`got %q want "#data\n"`, line) return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
} }
for { for {
line, err = r.ReadSlice('\n') line, err = r.ReadSlice('\n')
if err != nil { if err != nil {
return "", "", err return "", "", "", err
} }
if line[0] == '#' { if line[0] == '#' {
break break
@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) {
// Skip the error list. // Skip the error list.
if string(line) != "#errors\n" { if string(line) != "#errors\n" {
return "", "", fmt.Errorf(`got %q want "#errors\n"`, line) return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
} }
for { for {
line, err = r.ReadSlice('\n') line, err = r.ReadSlice('\n')
if err != nil { if err != nil {
return "", "", err return "", "", "", err
} }
if line[0] == '#' { if line[0] == '#' {
break break
} }
} }
if string(line) == "#document-fragment\n" {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
context = strings.TrimSpace(string(line))
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
}
// Read the dump of what the parse tree should be. // Read the dump of what the parse tree should be.
if string(line) != "#document\n" { if string(line) != "#document\n" {
return "", "", fmt.Errorf(`got %q want "#document\n"`, line) return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
} }
for { for {
line, err = r.ReadSlice('\n') line, err = r.ReadSlice('\n')
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
return "", "", err return "", "", "", err
} }
if len(line) == 0 || len(line) == 1 && line[0] == '\n' { if len(line) == 0 || len(line) == 1 && line[0] == '\n' {
break break
} }
b = append(b, line...) b = append(b, line...)
} }
return text, string(b), nil return text, string(b), context, nil
} }
func dumpIndent(w io.Writer, level int) { func dumpIndent(w io.Writer, level int) {
@ -153,7 +165,7 @@ func TestParser(t *testing.T) {
{"tests1.dat", -1}, {"tests1.dat", -1},
{"tests2.dat", -1}, {"tests2.dat", -1},
{"tests3.dat", -1}, {"tests3.dat", -1},
// tests4.dat is fragment cases. {"tests4.dat", -1},
{"tests5.dat", -1}, {"tests5.dat", -1},
} }
for _, tf := range testFiles { for _, tf := range testFiles {
@ -164,17 +176,37 @@ func TestParser(t *testing.T) {
defer f.Close() defer f.Close()
r := bufio.NewReader(f) r := bufio.NewReader(f)
for i := 0; i != tf.n; i++ { for i := 0; i != tf.n; i++ {
text, want, err := readParseTest(r) text, want, context, err := readParseTest(r)
if err == io.EOF && tf.n == -1 { if err == io.EOF && tf.n == -1 {
break break
} }
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
doc, err := Parse(strings.NewReader(text))
if err != nil { var doc *Node
t.Fatal(err) if context == "" {
doc, err = Parse(strings.NewReader(text))
if err != nil {
t.Fatal(err)
}
} else {
contextNode := &Node{
Type: ElementNode,
Data: context,
}
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
if err != nil {
t.Fatal(err)
}
doc = &Node{
Type: DocumentNode,
}
for _, n := range nodes {
doc.Add(n)
}
} }
got, err := dump(doc) got, err := dump(doc)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
@ -184,7 +216,7 @@ func TestParser(t *testing.T) {
t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want) t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want)
continue continue
} }
if renderTestBlacklist[text] { if renderTestBlacklist[text] || context != "" {
continue continue
} }
// Check that rendering and re-parsing results in an identical tree. // Check that rendering and re-parsing results in an identical tree.