1
0
mirror of https://github.com/golang/go synced 2024-10-03 23:21:22 -06:00

html: implement fragment parsing algorithm

Pass the tests in tests4.dat.

R=nigeltao
CC=golang-dev
https://golang.org/cl/5447055
This commit is contained in:
Andrew Balholm 2011-12-01 12:47:57 +11:00 committed by Nigel Tao
parent 595efd0d20
commit ce27b00f48
2 changed files with 132 additions and 35 deletions

View File

@ -39,6 +39,9 @@ type parser struct {
fosterParenting bool
// quirks is whether the parser is operating in "quirks mode."
quirks bool
// context is the context element when parsing an HTML fragment
// (section 11.4).
context *Node
}
func (p *parser) top() *Node {
@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() {
func (p *parser) resetInsertionMode() {
for i := len(p.oe) - 1; i >= 0; i-- {
n := p.oe[i]
if i == 0 {
// TODO: set n to the context element, for HTML fragment parsing.
if i == 0 && p.context != nil {
n = p.context
}
switch n.Data {
case "select":
p.im = inSelectIM
@ -1516,6 +1520,29 @@ func afterAfterFramesetIM(p *parser) bool {
return true
}
func (p *parser) parse() error {
// Iterate until EOF. Any other error will cause an early return.
consumed := true
for {
if consumed {
if err := p.read(); err != nil {
if err == io.EOF {
break
}
return err
}
}
consumed = p.im(p)
}
// Loop until the final token (the ErrorToken signifying EOF) is consumed.
for {
if consumed = p.im(p); consumed {
break
}
}
return nil
}
// Parse returns the parse tree for the HTML from the given Reader.
// The input is assumed to be UTF-8 encoded.
func Parse(r io.Reader) (*Node, error) {
@ -1528,24 +1555,62 @@ func Parse(r io.Reader) (*Node, error) {
framesetOK: true,
im: initialIM,
}
// Iterate until EOF. Any other error will cause an early return.
consumed := true
for {
if consumed {
if err := p.read(); err != nil {
if err == io.EOF {
break
}
return nil, err
}
}
consumed = p.im(p)
}
// Loop until the final token (the ErrorToken signifying EOF) is consumed.
for {
if consumed = p.im(p); consumed {
break
}
err := p.parse()
if err != nil {
return nil, err
}
return p.doc, nil
}
// ParseFragment parses a fragment of HTML and returns the nodes that were
// found. If the fragment is the InnerHTML for an existing element, pass that
// element in context.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
p := &parser{
tokenizer: NewTokenizer(r),
doc: &Node{
Type: DocumentNode,
},
scripting: true,
context: context,
}
if context != nil {
switch context.Data {
case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
p.tokenizer.rawTag = context.Data
}
}
root := &Node{
Type: ElementNode,
Data: "html",
}
p.doc.Add(root)
p.oe = nodeStack{root}
p.resetInsertionMode()
for n := context; n != nil; n = n.Parent {
if n.Type == ElementNode && n.Data == "form" {
p.form = n
break
}
}
err := p.parse()
if err != nil {
return nil, err
}
parent := p.doc
if context != nil {
parent = root
}
result := parent.Child
parent.Child = nil
for _, n := range result {
n.Parent = nil
}
return result, nil
}

View File

@ -16,21 +16,21 @@ import (
)
// readParseTest reads a single test case from r.
func readParseTest(r *bufio.Reader) (text, want string, err error) {
func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
line, err := r.ReadSlice('\n')
if err != nil {
return "", "", err
return "", "", "", err
}
var b []byte
// Read the HTML.
if string(line) != "#data\n" {
return "", "", fmt.Errorf(`got %q want "#data\n"`, line)
return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", err
return "", "", "", err
}
if line[0] == '#' {
break
@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) {
// Skip the error list.
if string(line) != "#errors\n" {
return "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", err
return "", "", "", err
}
if line[0] == '#' {
break
}
}
if string(line) == "#document-fragment\n" {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
context = strings.TrimSpace(string(line))
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
}
// Read the dump of what the parse tree should be.
if string(line) != "#document\n" {
return "", "", fmt.Errorf(`got %q want "#document\n"`, line)
return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
}
for {
line, err = r.ReadSlice('\n')
if err != nil && err != io.EOF {
return "", "", err
return "", "", "", err
}
if len(line) == 0 || len(line) == 1 && line[0] == '\n' {
break
}
b = append(b, line...)
}
return text, string(b), nil
return text, string(b), context, nil
}
func dumpIndent(w io.Writer, level int) {
@ -153,7 +165,7 @@ func TestParser(t *testing.T) {
{"tests1.dat", -1},
{"tests2.dat", -1},
{"tests3.dat", -1},
// tests4.dat is fragment cases.
{"tests4.dat", -1},
{"tests5.dat", -1},
}
for _, tf := range testFiles {
@ -164,17 +176,37 @@ func TestParser(t *testing.T) {
defer f.Close()
r := bufio.NewReader(f)
for i := 0; i != tf.n; i++ {
text, want, err := readParseTest(r)
text, want, context, err := readParseTest(r)
if err == io.EOF && tf.n == -1 {
break
}
if err != nil {
t.Fatal(err)
}
doc, err := Parse(strings.NewReader(text))
if err != nil {
t.Fatal(err)
var doc *Node
if context == "" {
doc, err = Parse(strings.NewReader(text))
if err != nil {
t.Fatal(err)
}
} else {
contextNode := &Node{
Type: ElementNode,
Data: context,
}
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
if err != nil {
t.Fatal(err)
}
doc = &Node{
Type: DocumentNode,
}
for _, n := range nodes {
doc.Add(n)
}
}
got, err := dump(doc)
if err != nil {
t.Fatal(err)
@ -184,7 +216,7 @@ func TestParser(t *testing.T) {
t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want)
continue
}
if renderTestBlacklist[text] {
if renderTestBlacklist[text] || context != "" {
continue
}
// Check that rendering and re-parsing results in an identical tree.