From 6e318bda6c4236caf5a7f02d5ce545f5365094e0 Mon Sep 17 00:00:00 2001 From: Andrew Balholm Date: Wed, 26 Oct 2011 11:36:46 +1100 Subject: [PATCH] html: improve parsing of tables When foster parenting, merge adjacent text nodes. Properly close table row at tag. Pass tests1.dat, test 32:
helloexcite!me! | | | | | |
| "helloexcite!" | | "me!" |
please!
| | |
| | "please!" | R=nigeltao CC=golang-dev https://golang.org/cl/5323048 --- src/pkg/html/parse.go | 33 ++++++++++++++++++++++++++------- src/pkg/html/parse_test.go | 2 +- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index d1d4e483c53..292fbaf6be4 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -52,6 +52,11 @@ var ( tableScopeStopTags = []string{"html", "table"} ) +// stopTags for use in clearStackToContext. +var ( + tableRowContextStopTags = []string{"tr", "html"} +) + // popUntil pops the stack of open elements at the highest element whose tag // is in matchTags, provided there is no higher element in stopTags. It returns // whether or not there was such an element. If there was not, popUntil leaves @@ -146,6 +151,11 @@ func (p *parser) fosterParent(n *Node) { } } + if i > 0 && parent.Child[i-1].Type == TextNode && n.Type == TextNode { + parent.Child[i-1].Data += n.Data + return + } + if i == len(parent.Child) { parent.Add(n) } else { @@ -749,11 +759,11 @@ func inTableIM(p *parser) (insertionMode, bool) { case StartTagToken: switch p.tok.Data { case "tbody", "tfoot", "thead": - p.clearStackToTableContext() + p.clearStackToContext(tableScopeStopTags) p.addElement(p.tok.Data, p.tok.Attr) return inTableBodyIM, true case "td", "th", "tr": - p.clearStackToTableContext() + p.clearStackToContext(tableScopeStopTags) p.addElement("tbody", nil) return inTableBodyIM, false case "table": @@ -794,11 +804,15 @@ func inTableIM(p *parser) (insertionMode, bool) { return useTheRulesFor(p, inTableIM, inBodyIM) } -func (p *parser) clearStackToTableContext() { +// clearStackToContext pops elements off the stack of open elements +// until an element listed in stopTags is found. +func (p *parser) clearStackToContext(stopTags []string) { for i := len(p.oe) - 1; i >= 0; i-- { - if x := p.oe[i].Data; x == "table" || x == "html" { - p.oe = p.oe[:i+1] - return + for _, tag := range stopTags { + if p.oe[i].Data == tag { + p.oe = p.oe[:i+1] + return + } } } } @@ -877,7 +891,12 @@ func inRowIM(p *parser) (insertionMode, bool) { case EndTagToken: switch p.tok.Data { case "tr": - // TODO. + if !p.elementInScope(tableScopeStopTags, "tr") { + return inRowIM, true + } + p.clearStackToContext(tableRowContextStopTags) + p.oe.pop() + return inTableBodyIM, true case "table": if p.popUntil(tableScopeStopTags, "tr") { return inTableBodyIM, false diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go index beba98d3ad2..865a47dea1d 100644 --- a/src/pkg/html/parse_test.go +++ b/src/pkg/html/parse_test.go @@ -132,7 +132,7 @@ func TestParser(t *testing.T) { rc := make(chan io.Reader) go readDat(filename, rc) // TODO(nigeltao): Process all test cases, not just a subset. - for i := 0; i < 32; i++ { + for i := 0; i < 33; i++ { // Parse the #data section. b, err := ioutil.ReadAll(<-rc) if err != nil {