diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index acc3eccbcc7..feef431eb1c 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -53,15 +53,6 @@ type parser struct { scripting, framesetOK bool } -// pop pops the top of the stack of open elements. -// It will panic if the stack is empty. -func (p *parser) pop() *Node { - n := len(p.stack) - ret := p.stack[n-1] - p.stack = p.stack[:n-1] - return ret -} - // push pushes onto the stack of open elements. func (p *parser) push(n *Node) { p.stack = append(p.stack, n) @@ -76,8 +67,60 @@ func (p *parser) top() *Node { return p.doc } -// addChild adds a child node n to the top element, and pushes n -// if it is an element node (text nodes do not have children). +// pop pops the top of the stack of open elements. +// It will panic if the stack is empty. +func (p *parser) pop() *Node { + n := len(p.stack) + ret := p.stack[n-1] + p.stack = p.stack[:n-1] + return ret +} + +// stopTags for use in popUntil. These come from section 10.2.3.2. +var ( + defaultScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object"} + listItemScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "ol", "ul"} + buttonScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "button"} + tableScopeStopTags = []string{"html", "table"} +) + +// popUntil pops the stack of open elements at the highest element whose tag +// is in matchTags, provided there is no higher element in stopTags. It returns +// whether or not there was such an element. If there was not, popUntil leaves +// the stack unchanged. +// +// For example, if the stack was: +// ["html", "body", "font", "table", "b", "i", "u"] +// then popUntil([]string{"html, "table"}, "font") would return false, but +// popUntil([]string{"html, "table"}, "i") would return true and the resultant +// stack would be: +// ["html", "body", "font", "table", "b"] +// +// If an element's tag is in both stopTags and matchTags, then the stack will +// be popped and the function returns true (provided, of course, there was no +// higher element in the stack that was also in stopTags). For example, +// popUntil([]string{"html, "table"}, "table") would return true and leave: +// ["html", "body", "font"] +func (p *parser) popUntil(stopTags []string, matchTags ...string) bool { + for i := len(p.stack) - 1; i >= 0; i-- { + tag := p.stack[i].Data + for _, t := range matchTags { + if t == tag { + p.stack = p.stack[:i] + return true + } + } + for _, t := range stopTags { + if t == tag { + return false + } + } + } + return false +} + +// addChild adds a child node n to the top element, and pushes n if it is an +// element node (text nodes are not part of the stack of open elements). func (p *parser) addChild(n *Node) { m := p.top() m.Child = append(m.Child, n) @@ -86,19 +129,28 @@ func (p *parser) addChild(n *Node) { } } -// addText adds text to the current node. -func (p *parser) addText(s string) { - // TODO(nigeltao): merge s with previous text, if the preceding node is a text node. - // TODO(nigeltao): distinguish whitespace text from others. +// addText calls addChild with a text node. +func (p *parser) addText(text string) { + // TODO: merge s with previous text, if the preceding node is a text node. + // TODO: distinguish whitespace text from others. p.addChild(&Node{ Type: TextNode, - Data: s, + Data: text, + }) +} + +// addElement calls addChild with an element node. +func (p *parser) addElement(tag string, attr []Attribute) { + p.addChild(&Node{ + Type: ElementNode, + Data: tag, + Attr: attr, }) } // Section 10.2.3.3. -func (p *parser) addFormattingElement(n *Node) { - p.addChild(n) +func (p *parser) addFormattingElement(tag string, attr []Attribute) { + p.addElement(tag, attr) // TODO. } @@ -140,14 +192,25 @@ func (p *parser) acknowledgeSelfClosingTag() { // the token was consumed. type insertionMode func(*parser) (insertionMode, bool) +// useTheRulesFor runs the delegate insertionMode over p, returning the actual +// insertionMode unless the delegate caused a state transition. +// Section 10.2.3.1, "using the rules for". +func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) { + im, consumed := delegate(p) + if im != delegate { + return im, consumed + } + return actual, consumed +} + // Section 10.2.5.4. -func initialInsertionMode(p *parser) (insertionMode, bool) { - // TODO(nigeltao): check p.tok for DOCTYPE. - return beforeHTMLInsertionMode, false +func initialIM(p *parser) (insertionMode, bool) { + // TODO: check p.tok for DOCTYPE. + return beforeHTMLIM, false } // Section 10.2.5.5. -func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) { +func beforeHTMLIM(p *parser) (insertionMode, bool) { var ( add bool attr []Attribute @@ -157,7 +220,7 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) { case ErrorToken: implied = true case TextToken: - // TODO(nigeltao): distinguish whitespace text from others. + // TODO: distinguish whitespace text from others. implied = true case StartTagToken: if p.tok.Data == "html" { @@ -175,17 +238,13 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) { } } if add || implied { - p.addChild(&Node{ - Type: ElementNode, - Data: "html", - Attr: attr, - }) + p.addElement("html", attr) } - return beforeHeadInsertionMode, !implied + return beforeHeadIM, !implied } // Section 10.2.5.6. -func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { +func beforeHeadIM(p *parser) (insertionMode, bool) { var ( add bool attr []Attribute @@ -195,7 +254,7 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { case ErrorToken: implied = true case TextToken: - // TODO(nigeltao): distinguish whitespace text from others. + // TODO: distinguish whitespace text from others. implied = true case StartTagToken: switch p.tok.Data { @@ -203,7 +262,7 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { add = true attr = p.tok.Attr case "html": - return inBodyInsertionMode, false + return useTheRulesFor(p, beforeHeadIM, inBodyIM) default: implied = true } @@ -216,17 +275,13 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) { } } if add || implied { - p.addChild(&Node{ - Type: ElementNode, - Data: "head", - Attr: attr, - }) + p.addElement("head", attr) } - return inHeadInsertionMode, !implied + return inHeadIM, !implied } // Section 10.2.5.7. -func inHeadInsertionMode(p *parser) (insertionMode, bool) { +func inHeadIM(p *parser) (insertionMode, bool) { var ( pop bool implied bool @@ -254,13 +309,13 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) { if n.Data != "head" { panic("html: bad parser state") } - return afterHeadInsertionMode, !implied + return afterHeadIM, !implied } - return inHeadInsertionMode, !implied + return inHeadIM, !implied } // Section 10.2.5.9. -func afterHeadInsertionMode(p *parser) (insertionMode, bool) { +func afterHeadIM(p *parser) (insertionMode, bool) { var ( add bool attr []Attribute @@ -293,63 +348,46 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) { // TODO. } if add || implied { - p.addChild(&Node{ - Type: ElementNode, - Data: "body", - Attr: attr, - }) + p.addElement("body", attr) p.framesetOK = framesetOK } - return inBodyInsertionMode, !implied + return inBodyIM, !implied } // Section 10.2.5.10. -func inBodyInsertionMode(p *parser) (insertionMode, bool) { +func inBodyIM(p *parser) (insertionMode, bool) { var endP bool switch p.tok.Type { - case ErrorToken: - // No-op. case TextToken: p.addText(p.tok.Data) p.framesetOK = false case StartTagToken: switch p.tok.Data { case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul": - // TODO(nigeltao): Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2. + // TODO: Do the proper "does the stack of open elements has a p element in button scope" algorithm in section 10.2.3.2. n := p.top() if n.Type == ElementNode && n.Data == "p" { endP = true } else { - p.addChild(&Node{ - Type: ElementNode, - Data: p.tok.Data, - Attr: p.tok.Attr, - }) + p.addElement(p.tok.Data, p.tok.Attr) } case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u": p.reconstructActiveFormattingElements() - p.addFormattingElement(&Node{ - Type: ElementNode, - Data: p.tok.Data, - Attr: p.tok.Attr, - }) + p.addFormattingElement(p.tok.Data, p.tok.Attr) case "area", "br", "embed", "img", "input", "keygen", "wbr": p.reconstructActiveFormattingElements() - p.addChild(&Node{ - Type: ElementNode, - Data: p.tok.Data, - Attr: p.tok.Attr, - }) + p.addElement(p.tok.Data, p.tok.Attr) p.pop() p.acknowledgeSelfClosingTag() p.framesetOK = false + case "table": + // TODO: auto-insert
if necessary, depending on quirks mode. + p.addElement(p.tok.Data, p.tok.Attr) + p.framesetOK = false + return inTableIM, true case "hr": - // TODO(nigeltao): auto-insert if necessary. - p.addChild(&Node{ - Type: ElementNode, - Data: p.tok.Data, - Attr: p.tok.Attr, - }) + // TODO: auto-insert if necessary. + p.addElement(p.tok.Data, p.tok.Attr) p.pop() p.acknowledgeSelfClosingTag() p.framesetOK = false @@ -359,28 +397,199 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) { case EndTagToken: switch p.tok.Data { case "body": - // TODO(nigeltao): autoclose the stack of open elements. - return afterBodyInsertionMode, true + // TODO: autoclose the stack of open elements. + return afterBodyIM, true case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u": - // TODO(nigeltao): implement the "adoption agency" algorithm: + // TODO: implement the "adoption agency" algorithm: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency - p.pop() + if p.tok.Data == p.top().Data { + p.pop() + } default: // TODO. } } if endP { - // TODO(nigeltao): do the proper algorithm. + // TODO: do the proper algorithm. n := p.pop() if n.Type != ElementNode || n.Data != "p" { panic("unreachable") } } - return inBodyInsertionMode, !endP + return inBodyIM, !endP +} + +// Section 10.2.5.12. +func inTableIM(p *parser) (insertionMode, bool) { + var ( + add bool + data string + attr []Attribute + consumed bool + ) + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return nil, true + case TextToken: + // TODO. + case StartTagToken: + switch p.tok.Data { + case "tbody", "tfoot", "thead": + add = true + data = p.tok.Data + attr = p.tok.Attr + consumed = true + case "td", "th", "tr": + add = true + data = "tbody" + default: + // TODO. + } + case EndTagToken: + switch p.tok.Data { + case "table": + if p.popUntil(tableScopeStopTags, "table") { + // TODO: "reset the insertion mode appropriately" as per 10.2.3.1. + return inBodyIM, false + } + // Ignore the token. + return inTableIM, true + case "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr": + // Ignore the token. + return inTableIM, true + } + } + if add { + // TODO: clear the stack back to a table context. + p.addElement(data, attr) + return inTableBodyIM, consumed + } + // TODO: return useTheRulesFor(inTableIM, inBodyIM, p) unless etc. etc. foster parenting. + return inTableIM, true +} + +// Section 10.2.5.16. +func inTableBodyIM(p *parser) (insertionMode, bool) { + var ( + add bool + data string + attr []Attribute + consumed bool + ) + switch p.tok.Type { + case ErrorToken: + // TODO. + case TextToken: + // TODO. + case StartTagToken: + switch p.tok.Data { + case "tr": + add = true + data = p.tok.Data + attr = p.tok.Attr + consumed = true + case "td", "th": + add = true + data = "tr" + consumed = false + default: + // TODO. + } + case EndTagToken: + switch p.tok.Data { + case "table": + if p.popUntil(tableScopeStopTags, "tbody", "thead", "tfoot") { + return inTableIM, false + } + // Ignore the token. + return inTableBodyIM, true + case "body", "caption", "col", "colgroup", "html", "td", "th", "tr": + // Ignore the token. + return inTableBodyIM, true + } + } + if add { + // TODO: clear the stack back to a table body context. + p.addElement(data, attr) + return inRowIM, consumed + } + return useTheRulesFor(p, inTableBodyIM, inTableIM) +} + +// Section 10.2.5.17. +func inRowIM(p *parser) (insertionMode, bool) { + switch p.tok.Type { + case ErrorToken: + // TODO. + case TextToken: + // TODO. + case StartTagToken: + switch p.tok.Data { + case "td", "th": + // TODO: clear the stack back to a table row context. + p.addElement(p.tok.Data, p.tok.Attr) + // TODO: insert a marker at the end of the list of active formatting elements. + return inCellIM, true + default: + // TODO. + } + case EndTagToken: + switch p.tok.Data { + case "tr": + // TODO. + case "table": + if p.popUntil(tableScopeStopTags, "tr") { + return inTableBodyIM, false + } + // Ignore the token. + return inRowIM, true + case "tbody", "tfoot", "thead": + // TODO. + case "body", "caption", "col", "colgroup", "html", "td", "th": + // Ignore the token. + return inRowIM, true + default: + // TODO. + } + } + return useTheRulesFor(p, inRowIM, inTableIM) +} + +// Section 10.2.5.18. +func inCellIM(p *parser) (insertionMode, bool) { + var ( + closeTheCellAndReprocess bool + ) + switch p.tok.Type { + case StartTagToken: + switch p.tok.Data { + case "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr": + // TODO: check for "td" or "th" in table scope. + closeTheCellAndReprocess = true + } + case EndTagToken: + switch p.tok.Data { + case "td", "th": + // TODO. + case "body", "caption", "col", "colgroup", "html": + // TODO. + case "table", "tbody", "tfoot", "thead", "tr": + // TODO: check for matching element in table scope. + closeTheCellAndReprocess = true + } + } + if closeTheCellAndReprocess { + if p.popUntil(tableScopeStopTags, "td") || p.popUntil(tableScopeStopTags, "th") { + // TODO: clear the list of active formatting elements up to the last marker. + return inRowIM, false + } + } + return useTheRulesFor(p, inCellIM, inBodyIM) } // Section 10.2.5.22. -func afterBodyInsertionMode(p *parser) (insertionMode, bool) { +func afterBodyIM(p *parser) (insertionMode, bool) { switch p.tok.Type { case ErrorToken: // TODO. @@ -391,18 +600,29 @@ func afterBodyInsertionMode(p *parser) (insertionMode, bool) { case EndTagToken: switch p.tok.Data { case "html": - // TODO(nigeltao): autoclose the stack of open elements. - return afterAfterBodyInsertionMode, true + // TODO: autoclose the stack of open elements. + return afterAfterBodyIM, true default: // TODO. } } - return afterBodyInsertionMode, true + return afterBodyIM, true } // Section 10.2.5.25. -func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) { - return inBodyInsertionMode, false +func afterAfterBodyIM(p *parser) (insertionMode, bool) { + switch p.tok.Type { + case ErrorToken: + // Stop parsing. + return nil, true + case TextToken: + // TODO. + case StartTagToken: + if p.tok.Data == "html" { + return useTheRulesFor(p, afterAfterBodyIM, inBodyIM) + } + } + return inBodyIM, false } // Parse returns the parse tree for the HTML from the given Reader. @@ -417,7 +637,7 @@ func Parse(r io.Reader) (*Node, os.Error) { framesetOK: true, } // Iterate until EOF. Any other error will cause an early return. - im, consumed := initialInsertionMode, true + im, consumed := initialIM, true for { if consumed { if err := p.read(); err != nil { diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go index 839a034b7d5..dbfc57f666f 100644 --- a/src/pkg/html/parse_test.go +++ b/src/pkg/html/parse_test.go @@ -124,7 +124,7 @@ func TestParser(t *testing.T) { rc := make(chan io.Reader) go readDat(filename, rc) // TODO(nigeltao): Process all test cases, not just a subset. - for i := 0; i < 19; i++ { + for i := 0; i < 21; i++ { // Parse the #data section. doc, err := Parse(<-rc) if err != nil {