mirror of
https://github.com/golang/go
synced 2024-11-25 01:08:02 -07:00
html: parse raw text and RCDATA elements, such as <script> and <title>.
Pass tests1.dat, test 26:
#data
<script><div></script></div><title><p></title><p><p>
#document
| <html>
|   <head>
|     <script>
|       "<div>"
|     <title>
|       "<p>"
|   <body>
|     <p>
|     <p>

Thanks to Andy Balholm for driving this change.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5301042
This commit is contained in:
parent
78ad19f214
commit
b1fd528db5
@ -29,6 +29,9 @@ type parser struct {
|
|||||||
head, form *Node
|
head, form *Node
|
||||||
// Other parsing state flags (section 11.2.3.5).
|
// Other parsing state flags (section 11.2.3.5).
|
||||||
scripting, framesetOK bool
|
scripting, framesetOK bool
|
||||||
|
// originalIM is the insertion mode to go back to after completing a text
|
||||||
|
// or inTableText insertion mode.
|
||||||
|
originalIM insertionMode
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *parser) top() *Node {
|
func (p *parser) top() *Node {
|
||||||
@ -214,12 +217,23 @@ type insertionMode func(*parser) (insertionMode, bool)
|
|||||||
// Section 11.2.3.1, "using the rules for".
|
// Section 11.2.3.1, "using the rules for".
|
||||||
func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) {
|
func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) {
|
||||||
im, consumed := delegate(p)
|
im, consumed := delegate(p)
|
||||||
|
// TODO: do we need to update p.originalIM if it equals delegate?
|
||||||
if im != delegate {
|
if im != delegate {
|
||||||
return im, consumed
|
return im, consumed
|
||||||
}
|
}
|
||||||
return actual, consumed
|
return actual, consumed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// setOriginalIM sets the insertion mode to return to after completing a text or
|
||||||
|
// inTableText insertion mode.
|
||||||
|
// Section 11.2.3.1, "using the rules for".
|
||||||
|
func (p *parser) setOriginalIM(im insertionMode) {
|
||||||
|
if p.originalIM != nil {
|
||||||
|
panic("html: bad parser state: originalIM was set twice")
|
||||||
|
}
|
||||||
|
p.originalIM = im
|
||||||
|
}
|
||||||
|
|
||||||
// Section 11.2.5.4.1.
|
// Section 11.2.5.4.1.
|
||||||
func initialIM(p *parser) (insertionMode, bool) {
|
func initialIM(p *parser) (insertionMode, bool) {
|
||||||
if p.tok.Type == DoctypeToken {
|
if p.tok.Type == DoctypeToken {
|
||||||
@ -318,8 +332,10 @@ func inHeadIM(p *parser) (insertionMode, bool) {
|
|||||||
switch p.tok.Data {
|
switch p.tok.Data {
|
||||||
case "meta":
|
case "meta":
|
||||||
// TODO.
|
// TODO.
|
||||||
case "script":
|
case "script", "title":
|
||||||
// TODO.
|
p.addElement(p.tok.Data, p.tok.Attr)
|
||||||
|
p.setOriginalIM(inHeadIM)
|
||||||
|
return textIM, true
|
||||||
default:
|
default:
|
||||||
implied = true
|
implied = true
|
||||||
}
|
}
|
||||||
@ -574,6 +590,20 @@ func (p *parser) inBodyEndTagFormatting(tag string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Section 11.2.5.4.8.
|
||||||
|
func textIM(p *parser) (insertionMode, bool) {
|
||||||
|
switch p.tok.Type {
|
||||||
|
case TextToken:
|
||||||
|
p.addText(p.tok.Data)
|
||||||
|
return textIM, true
|
||||||
|
case EndTagToken:
|
||||||
|
p.oe.pop()
|
||||||
|
}
|
||||||
|
o := p.originalIM
|
||||||
|
p.originalIM = nil
|
||||||
|
return o, p.tok.Type == EndTagToken
|
||||||
|
}
|
||||||
|
|
||||||
// Section 11.2.5.4.9.
|
// Section 11.2.5.4.9.
|
||||||
func inTableIM(p *parser) (insertionMode, bool) {
|
func inTableIM(p *parser) (insertionMode, bool) {
|
||||||
var (
|
var (
|
||||||
|
@ -80,13 +80,13 @@ func dumpLevel(w io.Writer, n *Node, level int) os.Error {
|
|||||||
case DocumentNode:
|
case DocumentNode:
|
||||||
return os.NewError("unexpected DocumentNode")
|
return os.NewError("unexpected DocumentNode")
|
||||||
case ElementNode:
|
case ElementNode:
|
||||||
fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
|
fmt.Fprintf(w, "<%s>", n.Data)
|
||||||
case TextNode:
|
case TextNode:
|
||||||
fmt.Fprintf(w, "%q", EscapeString(n.Data))
|
fmt.Fprintf(w, "%q", n.Data)
|
||||||
case CommentNode:
|
case CommentNode:
|
||||||
return os.NewError("COMMENT")
|
return os.NewError("COMMENT")
|
||||||
case DoctypeNode:
|
case DoctypeNode:
|
||||||
fmt.Fprintf(w, "<!DOCTYPE %s>", EscapeString(n.Data))
|
fmt.Fprintf(w, "<!DOCTYPE %s>", n.Data)
|
||||||
case scopeMarkerNode:
|
case scopeMarkerNode:
|
||||||
return os.NewError("unexpected scopeMarkerNode")
|
return os.NewError("unexpected scopeMarkerNode")
|
||||||
default:
|
default:
|
||||||
@ -123,7 +123,7 @@ func TestParser(t *testing.T) {
|
|||||||
rc := make(chan io.Reader)
|
rc := make(chan io.Reader)
|
||||||
go readDat(filename, rc)
|
go readDat(filename, rc)
|
||||||
// TODO(nigeltao): Process all test cases, not just a subset.
|
// TODO(nigeltao): Process all test cases, not just a subset.
|
||||||
for i := 0; i < 26; i++ {
|
for i := 0; i < 27; i++ {
|
||||||
// Parse the #data section.
|
// Parse the #data section.
|
||||||
b, err := ioutil.ReadAll(<-rc)
|
b, err := ioutil.ReadAll(<-rc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -74,17 +74,6 @@ func render(w writer, n *Node) os.Error {
|
|||||||
return os.NewError("html: unknown node type")
|
return os.NewError("html: unknown node type")
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: figure out what to do with <script>, <style>, <noembed>,
|
|
||||||
// <noframes> and <noscript> elements. A tentative plan:
|
|
||||||
// 1. render the <xxx> opening tag as normal.
|
|
||||||
// 2. maybe error out if any child is not a text node.
|
|
||||||
// 3. render the text nodes (without escaping??).
|
|
||||||
// 4. maybe error out if `</xxx` is a case-insensitive substring of the
|
|
||||||
// concatenation of the children's data.
|
|
||||||
// 5. maybe error out if the concatenation of the children's data contains an
|
|
||||||
// unbalanced escaping text span start ("<!--") not followed by an end ("-->").
|
|
||||||
// 6. render the closing tag as normal.
|
|
||||||
|
|
||||||
// Render the <xxx> opening tag.
|
// Render the <xxx> opening tag.
|
||||||
if err := w.WriteByte('<'); err != nil {
|
if err := w.WriteByte('<'); err != nil {
|
||||||
return err
|
return err
|
||||||
@ -121,9 +110,30 @@ func render(w writer, n *Node) os.Error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Render any child nodes.
|
// Render any child nodes.
|
||||||
for _, c := range n.Child {
|
switch n.Data {
|
||||||
if err := render(w, c); err != nil {
|
case "noembed", "noframes", "noscript", "script", "style":
|
||||||
return err
|
for _, c := range n.Child {
|
||||||
|
if c.Type != TextNode {
|
||||||
|
return fmt.Errorf("html: raw text element <%s> has non-text child node", n.Data)
|
||||||
|
}
|
||||||
|
if _, err := w.WriteString(c.Data); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "textarea", "title":
|
||||||
|
for _, c := range n.Child {
|
||||||
|
if c.Type != TextNode {
|
||||||
|
return fmt.Errorf("html: RCDATA element <%s> has non-text child node", n.Data)
|
||||||
|
}
|
||||||
|
if err := render(w, c); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
for _, c := range n.Child {
|
||||||
|
if err := render(w, c); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
// A TokenType is the type of a Token.
|
// A TokenType is the type of a Token.
|
||||||
@ -144,6 +145,13 @@ type Tokenizer struct {
|
|||||||
pendingAttr [2]span
|
pendingAttr [2]span
|
||||||
attr [][2]span
|
attr [][2]span
|
||||||
nAttrReturned int
|
nAttrReturned int
|
||||||
|
// rawTag is the "script" in "</script>" that closes the next token. If
|
||||||
|
// non-empty, the subsequent call to Next will return a raw or RCDATA text
|
||||||
|
// token: one that treats "<p>" as text instead of an element.
|
||||||
|
// rawTag's contents are lower-cased.
|
||||||
|
rawTag string
|
||||||
|
// textIsRaw is whether the current text token's data is not escaped.
|
||||||
|
textIsRaw bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// Error returns the error associated with the most recent ErrorToken token.
|
// Error returns the error associated with the most recent ErrorToken token.
|
||||||
@ -225,6 +233,54 @@ func (z *Tokenizer) skipWhiteSpace() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
|
||||||
|
// is typically something like "script" or "textarea".
|
||||||
|
func (z *Tokenizer) readRawOrRCDATA() {
|
||||||
|
loop:
|
||||||
|
for {
|
||||||
|
c := z.readByte()
|
||||||
|
if z.err != nil {
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
if c != '<' {
|
||||||
|
continue loop
|
||||||
|
}
|
||||||
|
c = z.readByte()
|
||||||
|
if z.err != nil {
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
if c != '/' {
|
||||||
|
continue loop
|
||||||
|
}
|
||||||
|
for i := 0; i < len(z.rawTag); i++ {
|
||||||
|
c = z.readByte()
|
||||||
|
if z.err != nil {
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
|
||||||
|
continue loop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
c = z.readByte()
|
||||||
|
if z.err != nil {
|
||||||
|
break loop
|
||||||
|
}
|
||||||
|
switch c {
|
||||||
|
case ' ', '\n', '\r', '\t', '\f', '/', '>':
|
||||||
|
// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
|
||||||
|
z.raw.end -= 3 + len(z.rawTag)
|
||||||
|
break loop
|
||||||
|
case '<':
|
||||||
|
// Step back one, to catch "</foo</foo>".
|
||||||
|
z.raw.end--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
z.data.end = z.raw.end
|
||||||
|
// A textarea's or title's RCDATA can contain escaped entities.
|
||||||
|
z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
|
||||||
|
z.rawTag = ""
|
||||||
|
}
|
||||||
|
|
||||||
// readComment reads the next comment token starting with "<!--". The opening
|
// readComment reads the next comment token starting with "<!--". The opening
|
||||||
// "<!--" has already been consumed.
|
// "<!--" has already been consumed.
|
||||||
func (z *Tokenizer) readComment() {
|
func (z *Tokenizer) readComment() {
|
||||||
@ -350,6 +406,19 @@ func (z *Tokenizer) readStartTag() TokenType {
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Any "<noembed>", "<noframes>", "<noscript>", "<script>", "<style>",
|
||||||
|
// "<textarea>" or "<title>" tag flags the tokenizer's next token as raw.
|
||||||
|
// The tag name lengths of these special cases are in the range [5, 8].
|
||||||
|
if x := z.data.end - z.data.start; 5 <= x && x <= 8 {
|
||||||
|
switch z.buf[z.data.start] {
|
||||||
|
case 'n', 's', 't', 'N', 'S', 'T':
|
||||||
|
switch s := strings.ToLower(string(z.buf[z.data.start:z.data.end])); s {
|
||||||
|
case "noembed", "noframes", "noscript", "script", "style", "textarea", "title":
|
||||||
|
z.rawTag = s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Look for a self-closing token like "<br/>".
|
||||||
if z.err == nil && z.buf[z.raw.end-2] == '/' {
|
if z.err == nil && z.buf[z.raw.end-2] == '/' {
|
||||||
return SelfClosingTagToken
|
return SelfClosingTagToken
|
||||||
}
|
}
|
||||||
@ -485,6 +554,11 @@ func (z *Tokenizer) next() TokenType {
|
|||||||
z.raw.start = z.raw.end
|
z.raw.start = z.raw.end
|
||||||
z.data.start = z.raw.end
|
z.data.start = z.raw.end
|
||||||
z.data.end = z.raw.end
|
z.data.end = z.raw.end
|
||||||
|
if z.rawTag != "" {
|
||||||
|
z.readRawOrRCDATA()
|
||||||
|
return TextToken
|
||||||
|
}
|
||||||
|
z.textIsRaw = false
|
||||||
|
|
||||||
loop:
|
loop:
|
||||||
for {
|
for {
|
||||||
@ -591,7 +665,10 @@ func (z *Tokenizer) Text() []byte {
|
|||||||
s := z.buf[z.data.start:z.data.end]
|
s := z.buf[z.data.start:z.data.end]
|
||||||
z.data.start = z.raw.end
|
z.data.start = z.raw.end
|
||||||
z.data.end = z.raw.end
|
z.data.end = z.raw.end
|
||||||
return unescape(s)
|
if !z.textIsRaw {
|
||||||
|
s = unescape(s)
|
||||||
|
}
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -174,6 +174,77 @@ var tokenTests = []tokenTest{
|
|||||||
`<p id="0"</p>`,
|
`<p id="0"</p>`,
|
||||||
`<p id="0" <="" p="">`,
|
`<p id="0" <="" p="">`,
|
||||||
},
|
},
|
||||||
|
// Raw text and RCDATA.
|
||||||
|
{
|
||||||
|
"basic raw text",
|
||||||
|
"<script><a></b></script>",
|
||||||
|
"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"unfinished script end tag",
|
||||||
|
"<SCRIPT>a</SCR",
|
||||||
|
"<script>$a&lt;/SCR",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"broken script end tag",
|
||||||
|
"<SCRIPT>a</SCR ipt>",
|
||||||
|
"<script>$a&lt;/SCR ipt&gt;",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"EOF in script end tag",
|
||||||
|
"<SCRIPT>a</SCRipt",
|
||||||
|
"<script>$a&lt;/SCRipt",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"scriptx end tag",
|
||||||
|
"<SCRIPT>a</SCRiptx",
|
||||||
|
"<script>$a&lt;/SCRiptx",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"' ' completes script end tag",
|
||||||
|
"<SCRIPT>a</SCRipt ",
|
||||||
|
"<script>$a$</script>",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"'>' completes script end tag",
|
||||||
|
"<SCRIPT>a</SCRipt>",
|
||||||
|
"<script>$a$</script>",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self-closing script end tag",
|
||||||
|
"<SCRIPT>a</SCRipt/>",
|
||||||
|
"<script>$a$</script>",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"nested script tag",
|
||||||
|
"<SCRIPT>a</SCRipt<script>",
|
||||||
|
"<script>$a&lt;/SCRipt&lt;script&gt;",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"script end tag after unfinished",
|
||||||
|
"<SCRIPT>a</SCRipt</script>",
|
||||||
|
"<script>$a&lt;/SCRipt$</script>",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"script/style mismatched tags",
|
||||||
|
"<script>a</style>",
|
||||||
|
"<script>$a&lt;/style&gt;",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"style element with entity",
|
||||||
|
"<style>&apos;",
|
||||||
|
"<style>$&amp;apos;",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"textarea with tag",
|
||||||
|
"<textarea><div></textarea>",
|
||||||
|
"<textarea>$&lt;div&gt;$</textarea>",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title with tag and entity",
|
||||||
|
"<title><b>K&amp;R C</b></title>",
|
||||||
|
"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
|
||||||
|
},
|
||||||
// DOCTYPE tests.
|
// DOCTYPE tests.
|
||||||
{
|
{
|
||||||
"Proper DOCTYPE",
|
"Proper DOCTYPE",
|
||||||
|
Loading…
Reference in New Issue
Block a user