mirror of
https://github.com/golang/go
synced 2024-11-21 17:24:42 -07:00
html: parse raw text and RCDATA elements, such as <script> and <title>.
Pass tests1.dat, test 26:
#data
<script><div></script></div><title><p></title><p><p>
#document
| <html>
|   <head>
|     <script>
|       "<div>"
|     <title>
|       "<p>"
|   <body>
|     <p>
|     <p>

Thanks to Andy Balholm for driving this change.

R=andybalholm
CC=golang-dev
https://golang.org/cl/5301042
This commit is contained in:
parent
78ad19f214
commit
b1fd528db5
@ -29,6 +29,9 @@ type parser struct {
|
||||
head, form *Node
|
||||
// Other parsing state flags (section 11.2.3.5).
|
||||
scripting, framesetOK bool
|
||||
// originalIM is the insertion mode to go back to after completing a text
|
||||
// or inTableText insertion mode.
|
||||
originalIM insertionMode
|
||||
}
|
||||
|
||||
func (p *parser) top() *Node {
|
||||
@ -214,12 +217,23 @@ type insertionMode func(*parser) (insertionMode, bool)
|
||||
// useTheRulesFor processes the current token with the delegate insertion
// mode on behalf of the actual insertion mode.
//
// If delegate asks to stay in delegate, actual is returned instead, so the
// parser remains in the mode it was really in. Any other insertion mode
// requested by delegate is returned as-is. The bool result is delegate's
// second return value, passed through unchanged.
//
// Section 11.2.3.1, "using the rules for".
|
||||
func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) {
|
||||
im, consumed := delegate(p)
|
||||
// TODO: do we need to update p.originalIM if it equals delegate?
|
||||
if im != delegate {
|
||||
return im, consumed
|
||||
}
|
||||
return actual, consumed
|
||||
}
|
||||
|
||||
// setOriginalIM sets the insertion mode to return to after completing a text or
|
||||
// inTableText insertion mode.
|
||||
// Section 11.2.3.1, "using the rules for".
// It panics if p.originalIM is already non-nil, that is, if an original
// insertion mode was recorded and has not yet been cleared.
|
||||
func (p *parser) setOriginalIM(im insertionMode) {
|
||||
// A non-nil originalIM means a previous value is still pending; overwriting
// it would lose the mode to return to, so treat it as a parser bug.
if p.originalIM != nil {
|
||||
panic("html: bad parser state: originalIM was set twice")
|
||||
}
|
||||
p.originalIM = im
|
||||
}
|
||||
|
||||
// Section 11.2.5.4.1.
|
||||
func initialIM(p *parser) (insertionMode, bool) {
|
||||
if p.tok.Type == DoctypeToken {
|
||||
@ -318,8 +332,10 @@ func inHeadIM(p *parser) (insertionMode, bool) {
|
||||
switch p.tok.Data {
|
||||
case "meta":
|
||||
// TODO.
|
||||
case "script":
|
||||
// TODO.
|
||||
case "script", "title":
|
||||
p.addElement(p.tok.Data, p.tok.Attr)
|
||||
p.setOriginalIM(inHeadIM)
|
||||
return textIM, true
|
||||
default:
|
||||
implied = true
|
||||
}
|
||||
@ -574,6 +590,20 @@ func (p *parser) inBodyEndTagFormatting(tag string) {
|
||||
}
|
||||
}
|
||||
|
||||
// textIM is the "text" insertion mode.
//
// A text token is added via p.addText and the parser stays in textIM. For any
// other token the parser leaves this mode: the saved p.originalIM is cleared
// and returned as the next insertion mode. An end tag additionally pops p.oe
// (presumably the stack of open elements) and is the only non-text token for
// which the bool result is true.
//
// Section 11.2.5.4.8.
|
||||
func textIM(p *parser) (insertionMode, bool) {
|
||||
switch p.tok.Type {
|
||||
case TextToken:
|
||||
p.addText(p.tok.Data)
|
||||
return textIM, true
|
||||
case EndTagToken:
|
||||
p.oe.pop()
|
||||
}
|
||||
// Reset originalIM to nil before returning it, so that a later call to
// setOriginalIM does not see a stale value and panic.
o := p.originalIM
|
||||
p.originalIM = nil
|
||||
return o, p.tok.Type == EndTagToken
|
||||
}
|
||||
|
||||
// Section 11.2.5.4.9.
|
||||
func inTableIM(p *parser) (insertionMode, bool) {
|
||||
var (
|
||||
|
@ -80,13 +80,13 @@ func dumpLevel(w io.Writer, n *Node, level int) os.Error {
|
||||
case DocumentNode:
|
||||
return os.NewError("unexpected DocumentNode")
|
||||
case ElementNode:
|
||||
fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
|
||||
fmt.Fprintf(w, "<%s>", n.Data)
|
||||
case TextNode:
|
||||
fmt.Fprintf(w, "%q", EscapeString(n.Data))
|
||||
fmt.Fprintf(w, "%q", n.Data)
|
||||
case CommentNode:
|
||||
return os.NewError("COMMENT")
|
||||
case DoctypeNode:
|
||||
fmt.Fprintf(w, "<!DOCTYPE %s>", EscapeString(n.Data))
|
||||
fmt.Fprintf(w, "<!DOCTYPE %s>", n.Data)
|
||||
case scopeMarkerNode:
|
||||
return os.NewError("unexpected scopeMarkerNode")
|
||||
default:
|
||||
@ -123,7 +123,7 @@ func TestParser(t *testing.T) {
|
||||
rc := make(chan io.Reader)
|
||||
go readDat(filename, rc)
|
||||
// TODO(nigeltao): Process all test cases, not just a subset.
|
||||
for i := 0; i < 26; i++ {
|
||||
for i := 0; i < 27; i++ {
|
||||
// Parse the #data section.
|
||||
b, err := ioutil.ReadAll(<-rc)
|
||||
if err != nil {
|
||||
|
@ -74,17 +74,6 @@ func render(w writer, n *Node) os.Error {
|
||||
return os.NewError("html: unknown node type")
|
||||
}
|
||||
|
||||
// TODO: figure out what to do with <script>, <style>, <noembed>,
|
||||
// <noframes> and <noscript> elements. A tentative plan:
|
||||
// 1. render the <xxx> opening tag as normal.
|
||||
// 2. maybe error out if any child is not a text node.
|
||||
// 3. render the text nodes (without escaping??).
|
||||
// 4. maybe error out if `</xxx` is a case-insensitive substring of the
|
||||
// concatenation of the children's data.
|
||||
// 5. maybe error out if the concatenation of the children's data contains an
|
||||
// unbalanced escaping text span start ("<!--") not followed by an end ("-->").
|
||||
// 6. render the closing tag as normal.
|
||||
|
||||
// Render the <xxx> opening tag.
|
||||
if err := w.WriteByte('<'); err != nil {
|
||||
return err
|
||||
@ -121,9 +110,30 @@ func render(w writer, n *Node) os.Error {
|
||||
}
|
||||
|
||||
// Render any child nodes.
|
||||
for _, c := range n.Child {
|
||||
if err := render(w, c); err != nil {
|
||||
return err
|
||||
switch n.Data {
|
||||
case "noembed", "noframes", "noscript", "script", "style":
|
||||
for _, c := range n.Child {
|
||||
if c.Type != TextNode {
|
||||
return fmt.Errorf("html: raw text element <%s> has non-text child node", n.Data)
|
||||
}
|
||||
if _, err := w.WriteString(c.Data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
case "textarea", "title":
|
||||
for _, c := range n.Child {
|
||||
if c.Type != TextNode {
|
||||
return fmt.Errorf("html: RCDATA element <%s> has non-text child node", n.Data)
|
||||
}
|
||||
if err := render(w, c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
default:
|
||||
for _, c := range n.Child {
|
||||
if err := render(w, c); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// A TokenType is the type of a Token.
|
||||
@ -144,6 +145,13 @@ type Tokenizer struct {
|
||||
pendingAttr [2]span
|
||||
attr [][2]span
|
||||
nAttrReturned int
|
||||
// rawTag is the "script" in "</script>" that closes the next token. If
|
||||
// non-empty, the subsequent call to Next will return a raw or RCDATA text
|
||||
// token: one that treats "<p>" as text instead of an element.
|
||||
// rawTag's contents are lower-cased.
|
||||
rawTag string
|
||||
// textIsRaw is whether the current text token's data is not escaped.
|
||||
textIsRaw bool
|
||||
}
|
||||
|
||||
// Error returns the error associated with the most recent ErrorToken token.
|
||||
@ -225,6 +233,54 @@ func (z *Tokenizer) skipWhiteSpace() {
|
||||
}
|
||||
}
|
||||
|
||||
// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
|
||||
// is typically something like "script" or "textarea".
//
// The tag name is matched without regard to ASCII case, and it must be
// followed by white space, '/' or '>' to count as an end tag. When such an end
// tag is found, z.raw.end is moved back to just before its "</", so the text
// token stops there. If a read error occurs first, the token covers everything
// read so far.
//
// On return, z.data.end equals z.raw.end, z.textIsRaw records whether the text
// is raw (true for every tag except "textarea" and "title"), and z.rawTag is
// reset to "".
|
||||
func (z *Tokenizer) readRawOrRCDATA() {
|
||||
loop:
|
||||
for {
|
||||
c := z.readByte()
|
||||
if z.err != nil {
|
||||
break loop
|
||||
}
|
||||
if c != '<' {
|
||||
continue loop
|
||||
}
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
break loop
|
||||
}
// NOTE(review): the byte just read is not stepped back on a mismatch, so if
// it is itself a '<' (as in "<</foo>") the end tag that follows looks like it
// would be missed. Confirm against readByte's behavior.
if c != '/' {
|
||||
continue loop
|
||||
}
|
||||
for i := 0; i < len(z.rawTag); i++ {
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
break loop
|
||||
}
// z.rawTag is lower-cased, so subtracting 'a'-'A' yields the upper-case
// letter; either case is accepted.
// NOTE(review): as above, a mismatching '<' is not stepped back here, so
// "</fo</foo>" looks like it would not be recognized. Confirm.
if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
|
||||
continue loop
|
||||
}
|
||||
}
|
||||
c = z.readByte()
|
||||
if z.err != nil {
|
||||
break loop
|
||||
}
|
||||
switch c {
|
||||
case ' ', '\n', '\r', '\t', '\f', '/', '>':
|
||||
// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
|
||||
z.raw.end -= 3 + len(z.rawTag)
|
||||
break loop
|
||||
case '<':
|
||||
// Step back one, to catch "</foo</foo>".
|
||||
z.raw.end--
|
||||
}
|
||||
}
|
||||
z.data.end = z.raw.end
|
||||
// A textarea's or title's RCDATA can contain escaped entities.
|
||||
z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
|
||||
z.rawTag = ""
|
||||
}
|
||||
|
||||
// readComment reads the next comment token starting with "<!--". The opening
|
||||
// "<!--" has already been consumed.
|
||||
func (z *Tokenizer) readComment() {
|
||||
@ -350,6 +406,19 @@ func (z *Tokenizer) readStartTag() TokenType {
|
||||
break
|
||||
}
|
||||
}
|
||||
// Any "<noembed>", "<noframes>", "<noscript>", "<script>", "<style>",
|
||||
// "<textarea>" or "<title>" tag flags the tokenizer's next token as raw.
|
||||
// The tag name lengths of these special cases ranges in [5, 8].
|
||||
if x := z.data.end - z.data.start; 5 <= x && x <= 8 {
|
||||
switch z.buf[z.data.start] {
|
||||
case 'n', 's', 't', 'N', 'S', 'T':
|
||||
switch s := strings.ToLower(string(z.buf[z.data.start:z.data.end])); s {
|
||||
case "noembed", "noframes", "noscript", "script", "style", "textarea", "title":
|
||||
z.rawTag = s
|
||||
}
|
||||
}
|
||||
}
|
||||
// Look for a self-closing token like "<br/>".
|
||||
if z.err == nil && z.buf[z.raw.end-2] == '/' {
|
||||
return SelfClosingTagToken
|
||||
}
|
||||
@ -485,6 +554,11 @@ func (z *Tokenizer) next() TokenType {
|
||||
z.raw.start = z.raw.end
|
||||
z.data.start = z.raw.end
|
||||
z.data.end = z.raw.end
|
||||
if z.rawTag != "" {
|
||||
z.readRawOrRCDATA()
|
||||
return TextToken
|
||||
}
|
||||
z.textIsRaw = false
|
||||
|
||||
loop:
|
||||
for {
|
||||
@ -591,7 +665,10 @@ func (z *Tokenizer) Text() []byte {
|
||||
s := z.buf[z.data.start:z.data.end]
|
||||
z.data.start = z.raw.end
|
||||
z.data.end = z.raw.end
|
||||
return unescape(s)
|
||||
if !z.textIsRaw {
|
||||
s = unescape(s)
|
||||
}
|
||||
return s
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -174,6 +174,77 @@ var tokenTests = []tokenTest{
|
||||
`<p id="0"</p>`,
|
||||
`<p id="0" <="" p="">`,
|
||||
},
|
||||
// Raw text and RCDATA.
|
||||
{
|
||||
"basic raw text",
|
||||
"<script><a></b></script>",
|
||||
"<script>$<a></b>$</script>",
|
||||
},
|
||||
{
|
||||
"unfinished script end tag",
|
||||
"<SCRIPT>a</SCR",
|
||||
"<script>$a</SCR",
|
||||
},
|
||||
{
|
||||
"broken script end tag",
|
||||
"<SCRIPT>a</SCR ipt>",
|
||||
"<script>$a</SCR ipt>",
|
||||
},
|
||||
{
|
||||
"EOF in script end tag",
|
||||
"<SCRIPT>a</SCRipt",
|
||||
"<script>$a</SCRipt",
|
||||
},
|
||||
{
|
||||
"scriptx end tag",
|
||||
"<SCRIPT>a</SCRiptx",
|
||||
"<script>$a</SCRiptx",
|
||||
},
|
||||
{
|
||||
"' ' completes script end tag",
|
||||
"<SCRIPT>a</SCRipt ",
|
||||
"<script>$a$</script>",
|
||||
},
|
||||
{
|
||||
"'>' completes script end tag",
|
||||
"<SCRIPT>a</SCRipt>",
|
||||
"<script>$a$</script>",
|
||||
},
|
||||
{
|
||||
"self-closing script end tag",
|
||||
"<SCRIPT>a</SCRipt/>",
|
||||
"<script>$a$</script>",
|
||||
},
|
||||
{
|
||||
"nested script tag",
|
||||
"<SCRIPT>a</SCRipt<script>",
|
||||
"<script>$a</SCRipt<script>",
|
||||
},
|
||||
{
|
||||
"script end tag after unfinished",
|
||||
"<SCRIPT>a</SCRipt</script>",
|
||||
"<script>$a</SCRipt$</script>",
|
||||
},
|
||||
{
|
||||
"script/style mismatched tags",
|
||||
"<script>a</style>",
|
||||
"<script>$a</style>",
|
||||
},
|
||||
{
|
||||
"style element with entity",
|
||||
"<style>'",
|
||||
"<style>$&apos;",
|
||||
},
|
||||
{
|
||||
"textarea with tag",
|
||||
"<textarea><div></textarea>",
|
||||
"<textarea>$<div>$</textarea>",
|
||||
},
|
||||
{
|
||||
"title with tag and entity",
|
||||
"<title><b>K&R C</b></title>",
|
||||
"<title>$<b>K&R C</b>$</title>",
|
||||
},
|
||||
// DOCTYPE tests.
|
||||
{
|
||||
"Proper DOCTYPE",
|
||||
|
Loading…
Reference in New Issue
Block a user