1
0
mirror of https://github.com/golang/go synced 2024-11-25 07:07:57 -07:00

html: improve parsing of comments and "bogus comments"

R=nigeltao
CC=golang-dev
https://golang.org/cl/5279044
This commit is contained in:
Andrew Balholm 2011-10-15 12:22:08 +11:00 committed by Nigel Tao
parent 5079129deb
commit b770c9e9a2
2 changed files with 129 additions and 38 deletions

View File

@ -100,9 +100,9 @@ func (t Token) String() string {
case SelfClosingTagToken:
return "<" + t.tagString() + "/>"
case CommentToken:
return "<!--" + EscapeString(t.Data) + "-->"
return "<!--" + t.Data + "-->"
case DoctypeToken:
return "<!DOCTYPE " + EscapeString(t.Data) + ">"
return "<!DOCTYPE " + t.Data + ">"
}
return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() {
// nextComment reads the next token starting with "<!--".
// The opening "<!--" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil &&
// Pre-condition: z.tt == CommentToken && z.err == nil &&
// z.raw.start + 4 <= z.raw.end.
func (z *Tokenizer) nextComment() {
// <!--> is a valid comment.
z.data.start = z.raw.end
defer func() {
if z.data.end < z.data.start {
// It's a comment with no data, like <!-->.
z.data.end = z.data.start
}
}()
for dashCount := 2; ; {
c := z.readByte()
if z.err != nil {
z.data = z.raw
z.data.end = z.raw.end
return
}
switch c {
case '-':
dashCount++
continue
case '>':
if dashCount >= 2 {
z.tt = CommentToken
// TODO: adjust z.data to be only the "x" in "<!--x-->".
// Note that "<!>" is also a valid HTML5 comment.
z.data = z.raw
z.data.end = z.raw.end - len("-->")
return
}
dashCount = 0
default:
dashCount = 0
case '!':
if dashCount >= 2 {
c = z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
z.data.end = z.raw.end - len("--!>")
return
}
}
}
dashCount = 0
}
}
// nextBogusComment consumes input up to and including the next ">" and
// reports everything before that ">" as the data of a CommentToken.
// If a read error occurs before a ">" is seen, the comment data runs to
// the end of whatever has been read so far.
// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
func (z *Tokenizer) nextBogusComment() {
	z.tt = CommentToken
	// The comment data begins at the current read position.
	z.data.start = z.raw.end
	for {
		b := z.readByte()
		switch {
		case z.err != nil:
			// Input ended (or failed) without a closing ">".
			z.data.end = z.raw.end
			return
		case b == '>':
			// Exclude the terminating ">" from the comment data.
			z.data.end = z.raw.end - len(">")
			return
		}
	}
}
@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() {
// nextMarkupDeclaration reads the next token starting with "<!".
// It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
// The opening "<!" has already been consumed.
// Pre-condition: z.tt == TextToken && z.err == nil &&
// z.raw.start + 2 <= z.raw.end.
// Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
func (z *Tokenizer) nextMarkupDeclaration() {
z.tt = CommentToken
z.data.start = z.raw.end
var c [2]byte
for i := 0; i < 2; i++ {
c[i] = z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
}
@ -273,27 +307,35 @@ func (z *Tokenizer) nextMarkupDeclaration() {
return
}
z.raw.end -= 2
const s = "DOCTYPE "
for i := 0; ; i++ {
const s = "DOCTYPE"
for i := 0; i < len(s); i++ {
c := z.readByte()
if z.err != nil {
z.data = z.raw
z.data.end = z.raw.end
return
}
// Capitalize c.
if 'a' <= c && c <= 'z' {
c = 'A' + (c - 'a')
if c != s[i] && c != s[i]+('a'-'A') {
// Back up to read the fragment of "DOCTYPE" again.
z.raw.end = z.data.start
z.nextBogusComment()
return
}
if i < len(s) && c != s[i] {
z.nextText()
}
z.tt = DoctypeToken
if z.skipWhiteSpace(); z.err != nil {
z.data.start = z.raw.end
z.data.end = z.raw.end
return
}
z.data.start = z.raw.end
for {
c := z.readByte()
if z.err != nil {
z.data.end = z.raw.end
return
}
if c == '>' {
if i >= len(s) {
z.tt = DoctypeToken
z.data.start = z.raw.start + len("<!DOCTYPE ")
z.data.end = z.raw.end - len(">")
}
z.data.end = z.raw.end - len(">")
return
}
}
@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() {
return
}
switch {
// TODO: check that the "</" is followed by something in A-Za-z.
case c == '/':
// Check that the "</" is followed by something in A-Za-z.
c = z.readByte()
if z.err != nil {
z.data = z.raw
return
}
z.raw.end--
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
z.nextBogusComment()
return
}
z.tt = EndTagToken
z.data.start += len("</")
// Lower-cased characters are more common in tag names, so we check for them first.
@ -323,7 +375,8 @@ func (z *Tokenizer) nextTag() {
z.nextMarkupDeclaration()
return
case c == '?':
z.tt, z.err = ErrorToken, os.NewError("html: TODO: implement XML processing instructions")
z.raw.end--
z.nextBogusComment()
return
default:
z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")

View File

@ -87,51 +87,88 @@ var tokenTests = []tokenTest{
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
// DOCTYPE tests.
{
"Proper DOCTYPE",
"<!DOCTYPE html>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with no space",
"<!doctypehtml>",
"<!DOCTYPE html>",
},
{
"DOCTYPE with two spaces",
"<!doctype html>",
"<!DOCTYPE html>",
},
{
"looks like DOCTYPE but isn't",
"<!DOCUMENT html>",
"<!--DOCUMENT html-->",
},
{
"DOCTYPE at EOF",
"<!DOCtype",
"<!DOCTYPE >",
},
// XML processing instructions.
{
"XML processing instruction",
"<?xml?>",
"<!--?xml?-->",
},
// Comments.
{
"comment0",
"abc<b><!-- skipme --></b>def",
"abc$<b>$</b>$def",
"abc$<b>$<!-- skipme -->$</b>$def",
},
{
"comment1",
"a<!-->z",
"a$z",
"a$<!---->$z",
},
{
"comment2",
"a<!--->z",
"a$z",
"a$<!---->$z",
},
{
"comment3",
"a<!--x>-->z",
"a$z",
"a$<!--x>-->$z",
},
{
"comment4",
"a<!--x->-->z",
"a$z",
"a$<!--x->-->$z",
},
{
"comment5",
"a<!>z",
"a$&lt;!&gt;z",
"a$<!---->$z",
},
{
"comment6",
"a<!->z",
"a$&lt;!-&gt;z",
"a$<!----->$z",
},
{
"comment7",
"a<!---<>z",
"a$&lt;!---&lt;&gt;z",
"a$<!---<>z-->",
},
{
"comment8",
"a<!--z",
"a$&lt;!--z",
"a$<!--z-->",
},
{
"comment9",
"a<!--x--!>z",
"a$<!--x-->$z",
},
// An attribute with a backslash.
{
@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) {
loop:
for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z.ReturnComments = true
for i, s := range strings.Split(tt.golden, "$") {
if z.Next() == ErrorToken {
t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())