html: parse malformed tags missing a '>', such as <p id=0</p>.

The additional token_test.go cases matches html5lib behavior. Fixes #2124. R=gri CC=golang-dev https://golang.org/cl/4844055
2024-11-21 15:54:43 -07:00 · 2011-08-10 13:39:07 +10:00 · 2011-08-10 13:39:07 +10:00 · 37afff2978
commit 37afff2978
parent 1ac7a69701
2 changed files with 52 additions and 23 deletions
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() {
 	if z.err != nil {
 		return
 	}
-	var tt TokenType
 	switch {
 	case c == '/':
-		tt = EndTagToken
+		z.tt = EndTagToken
 	// Lower-cased characters are more common in tag names, so we check for them first.
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-		tt = StartTagToken
+		z.tt = StartTagToken
 	case c == '!':
 		z.nextMarkupDeclaration()
 		return
@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() {
 				return
 			}
 		case '>':
-			z.tt = tt
-			if z.buf[z.p1-2] == '/' && tt == StartTagToken {
+			if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
 				z.tt = SelfClosingTagToken
 			}
 			return
@ -379,25 +377,35 @@ func (z *Tokenizer) trim(i int) int {
 	return k
 }

-// word finds the largest alphabetic [0-9A-Za-z]* word at the start
-// of z.buf[i:] and returns that word (optionally lower-cased), as
-// well as the trimmed cursor location after that word.
-func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
+// tagName finds the tag name at the start of z.buf[i:] and returns that name
+// lower-cased, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) tagName(i int) ([]byte, int) {
 	i0 := i
 loop:
 	for ; i < z.p1; i++ {
 		c := z.buf[i]
-		switch {
-		case '0' <= c && c <= '9':
-			// No-op.
-		case 'A' <= c && c <= 'Z':
-			if lower {
+		switch c {
+		case ' ', '\n', '\t', '\f', '/', '>':
+			break loop
+		}
+		if 'A' <= c && c <= 'Z' {
 			z.buf[i] = c + 'a' - 'A'
 		}
-		case 'a' <= c && c <= 'z':
-			// No-op.
-		default:
+	}
+	return z.buf[i0:i], z.trim(i)
+}
+
+// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
+// and returns that value, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
+	i0 := i
+loop:
+	for ; i < z.p1; i++ {
+		switch z.buf[i] {
+		case ' ', '\n', '\t', '\f', '>':
 			break loop
+		case '&':
+			// TODO: unescape the entity.
 		}
 	}
 	return z.buf[i0:i], z.trim(i)
@ -405,11 +413,17 @@ loop:

 // attrName finds the largest attribute name at the start
 // of z.buf[i:] and returns it lower-cased, as well
-// as the trimmed cursor location after that word.
+// as the trimmed cursor location after that name.
 //
 // http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
 // TODO: unicode characters
 func (z *Tokenizer) attrName(i int) ([]byte, int) {
+	for z.buf[i] == '/' {
+		i++
+		if z.buf[i] == '>' {
+			return nil, z.trim(i)
+		}
+	}
 	i0 := i
 loop:
 	for ; i < z.p1; i++ {
@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 	if z.buf[i] == '/' {
 		i++
 	}
-	name, z.p0 = z.word(i, true)
+	name, z.p0 = z.tagName(i)
 	hasAttr = z.p0 != z.p1
 	return
 }
@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
 	}
 	closeQuote := z.buf[i]
 	if closeQuote != '\'' && closeQuote != '"' {
-		val, z.p0 = z.word(i, false)
+		val, z.p0 = z.unquotedAttrVal(i)
 		moreAttr = z.p0 != z.p1
 		return
 	}
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@ -41,6 +41,22 @@ var tokenTests = []tokenTest{
 		"<a>b<c/>d</e>",
 		"<a>$b$<c/>$d$</e>",
 	},
+	// Some malformed tags that are missing a '>'.
+	{
+		"malformed tag #0",
+		`<p</p>`,
+		`<p< p="">`,
+	},
+	{
+		"malformed tag #1",
+		`<p id=0</p>`,
+		`<p id="0&lt;/p">`,
+	},
+	{
+		"malformed tag #2",
+		`<p id="0</p>`,
+		`<p id="0&lt;/p&gt;">`,
+	},
 	// Comments.
 	{
 		"comment0",
@ -117,7 +133,6 @@ var tokenTests = []tokenTest{
 		"&frac12;",
 		"½",
 	},
-
 	// Attribute tests:
 	// http://dev.w3.org/html5/spec/Overview.html#attributes-0
 	{