html: improve attribute parsing, note package status

Fixes #1890 R=nigeltao CC=golang-dev https://golang.org/cl/4528102
2024-11-19 15:44:44 -07:00 · 2011-06-06 15:56:15 -07:00 · 2011-06-06 15:56:15 -07:00 · 5e03143c1a
commit 5e03143c1a
parent 9e857dbdcc
3 changed files with 39 additions and 1 deletions
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@ -4,6 +4,7 @@

 /*
 Package html implements an HTML5-compliant tokenizer and parser.
+INCOMPLETE.

 Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
 caller's responsibility to ensure that r provides UTF-8 encoded HTML.
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@ -355,6 +355,33 @@ loop:
 	return z.buf[i0:i], z.trim(i)
 }

+// attrName finds the largest attribute name at the start
+// of z.buf[i:] and returns it lower-cased, as well
+// as the trimmed cursor location after that word.
+//
+// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
+// TODO: unicode characters
+func (z *Tokenizer) attrName(i int) ([]byte, int) {
+	i0 := i
+loop:
+	for ; i < z.p1; i++ {
+		c := z.buf[i]
+		switch c {
+		case '<', '>', '"', '\'', '/', '=':
+			break loop
+		}
+		switch {
+		case 'A' <= c && c <= 'Z':
+			z.buf[i] = c + 'a' - 'A'
+		case c > ' ' && c < 0x7f:
+			// No-op.
+		default:
+			break loop
+		}
+	}
+	return z.buf[i0:i], z.trim(i)
+}
+
 // Text returns the unescaped text of a TextToken or a CommentToken.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
@ -399,7 +426,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 // attribute for the current tag token and whether there are more attributes.
 // The contents of the returned slices may change on the next call to Next.
 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
-	key, i := z.word(z.p0, true)
+	key, i := z.attrName(z.p0)
 	// Check for an empty attribute value.
 	if i == z.p1 {
 		z.p0 = i
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@ -125,6 +125,11 @@ var tokenTests = []tokenTest{
 		`<input value=yes FOO=BAR>`,
 		`<input value="yes" foo="BAR">`,
 	},
+	{
+		"Unquoted attribute value, spaces",
+		`<input value = yes FOO = BAR>`,
+		`<input value="yes" foo="BAR">`,
+	},
 	{
 		"Unquoted attribute value, trailing space",
 		`<input value=yes FOO=BAR >`,
@ -145,6 +150,11 @@ var tokenTests = []tokenTest{
 		`<input value="I'm an attribute" FOO="BAR">`,
 		`<input value="I&apos;m an attribute" foo="BAR">`,
 	},
+	{
+		"Attribute name characters",
+		`<meta http-equiv="content-type">`,
+		`<meta http-equiv="content-type">`,
+	},
 }

 func TestTokenizer(t *testing.T) {