1
0
mirror of https://github.com/golang/go synced 2024-11-24 17:20:12 -07:00

html: improve attribute parsing, note package status

Fixes #1890

R=nigeltao
CC=golang-dev
https://golang.org/cl/4528102
This commit is contained in:
Brad Fitzpatrick 2011-06-06 15:56:15 -07:00
parent 9e857dbdcc
commit 5e03143c1a
3 changed files with 39 additions and 1 deletions

View File

@ -4,6 +4,7 @@
/* /*
Package html implements an HTML5-compliant tokenizer and parser. Package html implements an HTML5-compliant tokenizer and parser.
INCOMPLETE.
Tokenization is done by creating a Tokenizer for an io.Reader r. It is the Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
caller's responsibility to ensure that r provides UTF-8 encoded HTML. caller's responsibility to ensure that r provides UTF-8 encoded HTML.

View File

@ -355,6 +355,33 @@ loop:
return z.buf[i0:i], z.trim(i) return z.buf[i0:i], z.trim(i)
} }
// attrName finds the largest attribute name at the start
// of z.buf[i:] and returns it lower-cased, as well
// as the trimmed cursor location after that word.
//
// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
// TODO: unicode characters
func (z *Tokenizer) attrName(i int) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
c := z.buf[i]
switch c {
case '<', '>', '"', '\'', '/', '=':
break loop
}
switch {
case 'A' <= c && c <= 'Z':
z.buf[i] = c + 'a' - 'A'
case c > ' ' && c < 0x7f:
// No-op.
default:
break loop
}
}
return z.buf[i0:i], z.trim(i)
}
// Text returns the unescaped text of a TextToken or a CommentToken. // Text returns the unescaped text of a TextToken or a CommentToken.
// The contents of the returned slice may change on the next call to Next. // The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte { func (z *Tokenizer) Text() []byte {
@ -399,7 +426,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
// attribute for the current tag token and whether there are more attributes. // attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next. // The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
key, i := z.word(z.p0, true) key, i := z.attrName(z.p0)
// Check for an empty attribute value. // Check for an empty attribute value.
if i == z.p1 { if i == z.p1 {
z.p0 = i z.p0 = i

View File

@ -125,6 +125,11 @@ var tokenTests = []tokenTest{
`<input value=yes FOO=BAR>`, `<input value=yes FOO=BAR>`,
`<input value="yes" foo="BAR">`, `<input value="yes" foo="BAR">`,
}, },
{
"Unquoted attribute value, spaces",
`<input value = yes FOO = BAR>`,
`<input value="yes" foo="BAR">`,
},
{ {
"Unquoted attribute value, trailing space", "Unquoted attribute value, trailing space",
`<input value=yes FOO=BAR >`, `<input value=yes FOO=BAR >`,
@ -145,6 +150,11 @@ var tokenTests = []tokenTest{
`<input value="I'm an attribute" FOO="BAR">`, `<input value="I'm an attribute" FOO="BAR">`,
`<input value="I&apos;m an attribute" foo="BAR">`, `<input value="I&apos;m an attribute" foo="BAR">`,
}, },
{
"Attribute name characters",
`<meta http-equiv="content-type">`,
`<meta http-equiv="content-type">`,
},
} }
func TestTokenizer(t *testing.T) { func TestTokenizer(t *testing.T) {