mirror of
https://github.com/golang/go
synced 2024-11-22 02:24:41 -07:00
html: handle character entities without semicolons
Fix the TODO: unescape("&notit;") should be "¬it;". Also accept digits in entity names. R=nigeltao CC=golang-dev, rsc https://golang.org/cl/4781042
This commit is contained in:
parent
78c89d21bc
commit
816c972ff0
@ -4,6 +4,9 @@
|
|||||||
|
|
||||||
package html
|
package html
|
||||||
|
|
||||||
|
// All entities that do not end with ';' are 6 or fewer bytes long.
|
||||||
|
const longestEntityWithoutSemicolon = 6
|
||||||
|
|
||||||
// entity is a map from HTML entity names to their values. The semicolon matters:
|
// entity is a map from HTML entity names to their values. The semicolon matters:
|
||||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
|
||||||
// lists both "amp" and "amp;" as two separate entries.
|
// lists both "amp" and "amp;" as two separate entries.
|
||||||
|
@ -17,6 +17,9 @@ func TestEntityLength(t *testing.T) {
|
|||||||
if 1+len(k) < utf8.RuneLen(v) {
|
if 1+len(k) < utf8.RuneLen(v) {
|
||||||
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
|
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
|
||||||
}
|
}
|
||||||
|
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
|
||||||
|
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for k, v := range entity2 {
|
for k, v := range entity2 {
|
||||||
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
|
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
|
||||||
|
@ -53,7 +53,8 @@ var replacementTable = [...]int{
|
|||||||
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
|
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
|
||||||
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
||||||
// Precondition: b[src] == '&' && dst <= src.
|
// Precondition: b[src] == '&' && dst <= src.
|
||||||
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
// attribute should be true if parsing an attribute value.
|
||||||
|
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
|
||||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||||
|
|
||||||
// i starts at 1 because we already know that s[0] == '&'.
|
// i starts at 1 because we already know that s[0] == '&'.
|
||||||
@ -121,12 +122,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
|||||||
// Consume the maximum number of characters possible, with the
|
// Consume the maximum number of characters possible, with the
|
||||||
// consumed characters matching one of the named references.
|
// consumed characters matching one of the named references.
|
||||||
|
|
||||||
// TODO(nigeltao): unescape("&notit;") should be "¬it;"
|
|
||||||
for i < len(s) {
|
for i < len(s) {
|
||||||
c := s[i]
|
c := s[i]
|
||||||
i++
|
i++
|
||||||
// Lower-cased characters are more common in entities, so we check for them first.
|
// Lower-cased characters are more common in entities, so we check for them first.
|
||||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
|
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if c != ';' {
|
if c != ';' {
|
||||||
@ -136,11 +136,25 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
entityName := string(s[1:i])
|
entityName := string(s[1:i])
|
||||||
if x := entity[entityName]; x != 0 {
|
if entityName == "" {
|
||||||
|
// No-op.
|
||||||
|
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
|
||||||
|
// No-op.
|
||||||
|
} else if x := entity[entityName]; x != 0 {
|
||||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||||
} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
|
} else if x := entity2[entityName]; x[0] != 0 {
|
||||||
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
||||||
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
||||||
|
} else if !attribute {
|
||||||
|
maxLen := len(entityName) - 1
|
||||||
|
if maxLen > longestEntityWithoutSemicolon {
|
||||||
|
maxLen = longestEntityWithoutSemicolon
|
||||||
|
}
|
||||||
|
for j := maxLen; j > 1; j-- {
|
||||||
|
if x := entity[entityName[:j]]; x != 0 {
|
||||||
|
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
dst1, src1 = dst+i, src+i
|
dst1, src1 = dst+i, src+i
|
||||||
@ -152,11 +166,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
|||||||
func unescape(b []byte) []byte {
|
func unescape(b []byte) []byte {
|
||||||
for i, c := range b {
|
for i, c := range b {
|
||||||
if c == '&' {
|
if c == '&' {
|
||||||
dst, src := unescapeEntity(b, i, i)
|
dst, src := unescapeEntity(b, i, i, false)
|
||||||
for src < len(b) {
|
for src < len(b) {
|
||||||
c := b[src]
|
c := b[src]
|
||||||
if c == '&' {
|
if c == '&' {
|
||||||
dst, src = unescapeEntity(b, dst, src)
|
dst, src = unescapeEntity(b, dst, src, false)
|
||||||
} else {
|
} else {
|
||||||
b[dst] = c
|
b[dst] = c
|
||||||
dst, src = dst+1, src+1
|
dst, src = dst+1, src+1
|
||||||
|
@ -459,7 +459,7 @@ loop:
|
|||||||
src++
|
src++
|
||||||
break loop
|
break loop
|
||||||
case '&':
|
case '&':
|
||||||
dst, src = unescapeEntity(z.buf, dst, src)
|
dst, src = unescapeEntity(z.buf, dst, src, true)
|
||||||
case '\\':
|
case '\\':
|
||||||
if src == z.p1 {
|
if src == z.p1 {
|
||||||
z.buf[dst] = '\\'
|
z.buf[dst] = '\\'
|
||||||
|
@ -107,6 +107,16 @@ var tokenTests = []tokenTest{
|
|||||||
`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
|
`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
|
||||||
`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
|
`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"entity without semicolon",
|
||||||
|
`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
|
||||||
|
`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entity with digits",
|
||||||
|
"½",
|
||||||
|
"½",
|
||||||
|
},
|
||||||
|
|
||||||
// Attribute tests:
|
// Attribute tests:
|
||||||
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
|
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
|
||||||
|
Loading…
Reference in New Issue
Block a user