mirror of
https://github.com/golang/go
synced 2024-11-22 02:34:40 -07:00
html: handle character entities without semicolons
Fix the TODO: unescape("¬it;") should be "¬it;" Also accept digits in entity names. R=nigeltao CC=golang-dev, rsc https://golang.org/cl/4781042
This commit is contained in:
parent
78c89d21bc
commit
816c972ff0
@ -4,6 +4,9 @@
|
||||
|
||||
package html
|
||||
|
||||
// All entities that do not end with ';' are 6 or fewer bytes long.
|
||||
const longestEntityWithoutSemicolon = 6
|
||||
|
||||
// entity is a map from HTML entity names to their values. The semicolon matters:
|
||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html
|
||||
// lists both "amp" and "amp;" as two separate entries.
|
||||
|
@ -17,6 +17,9 @@ func TestEntityLength(t *testing.T) {
|
||||
if 1+len(k) < utf8.RuneLen(v) {
|
||||
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
|
||||
}
|
||||
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
|
||||
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
|
||||
}
|
||||
}
|
||||
for k, v := range entity2 {
|
||||
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
|
||||
|
@ -53,7 +53,8 @@ var replacementTable = [...]int{
|
||||
// unescapeEntity reads an entity like "<" from b[src:] and writes the
|
||||
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
||||
// Precondition: b[src] == '&' && dst <= src.
|
||||
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
||||
// attribute should be true if parsing an attribute value.
|
||||
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
|
||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||
|
||||
// i starts at 1 because we already know that s[0] == '&'.
|
||||
@ -121,12 +122,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
||||
// Consume the maximum number of characters possible, with the
|
||||
// consumed characters matching one of the named references.
|
||||
|
||||
// TODO(nigeltao): unescape("¬it;") should be "¬it;"
|
||||
for i < len(s) {
|
||||
c := s[i]
|
||||
i++
|
||||
// Lower-cased characters are more common in entities, so we check for them first.
|
||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
|
||||
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
|
||||
continue
|
||||
}
|
||||
if c != ';' {
|
||||
@ -136,11 +136,25 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
||||
}
|
||||
|
||||
entityName := string(s[1:i])
|
||||
if x := entity[entityName]; x != 0 {
|
||||
if entityName == "" {
|
||||
// No-op.
|
||||
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
|
||||
// No-op.
|
||||
} else if x := entity[entityName]; x != 0 {
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||
} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
|
||||
} else if x := entity2[entityName]; x[0] != 0 {
|
||||
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
||||
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
||||
} else if !attribute {
|
||||
maxLen := len(entityName) - 1
|
||||
if maxLen > longestEntityWithoutSemicolon {
|
||||
maxLen = longestEntityWithoutSemicolon
|
||||
}
|
||||
for j := maxLen; j > 1; j-- {
|
||||
if x := entity[entityName[:j]]; x != 0 {
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dst1, src1 = dst+i, src+i
|
||||
@ -152,11 +166,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
||||
func unescape(b []byte) []byte {
|
||||
for i, c := range b {
|
||||
if c == '&' {
|
||||
dst, src := unescapeEntity(b, i, i)
|
||||
dst, src := unescapeEntity(b, i, i, false)
|
||||
for src < len(b) {
|
||||
c := b[src]
|
||||
if c == '&' {
|
||||
dst, src = unescapeEntity(b, dst, src)
|
||||
dst, src = unescapeEntity(b, dst, src, false)
|
||||
} else {
|
||||
b[dst] = c
|
||||
dst, src = dst+1, src+1
|
||||
|
@ -459,7 +459,7 @@ loop:
|
||||
src++
|
||||
break loop
|
||||
case '&':
|
||||
dst, src = unescapeEntity(z.buf, dst, src)
|
||||
dst, src = unescapeEntity(z.buf, dst, src, true)
|
||||
case '\\':
|
||||
if src == z.p1 {
|
||||
z.buf[dst] = '\\'
|
||||
|
@ -107,6 +107,16 @@ var tokenTests = []tokenTest{
|
||||
`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
|
||||
`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
|
||||
},
|
||||
{
|
||||
"entity without semicolon",
|
||||
`¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
|
||||
`¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
|
||||
},
|
||||
{
|
||||
"entity with digits",
|
||||
"½",
|
||||
"½",
|
||||
},
|
||||
|
||||
// Attribute tests:
|
||||
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
|
||||
|
Loading…
Reference in New Issue
Block a user