From 816c972ff04c3975b29605a6cb9b16382460e47c Mon Sep 17 00:00:00 2001 From: Andrew Balholm Date: Thu, 21 Jul 2011 09:10:49 +1000 Subject: [PATCH] html: handle character entities without semicolons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the TODO: unescape("¬it;") should be "¬it;" Also accept digits in entity names. R=nigeltao CC=golang-dev, rsc https://golang.org/cl/4781042 --- src/pkg/html/entity.go | 3 +++ src/pkg/html/entity_test.go | 3 +++ src/pkg/html/escape.go | 28 +++++++++++++++++++++------- src/pkg/html/token.go | 2 +- src/pkg/html/token_test.go | 10 ++++++++++ 5 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/pkg/html/entity.go b/src/pkg/html/entity.go index 1530290cb38..21263e22d8c 100644 --- a/src/pkg/html/entity.go +++ b/src/pkg/html/entity.go @@ -4,6 +4,9 @@ package html +// All entities that do not end with ';' are 6 or fewer bytes long. +const longestEntityWithoutSemicolon = 6 + // entity is a map from HTML entity names to their values. The semicolon matters: // http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html // lists both "amp" and "amp;" as two separate entries. diff --git a/src/pkg/html/entity_test.go b/src/pkg/html/entity_test.go index a1eb4d4f013..2cf49d61d2d 100644 --- a/src/pkg/html/entity_test.go +++ b/src/pkg/html/entity_test.go @@ -17,6 +17,9 @@ func TestEntityLength(t *testing.T) { if 1+len(k) < utf8.RuneLen(v) { t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v)) } + if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' { + t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon) + } } for k, v := range entity2 { if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) { diff --git a/src/pkg/html/escape.go b/src/pkg/html/escape.go index 2799f690876..0de97c5ac1b 100644 --- a/src/pkg/html/escape.go +++ b/src/pkg/html/escape.go @@ -53,7 +53,8 @@ var replacementTable = [...]int{ // unescapeEntity reads an entity like "<" from b[src:] and writes the // corresponding "<" to b[dst:], returning the incremented dst and src cursors. // Precondition: b[src] == '&' && dst <= src. -func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { +// attribute should be true if parsing an attribute value. +func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // i starts at 1 because we already know that s[0] == '&'. @@ -121,12 +122,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { // Consume the maximum number of characters possible, with the // consumed characters matching one of the named references. - // TODO(nigeltao): unescape("¬it;") should be "¬it;" for i < len(s) { c := s[i] i++ // Lower-cased characters are more common in entities, so we check for them first. - if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { continue } if c != ';' { @@ -136,11 +136,25 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { } entityName := string(s[1:i]) - if x := entity[entityName]; x != 0 { + if entityName == "" { + // No-op. + } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { + // No-op. + } else if x := entity[entityName]; x != 0 { return dst + utf8.EncodeRune(b[dst:], x), src + i - } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity. + } else if x := entity2[entityName]; x[0] != 0 { dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i + } else if !attribute { + maxLen := len(entityName) - 1 + if maxLen > longestEntityWithoutSemicolon { + maxLen = longestEntityWithoutSemicolon + } + for j := maxLen; j > 1; j-- { + if x := entity[entityName[:j]]; x != 0 { + return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 + } + } } dst1, src1 = dst+i, src+i @@ -152,11 +166,11 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { func unescape(b []byte) []byte { for i, c := range b { if c == '&' { - dst, src := unescapeEntity(b, i, i) + dst, src := unescapeEntity(b, i, i, false) for src < len(b) { c := b[src] if c == '&' { - dst, src = unescapeEntity(b, dst, src) + dst, src = unescapeEntity(b, dst, src, false) } else { b[dst] = c dst, src = dst+1, src+1 diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index 23c95ece6f6..5c6ed166620 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -459,7 +459,7 @@ loop: src++ break loop case '&': - dst, src = unescapeEntity(z.buf, dst, src) + dst, src = unescapeEntity(z.buf, dst, src, true) case '\\': if src == z.p1 { z.buf[dst] = '\\' diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index c794612abcf..c8dcc88648d 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -107,6 +107,16 @@ var tokenTests = []tokenTest{ `<&alsoDoesntExist;&`, `$<&alsoDoesntExist;&`, }, + { + "entity without semicolon", + `¬it;∉`, + `¬it;∉$`, + }, + { + "entity with digits", + "½", + "½", + }, // Attribute tests: // http://dev.w3.org/html5/spec/Overview.html#attributes-0