html: unescape numeric entities, and complete the named entities table, including two-character entities.

Fixes #1233. R=nigeltao CC=golang-dev https://golang.org/cl/3445041
2024-11-20 09:34:52 -07:00 · 2010-12-07 12:13:47 +11:00 · 2010-12-07 12:13:47 +11:00 · f503e26379
commit f503e26379
parent 08a47d6f60
3 changed files with 2424 additions and 16 deletions
--- a/src/pkg/html/entity.go
+++ b/src/pkg/html/entity.go
--- a/src/pkg/html/escape.go
+++ b/src/pkg/html/escape.go
@@ -10,16 +10,118 @@ import (
 	"utf8"
 )

+// replacementTable maps the numeric entities 0x80 through 0x9F, which old
+// documents wrote assuming Windows-1252 encoding, to their replacement runes.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
+var replacementTable = [...]int{
+	'\u20AC', // First entry is what 0x80 should be replaced with (index = value - 0x80).
+	'\u0081',
+	'\u201A',
+	'\u0192',
+	'\u201E',
+	'\u2026',
+	'\u2020',
+	'\u2021',
+	'\u02C6',
+	'\u2030',
+	'\u0160',
+	'\u2039',
+	'\u0152',
+	'\u008D',
+	'\u017D',
+	'\u008F',
+	'\u0090',
+	'\u2018',
+	'\u2019',
+	'\u201C',
+	'\u201D',
+	'\u2022',
+	'\u2013',
+	'\u2014',
+	'\u02DC',
+	'\u2122',
+	'\u0161',
+	'\u203A',
+	'\u0153',
+	'\u009D',
+	'\u017E',
+	'\u0178', // Last entry is the replacement for 0x9F.
+	// 0x00->'\uFFFD' is handled programmatically, not through this table.
+	// 0x0D->'\u000D' is a no-op, so it needs no entry.
+}
+
 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
-// Precondition: src[0] == '&' && dst <= src.
+// Precondition: b[src] == '&' && dst <= src.
 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
-	// TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
 	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
-	// TODO(nigeltao): Handle things like "&#20013;" or "&#x4e2d;".

 	// i starts at 1 because we already know that s[0] == '&'.
 	i, s := 1, b[src:]
+
+	if len(s) <= 1 {
+		b[dst] = b[src]
+		return dst + 1, src + 1
+	}
+
+	if s[i] == '#' {
+		if len(s) <= 3 { // We need to have at least "&#.".
+			b[dst] = b[src]
+			return dst + 1, src + 1
+		}
+		i++
+		c := s[i]
+		hex := false
+		if c == 'x' || c == 'X' {
+			hex = true
+			i++
+		}
+
+		x := 0
+		for i < len(s) {
+			c = s[i]
+			i++
+			if hex {
+				if '0' <= c && c <= '9' {
+					x = 16*x + int(c) - '0'
+					continue
+				} else if 'a' <= c && c <= 'f' {
+					x = 16*x + int(c) - 'a' + 10
+					continue
+				} else if 'A' <= c && c <= 'F' {
+					x = 16*x + int(c) - 'A' + 10
+					continue
+				}
+			} else if '0' <= c && c <= '9' {
+				x = 10*x + int(c) - '0'
+				continue
+			}
+			if c != ';' {
+				i--
+			}
+			break
+		}
+
+		if i <= 3 { // No characters matched.
+			b[dst] = b[src]
+			return dst + 1, src + 1
+		}
+
+		if 0x80 <= x && x <= 0x9F {
+			// Replace characters from Windows-1252 with UTF-8 equivalents.
+			x = replacementTable[x-0x80]
+		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
+			// Replace invalid characters with the replacement character.
+			x = '\uFFFD'
+		}
+
+		return dst + utf8.EncodeRune(b[dst:], x), src + i
+	}
+
+	// Consume the maximum number of characters possible, with the
+	// consumed characters matching one of the named references.
+
+	// TODO(nigeltao): unescape("&notit;") should be "¬it;"
 	for i < len(s) {
 		c := s[i]
 		i++
@@ -30,12 +132,17 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
 		if c != ';' {
 			i--
 		}
-		x := entity[string(s[1:i])]
-		if x != 0 {
-			return dst + utf8.EncodeRune(b[dst:], x), src + i
-		}
 		break
 	}
+
+	entityName := string(s[1:i])
+	if x := entity[entityName]; x != 0 {
+		return dst + utf8.EncodeRune(b[dst:], x), src + i
+	} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
+		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
+		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
+	}
+
 	dst1, src1 = dst+i, src+i
 	copy(b[dst:dst1], b[src:src1])
 	return dst1, src1
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -105,6 +105,75 @@ loop:
 	}
 }

+type unescapeTest struct {
+	// desc is a short name for the test case, printed when the case fails.
+	desc string
+	// html is the escaped HTML text passed to UnescapeString.
+	html string
+	// unescaped is the text UnescapeString is expected to return.
+	unescaped string
+}
+
+var unescapeTests = []unescapeTest{
+	// Text without any entities is copied through unchanged.
+	{
+		"copy",
+		"A\ttext\nstring",
+		"A\ttext\nstring",
+	},
+	// Handle simple named entities terminated by ';'.
+	{
+		"simple",
+		"&amp; &gt; &lt;",
+		"& > <",
+	},
+	// Handle named entities with no ';', including one that ends the string.
+	{
+		"stringEnd",
+		"&amp &amp",
+		"& &",
+	},
+	// Handle named entities that expand to two code points.
+	{
+		"multiCodepoint",
+		"text &gesl; blah",
+		"text \u22db\ufe00 blah",
+	},
+	// Handle decimal numeric entities.
+	{
+		"decimalEntity",
+		"Delta = &#916; ",
+		"Delta = Δ ",
+	},
+	// Handle hexadecimal numeric entities, in either case and without ';'.
+	{
+		"hexadecimalEntity",
+		"Lambda = &#x3bb; = &#X3Bb ",
+		"Lambda = λ = λ ",
+	},
+	// Handle entities that stop early: no digits after "&#", or no closing ';'.
+	{
+		"numericEnds",
+		"&# &#x &#128;43 &copy = &#169f = &#xa9",
+		"&# &#x €43 © = ©f = ©",
+	},
+	// Handle numeric Windows-1252 entity replacements (0x80 through 0x9F).
+	{
+		"numericReplacements",
+		"Footnote&#x87;",
+		"Footnote‡",
+	},
+}
+
+func TestUnescape(t *testing.T) {
+	for _, tt := range unescapeTests { // Table-driven: every case is checked on its own.
+		unescaped := UnescapeString(tt.html)
+		if unescaped != tt.unescaped { // Errorf records the failure and lets the loop continue.
+			t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
+		}
+	}
+}
+
 func TestUnescapeEscape(t *testing.T) {
 	ss := []string{
 		``,