1
0
mirror of https://github.com/golang/go synced 2024-11-20 09:34:52 -07:00

html: unescape numeric entities, and complete the named entities table, including two-character entities.

Fixes #1233.

R=nigeltao
CC=golang-dev
https://golang.org/cl/3445041
This commit is contained in:
Ryan Hitchman 2010-12-07 12:13:47 +11:00 committed by Nigel Tao
parent 08a47d6f60
commit f503e26379
3 changed files with 2424 additions and 16 deletions

File diff suppressed because it is too large Load Diff

View File

@ -10,16 +10,118 @@ import (
"utf8"
)
// replacementTable holds the code points that numeric character references
// in the range 0x80 through 0x9F are replaced with. Old documents wrote such
// references assuming the Windows-1252 encoding. The table is indexed by the
// reference value minus 0x80.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
var replacementTable = [...]int{
	0x20AC, // 0x80
	0x0081, // 0x81 (maps to itself)
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D (maps to itself)
	0x017D, // 0x8E
	0x008F, // 0x8F (maps to itself)
	0x0090, // 0x90 (maps to itself)
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D (maps to itself)
	0x017E, // 0x9E
	0x0178, // 0x9F
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: src[0] == '&' && dst <= src.
// Precondition: b[src] == '&' && dst <= src.
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
// TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
// TODO(nigeltao): Handle things like "&#20013;" or "&#x4e2d;".
// i starts at 1 because we already know that s[0] == '&'.
i, s := 1, b[src:]
if len(s) <= 1 {
b[dst] = b[src]
return dst + 1, src + 1
}
if s[i] == '#' {
if len(s) <= 3 { // We need to have at least "&#.".
b[dst] = b[src]
return dst + 1, src + 1
}
i++
c := s[i]
hex := false
if c == 'x' || c == 'X' {
hex = true
i++
}
x := 0
for i < len(s) {
c = s[i]
i++
if hex {
if '0' <= c && c <= '9' {
x = 16*x + int(c) - '0'
continue
} else if 'a' <= c && c <= 'f' {
x = 16*x + int(c) - 'a' + 10
continue
} else if 'A' <= c && c <= 'F' {
x = 16*x + int(c) - 'A' + 10
continue
}
} else if '0' <= c && c <= '9' {
x = 10*x + int(c) - '0'
continue
}
if c != ';' {
i--
}
break
}
if i <= 3 { // No characters matched.
b[dst] = b[src]
return dst + 1, src + 1
}
if 0x80 <= x && x <= 0x9F {
// Replace characters from Windows-1252 with UTF-8 equivalents.
x = replacementTable[x-0x80]
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
// Replace invalid characters with the replacement character.
x = '\uFFFD'
}
return dst + utf8.EncodeRune(b[dst:], x), src + i
}
// Consume the maximum number of characters possible, with the
// consumed characters matching one of the named references.
// TODO(nigeltao): unescape("&notit;") should be "¬it;"
for i < len(s) {
c := s[i]
i++
@ -30,12 +132,17 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
if c != ';' {
i--
}
x := entity[string(s[1:i])]
if x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
}
break
}
entityName := string(s[1:i])
if x := entity[entityName]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
}
dst1, src1 = dst+i, src+i
copy(b[dst:dst1], b[src:src1])
return dst1, src1

View File

@ -105,6 +105,75 @@ loop:
}
}
// unescapeTest is one case of the unescapeTests table.
type unescapeTest struct {
	// desc is a short description of the test case, html is the HTML text
	// fed to the unescaper, and unescaped is the text expected back.
	desc, html, unescaped string
}
// unescapeTests lists the inputs and expected outputs checked by TestUnescape.
var unescapeTests = []unescapeTest{
	// Text without any entity is passed through untouched.
	{
		desc:      "copy",
		html:      "A\ttext\nstring",
		unescaped: "A\ttext\nstring",
	},
	// Plain named entities.
	{
		desc:      "simple",
		html:      "&amp; &gt; &lt;",
		unescaped: "& > <",
	},
	// A named entity that runs into the end of the string.
	{
		desc:      "stringEnd",
		html:      "&amp &amp",
		unescaped: "& &",
	},
	// A named entity that expands to two code points.
	{
		desc:      "multiCodepoint",
		html:      "text &gesl; blah",
		unescaped: "text \u22db\ufe00 blah",
	},
	// Decimal numeric entities.
	{
		desc:      "decimalEntity",
		html:      "Delta = &#916; ",
		unescaped: "Delta = Δ ",
	},
	// Hexadecimal numeric entities, in either letter case.
	{
		desc:      "hexadecimalEntity",
		html:      "Lambda = &#x3bb; = &#X3Bb ",
		unescaped: "Lambda = λ = λ ",
	},
	// Numeric entities that stop early or lack the trailing semicolon.
	{
		desc:      "numericEnds",
		html:      "&# &#x &#128;43 &copy = &#169f = &#xa9",
		unescaped: "&# &#x €43 © = ©f = ©",
	},
	// Numeric entities in the Windows-1252 range 0x80-0x9F are replaced.
	{
		desc:      "numericReplacements",
		html:      "Footnote&#x87;",
		unescaped: "Footnote‡",
	},
}
// TestUnescape runs UnescapeString over the html field of every entry in
// unescapeTests and reports each entry whose result differs from the
// expected unescaped text.
func TestUnescape(t *testing.T) {
	for _, tc := range unescapeTests {
		if got := UnescapeString(tc.html); got != tc.unescaped {
			t.Errorf("TestUnescape %s: want %q, got %q", tc.desc, tc.unescaped, got)
		}
	}
}
func TestUnescapeEscape(t *testing.T) {
ss := []string{
``,