mirror of
https://github.com/golang/go
synced 2024-11-20 09:34:52 -07:00
html: unescape numeric entities, and complete the named entities table, including two-character entities.
Fixes #1233. R=nigeltao CC=golang-dev https://golang.org/cl/3445041
This commit is contained in:
parent
08a47d6f60
commit
f503e26379
File diff suppressed because it is too large
Load Diff
@ -10,16 +10,118 @@ import (
|
||||
"utf8"
|
||||
)
|
||||
|
||||
// These replacements permit compatibility with old numeric entities that
|
||||
// assumed Windows-1252 encoding.
|
||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||
var replacementTable = [...]int{
|
||||
'\u20AC', // First entry is what 0x80 should be replaced with.
|
||||
'\u0081',
|
||||
'\u201A',
|
||||
'\u0192',
|
||||
'\u201E',
|
||||
'\u2026',
|
||||
'\u2020',
|
||||
'\u2021',
|
||||
'\u02C6',
|
||||
'\u2030',
|
||||
'\u0160',
|
||||
'\u2039',
|
||||
'\u0152',
|
||||
'\u008D',
|
||||
'\u017D',
|
||||
'\u008F',
|
||||
'\u0090',
|
||||
'\u2018',
|
||||
'\u2019',
|
||||
'\u201C',
|
||||
'\u201D',
|
||||
'\u2022',
|
||||
'\u2013',
|
||||
'\u2014',
|
||||
'\u02DC',
|
||||
'\u2122',
|
||||
'\u0161',
|
||||
'\u203A',
|
||||
'\u0153',
|
||||
'\u009D',
|
||||
'\u017E',
|
||||
'\u0178', // Last entry is 0x9F.
|
||||
// 0x00->'\uFFFD' is handled programmatically.
|
||||
// 0x0D->'\u000D' is a no-op.
|
||||
}
|
||||
|
||||
// unescapeEntity reads an entity like "<" from b[src:] and writes the
|
||||
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
||||
// Precondition: src[0] == '&' && dst <= src.
|
||||
// Precondition: b[src] == '&' && dst <= src.
|
||||
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
||||
// TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
|
||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||
// TODO(nigeltao): Handle things like "中" or "中".
|
||||
|
||||
// i starts at 1 because we already know that s[0] == '&'.
|
||||
i, s := 1, b[src:]
|
||||
|
||||
if len(s) <= 1 {
|
||||
b[dst] = b[src]
|
||||
return dst + 1, src + 1
|
||||
}
|
||||
|
||||
if s[i] == '#' {
|
||||
if len(s) <= 3 { // We need to have at least "&#.".
|
||||
b[dst] = b[src]
|
||||
return dst + 1, src + 1
|
||||
}
|
||||
i++
|
||||
c := s[i]
|
||||
hex := false
|
||||
if c == 'x' || c == 'X' {
|
||||
hex = true
|
||||
i++
|
||||
}
|
||||
|
||||
x := 0
|
||||
for i < len(s) {
|
||||
c = s[i]
|
||||
i++
|
||||
if hex {
|
||||
if '0' <= c && c <= '9' {
|
||||
x = 16*x + int(c) - '0'
|
||||
continue
|
||||
} else if 'a' <= c && c <= 'f' {
|
||||
x = 16*x + int(c) - 'a' + 10
|
||||
continue
|
||||
} else if 'A' <= c && c <= 'F' {
|
||||
x = 16*x + int(c) - 'A' + 10
|
||||
continue
|
||||
}
|
||||
} else if '0' <= c && c <= '9' {
|
||||
x = 10*x + int(c) - '0'
|
||||
continue
|
||||
}
|
||||
if c != ';' {
|
||||
i--
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
if i <= 3 { // No characters matched.
|
||||
b[dst] = b[src]
|
||||
return dst + 1, src + 1
|
||||
}
|
||||
|
||||
if 0x80 <= x && x <= 0x9F {
|
||||
// Replace characters from Windows-1252 with UTF-8 equivalents.
|
||||
x = replacementTable[x-0x80]
|
||||
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
|
||||
// Replace invalid characters with the replacement character.
|
||||
x = '\uFFFD'
|
||||
}
|
||||
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||
}
|
||||
|
||||
// Consume the maximum number of characters possible, with the
|
||||
// consumed characters matching one of the named references.
|
||||
|
||||
// TODO(nigeltao): unescape("¬it;") should be "¬it;"
|
||||
for i < len(s) {
|
||||
c := s[i]
|
||||
i++
|
||||
@ -30,12 +132,17 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
||||
if c != ';' {
|
||||
i--
|
||||
}
|
||||
x := entity[string(s[1:i])]
|
||||
if x != 0 {
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
entityName := string(s[1:i])
|
||||
if x := entity[entityName]; x != 0 {
|
||||
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||
} else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
|
||||
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
||||
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
||||
}
|
||||
|
||||
dst1, src1 = dst+i, src+i
|
||||
copy(b[dst:dst1], b[src:src1])
|
||||
return dst1, src1
|
||||
|
@ -105,6 +105,75 @@ loop:
|
||||
}
|
||||
}
|
||||
|
||||
type unescapeTest struct {
|
||||
// A short description of the test case.
|
||||
desc string
|
||||
// The HTML text.
|
||||
html string
|
||||
// The unescaped text.
|
||||
unescaped string
|
||||
}
|
||||
|
||||
var unescapeTests = []unescapeTest{
|
||||
// Handle no entities.
|
||||
{
|
||||
"copy",
|
||||
"A\ttext\nstring",
|
||||
"A\ttext\nstring",
|
||||
},
|
||||
// Handle simple named entities.
|
||||
{
|
||||
"simple",
|
||||
"& > <",
|
||||
"& > <",
|
||||
},
|
||||
// Handle hitting the end of the string.
|
||||
{
|
||||
"stringEnd",
|
||||
"& &",
|
||||
"& &",
|
||||
},
|
||||
// Handle entities with two codepoints.
|
||||
{
|
||||
"multiCodepoint",
|
||||
"text ⋛︀ blah",
|
||||
"text \u22db\ufe00 blah",
|
||||
},
|
||||
// Handle decimal numeric entities.
|
||||
{
|
||||
"decimalEntity",
|
||||
"Delta = Δ ",
|
||||
"Delta = Δ ",
|
||||
},
|
||||
// Handle hexadecimal numeric entities.
|
||||
{
|
||||
"hexadecimalEntity",
|
||||
"Lambda = λ = λ ",
|
||||
"Lambda = λ = λ ",
|
||||
},
|
||||
// Handle numeric early termination.
|
||||
{
|
||||
"numericEnds",
|
||||
"&# &#x €43 © = ©f = ©",
|
||||
"&# &#x €43 © = ©f = ©",
|
||||
},
|
||||
// Handle numeric ISO-8859-1 entity replacements.
|
||||
{
|
||||
"numericReplacements",
|
||||
"Footnote‡",
|
||||
"Footnote‡",
|
||||
},
|
||||
}
|
||||
|
||||
func TestUnescape(t *testing.T) {
|
||||
for _, tt := range unescapeTests {
|
||||
unescaped := UnescapeString(tt.html)
|
||||
if unescaped != tt.unescaped {
|
||||
t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnescapeEscape(t *testing.T) {
|
||||
ss := []string{
|
||||
``,
|
||||
|
Loading…
Reference in New Issue
Block a user