mirror of
https://github.com/golang/go
synced 2024-11-22 04:04:40 -07:00
html: move the HTML parser to an exp/html package. The parser is a
work in progress, and we are not ready to freeze its API for Go 1. Package html still exists, containing just two functions: EscapeString and UnescapeString. Both the packages at exp/html and html are "package html". The former is a superset of the latter. At some point in the future, the exp/html code will move back into html, once we have finalized the parser API. R=rsc, dsymonds CC=golang-dev https://golang.org/cl/5571059
This commit is contained in:
parent
66599c4070
commit
324513bc5f
@ -82,6 +82,7 @@ DIRS=\
|
|||||||
exp/ebnf\
|
exp/ebnf\
|
||||||
exp/ebnflint\
|
exp/ebnflint\
|
||||||
exp/gotype\
|
exp/gotype\
|
||||||
|
exp/html\
|
||||||
exp/norm\
|
exp/norm\
|
||||||
exp/spdy\
|
exp/spdy\
|
||||||
exp/ssh\
|
exp/ssh\
|
||||||
|
20
src/pkg/exp/html/Makefile
Normal file
20
src/pkg/exp/html/Makefile
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright 2010 The Go Authors. All rights reserved.
|
||||||
|
# Use of this source code is governed by a BSD-style
|
||||||
|
# license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
include ../../../Make.inc
|
||||||
|
|
||||||
|
TARG=html
|
||||||
|
GOFILES=\
|
||||||
|
const.go\
|
||||||
|
doc.go\
|
||||||
|
doctype.go\
|
||||||
|
entity.go\
|
||||||
|
escape.go\
|
||||||
|
foreign.go\
|
||||||
|
node.go\
|
||||||
|
parse.go\
|
||||||
|
render.go\
|
||||||
|
token.go\
|
||||||
|
|
||||||
|
include ../../../Make.pkg
|
2253
src/pkg/exp/html/entity.go
Normal file
2253
src/pkg/exp/html/entity.go
Normal file
File diff suppressed because it is too large
Load Diff
29
src/pkg/exp/html/entity_test.go
Normal file
29
src/pkg/exp/html/entity_test.go
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
// Copyright 2010 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package html
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEntityLength(t *testing.T) {
|
||||||
|
// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
|
||||||
|
// The +1 comes from the leading "&". This property implies that the length of
|
||||||
|
// unescaped text is <= the length of escaped text.
|
||||||
|
for k, v := range entity {
|
||||||
|
if 1+len(k) < utf8.RuneLen(v) {
|
||||||
|
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
|
||||||
|
}
|
||||||
|
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
|
||||||
|
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for k, v := range entity2 {
|
||||||
|
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
|
||||||
|
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
253
src/pkg/exp/html/escape.go
Normal file
253
src/pkg/exp/html/escape.go
Normal file
@ -0,0 +1,253 @@
|
|||||||
|
// Copyright 2010 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package html
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"strings"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
// These replacements permit compatibility with old numeric entities that
|
||||||
|
// assumed Windows-1252 encoding.
|
||||||
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||||
|
var replacementTable = [...]rune{
|
||||||
|
'\u20AC', // First entry is what 0x80 should be replaced with.
|
||||||
|
'\u0081',
|
||||||
|
'\u201A',
|
||||||
|
'\u0192',
|
||||||
|
'\u201E',
|
||||||
|
'\u2026',
|
||||||
|
'\u2020',
|
||||||
|
'\u2021',
|
||||||
|
'\u02C6',
|
||||||
|
'\u2030',
|
||||||
|
'\u0160',
|
||||||
|
'\u2039',
|
||||||
|
'\u0152',
|
||||||
|
'\u008D',
|
||||||
|
'\u017D',
|
||||||
|
'\u008F',
|
||||||
|
'\u0090',
|
||||||
|
'\u2018',
|
||||||
|
'\u2019',
|
||||||
|
'\u201C',
|
||||||
|
'\u201D',
|
||||||
|
'\u2022',
|
||||||
|
'\u2013',
|
||||||
|
'\u2014',
|
||||||
|
'\u02DC',
|
||||||
|
'\u2122',
|
||||||
|
'\u0161',
|
||||||
|
'\u203A',
|
||||||
|
'\u0153',
|
||||||
|
'\u009D',
|
||||||
|
'\u017E',
|
||||||
|
'\u0178', // Last entry is 0x9F.
|
||||||
|
// 0x00->'\uFFFD' is handled programmatically.
|
||||||
|
// 0x0D->'\u000D' is a no-op.
|
||||||
|
}
|
||||||
|
|
||||||
|
// unescapeEntity reads an entity like "<" from b[src:] and writes the
|
||||||
|
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
||||||
|
// Precondition: b[src] == '&' && dst <= src.
|
||||||
|
// attribute should be true if parsing an attribute value.
|
||||||
|
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
|
||||||
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||||
|
|
||||||
|
// i starts at 1 because we already know that s[0] == '&'.
|
||||||
|
i, s := 1, b[src:]
|
||||||
|
|
||||||
|
if len(s) <= 1 {
|
||||||
|
b[dst] = b[src]
|
||||||
|
return dst + 1, src + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if s[i] == '#' {
|
||||||
|
if len(s) <= 3 { // We need to have at least "&#.".
|
||||||
|
b[dst] = b[src]
|
||||||
|
return dst + 1, src + 1
|
||||||
|
}
|
||||||
|
i++
|
||||||
|
c := s[i]
|
||||||
|
hex := false
|
||||||
|
if c == 'x' || c == 'X' {
|
||||||
|
hex = true
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
|
||||||
|
x := '\x00'
|
||||||
|
for i < len(s) {
|
||||||
|
c = s[i]
|
||||||
|
i++
|
||||||
|
if hex {
|
||||||
|
if '0' <= c && c <= '9' {
|
||||||
|
x = 16*x + rune(c) - '0'
|
||||||
|
continue
|
||||||
|
} else if 'a' <= c && c <= 'f' {
|
||||||
|
x = 16*x + rune(c) - 'a' + 10
|
||||||
|
continue
|
||||||
|
} else if 'A' <= c && c <= 'F' {
|
||||||
|
x = 16*x + rune(c) - 'A' + 10
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
} else if '0' <= c && c <= '9' {
|
||||||
|
x = 10*x + rune(c) - '0'
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if c != ';' {
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if i <= 3 { // No characters matched.
|
||||||
|
b[dst] = b[src]
|
||||||
|
return dst + 1, src + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if 0x80 <= x && x <= 0x9F {
|
||||||
|
// Replace characters from Windows-1252 with UTF-8 equivalents.
|
||||||
|
x = replacementTable[x-0x80]
|
||||||
|
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
|
||||||
|
// Replace invalid characters with the replacement character.
|
||||||
|
x = '\uFFFD'
|
||||||
|
}
|
||||||
|
|
||||||
|
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume the maximum number of characters possible, with the
|
||||||
|
// consumed characters matching one of the named references.
|
||||||
|
|
||||||
|
for i < len(s) {
|
||||||
|
c := s[i]
|
||||||
|
i++
|
||||||
|
// Lower-cased characters are more common in entities, so we check for them first.
|
||||||
|
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if c != ';' {
|
||||||
|
i--
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
entityName := string(s[1:i])
|
||||||
|
if entityName == "" {
|
||||||
|
// No-op.
|
||||||
|
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
|
||||||
|
// No-op.
|
||||||
|
} else if x := entity[entityName]; x != 0 {
|
||||||
|
return dst + utf8.EncodeRune(b[dst:], x), src + i
|
||||||
|
} else if x := entity2[entityName]; x[0] != 0 {
|
||||||
|
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
||||||
|
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
||||||
|
} else if !attribute {
|
||||||
|
maxLen := len(entityName) - 1
|
||||||
|
if maxLen > longestEntityWithoutSemicolon {
|
||||||
|
maxLen = longestEntityWithoutSemicolon
|
||||||
|
}
|
||||||
|
for j := maxLen; j > 1; j-- {
|
||||||
|
if x := entity[entityName[:j]]; x != 0 {
|
||||||
|
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dst1, src1 = dst+i, src+i
|
||||||
|
copy(b[dst:dst1], b[src:src1])
|
||||||
|
return dst1, src1
|
||||||
|
}
|
||||||
|
|
||||||
|
// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".
|
||||||
|
func unescape(b []byte) []byte {
|
||||||
|
for i, c := range b {
|
||||||
|
if c == '&' {
|
||||||
|
dst, src := unescapeEntity(b, i, i, false)
|
||||||
|
for src < len(b) {
|
||||||
|
c := b[src]
|
||||||
|
if c == '&' {
|
||||||
|
dst, src = unescapeEntity(b, dst, src, false)
|
||||||
|
} else {
|
||||||
|
b[dst] = c
|
||||||
|
dst, src = dst+1, src+1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b[0:dst]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
|
||||||
|
func lower(b []byte) []byte {
|
||||||
|
for i, c := range b {
|
||||||
|
if 'A' <= c && c <= 'Z' {
|
||||||
|
b[i] = c + 'a' - 'A'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
const escapedChars = `&'<>"`
|
||||||
|
|
||||||
|
func escape(w writer, s string) error {
|
||||||
|
i := strings.IndexAny(s, escapedChars)
|
||||||
|
for i != -1 {
|
||||||
|
if _, err := w.WriteString(s[:i]); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var esc string
|
||||||
|
switch s[i] {
|
||||||
|
case '&':
|
||||||
|
esc = "&"
|
||||||
|
case '\'':
|
||||||
|
esc = "'"
|
||||||
|
case '<':
|
||||||
|
esc = "<"
|
||||||
|
case '>':
|
||||||
|
esc = ">"
|
||||||
|
case '"':
|
||||||
|
esc = """
|
||||||
|
default:
|
||||||
|
panic("unrecognized escape character")
|
||||||
|
}
|
||||||
|
s = s[i+1:]
|
||||||
|
if _, err := w.WriteString(esc); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
i = strings.IndexAny(s, escapedChars)
|
||||||
|
}
|
||||||
|
_, err := w.WriteString(s)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// EscapeString escapes special characters like "<" to become "<". It
|
||||||
|
// escapes only five such characters: amp, apos, lt, gt and quot.
|
||||||
|
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
||||||
|
// always true.
|
||||||
|
func EscapeString(s string) string {
|
||||||
|
if strings.IndexAny(s, escapedChars) == -1 {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
buf := bytes.NewBuffer(nil)
|
||||||
|
escape(buf, s)
|
||||||
|
return buf.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// UnescapeString unescapes entities like "<" to become "<". It unescapes a
|
||||||
|
// larger range of entities than EscapeString escapes. For example, "á"
|
||||||
|
// unescapes to "á", as does "á" and "&xE1;".
|
||||||
|
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
||||||
|
// always true.
|
||||||
|
func UnescapeString(s string) string {
|
||||||
|
for _, c := range s {
|
||||||
|
if c == '&' {
|
||||||
|
return string(unescape([]byte(s)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
@ -6,15 +6,7 @@ include ../../Make.inc
|
|||||||
|
|
||||||
TARG=html
|
TARG=html
|
||||||
GOFILES=\
|
GOFILES=\
|
||||||
const.go\
|
|
||||||
doc.go\
|
|
||||||
doctype.go\
|
|
||||||
entity.go\
|
entity.go\
|
||||||
escape.go\
|
escape.go\
|
||||||
foreign.go\
|
|
||||||
node.go\
|
|
||||||
parse.go\
|
|
||||||
render.go\
|
|
||||||
token.go\
|
|
||||||
|
|
||||||
include ../../Make.pkg
|
include ../../Make.pkg
|
||||||
|
@ -10,6 +10,10 @@ import (
|
|||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type writer interface {
|
||||||
|
WriteString(string) (int, error)
|
||||||
|
}
|
||||||
|
|
||||||
// These replacements permit compatibility with old numeric entities that
|
// These replacements permit compatibility with old numeric entities that
|
||||||
// assumed Windows-1252 encoding.
|
// assumed Windows-1252 encoding.
|
||||||
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
||||||
|
Loading…
Reference in New Issue
Block a user