html: move the HTML parser to an exp/html package.

The parser is a work in progress, and we are not ready to freeze its API for Go 1. Package html still exists, containing just two functions: EscapeString and UnescapeString. Both the packages at exp/html and html are "package html". The former is a superset of the latter. At some point in the future, the exp/html code will move back into html, once we have finalized the parser API.

R=rsc, dsymonds
CC=golang-dev
https://golang.org/cl/5571059
2024-11-21 22:34:48 -07:00 · 2012-01-25 10:54:59 +11:00 · 2012-01-25 10:54:59 +11:00 · 324513bc5f
commit 324513bc5f
parent 66599c4070
64 changed files with 2560 additions and 8 deletions
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@@ -82,6 +82,7 @@ DIRS=\
 	exp/ebnf\
 	exp/ebnflint\
 	exp/gotype\
+	exp/html\
 	exp/norm\
 	exp/spdy\
 	exp/ssh\
--- a/src/pkg/exp/html/Makefile
+++ b/src/pkg/exp/html/Makefile
@@ -0,0 +1,20 @@
+# Copyright 2010 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include ../../../Make.inc
+
+TARG=html
+GOFILES=\
+	const.go\
+	doc.go\
+	doctype.go\
+	entity.go\
+	escape.go\
+	foreign.go\
+	node.go\
+	parse.go\
+	render.go\
+	token.go\
+
+include ../../../Make.pkg
--- a/src/pkg/exp/html/const.go
+++ b/src/pkg/exp/html/const.go
--- a/src/pkg/exp/html/doc.go
+++ b/src/pkg/exp/html/doc.go
--- a/src/pkg/exp/html/doctype.go
+++ b/src/pkg/exp/html/doctype.go
--- a/src/pkg/exp/html/entity.go
+++ b/src/pkg/exp/html/entity.go
--- a/src/pkg/exp/html/entity_test.go
+++ b/src/pkg/exp/html/entity_test.go
@@ -0,0 +1,29 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"testing"
+	"unicode/utf8"
+)
+
+func TestEntityLength(t *testing.T) {
+	// Verify that the UTF-8 encoding of each entity's value is at most 1 + len(key) bytes.
+	// The +1 comes from the leading "&". This property implies that unescaped
+	// text is never longer than the escaped text it was decoded from.
+	for k, v := range entity {
+		if 1+len(k) < utf8.RuneLen(v) {
+			t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
+		}
+		if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' { // names lacking ';' must fit the declared bound
+			t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
+		}
+	}
+	for k, v := range entity2 { // these values expand to the two runes v[0] and v[1], so both encodings count
+		if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
+			t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
+		}
+	}
+}
--- a/src/pkg/exp/html/escape.go
+++ b/src/pkg/exp/html/escape.go
@@ -0,0 +1,253 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bytes"
+	"strings"
+	"unicode/utf8"
+)
+
+// These replacements permit compatibility with old numeric entities that
+// assumed Windows-1252 encoding. Entry i is the replacement for code point 0x80+i.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
+var replacementTable = [...]rune{
+	'\u20AC', // First entry is what 0x80 should be replaced with.
+	'\u0081',
+	'\u201A',
+	'\u0192',
+	'\u201E',
+	'\u2026',
+	'\u2020',
+	'\u2021',
+	'\u02C6',
+	'\u2030',
+	'\u0160',
+	'\u2039',
+	'\u0152',
+	'\u008D',
+	'\u017D',
+	'\u008F',
+	'\u0090',
+	'\u2018',
+	'\u2019',
+	'\u201C',
+	'\u201D',
+	'\u2022',
+	'\u2013',
+	'\u2014',
+	'\u02DC',
+	'\u2122',
+	'\u0161',
+	'\u203A',
+	'\u0153',
+	'\u009D',
+	'\u017E',
+	'\u0178', // Last entry is 0x9F.
+	// 0x00->'\uFFFD' is handled programmatically.
+	// 0x0D->'\u000D' is a no-op.
+}
+
+// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
+// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
+// Text that is not a recognized reference is copied through unchanged. Precondition: b[src] == '&' && dst <= src.
+// attribute should be true if parsing an attribute value; it disables prefix matching and leaves a semicolon-less "&name=" untouched.
+func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
+	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
+
+	// i starts at 1 because we already know that s[0] == '&'.
+	i, s := 1, b[src:]
+
+	if len(s) <= 1 { // a lone trailing '&' is copied as-is
+		b[dst] = b[src]
+		return dst + 1, src + 1
+	}
+
+	if s[i] == '#' {
+		if len(s) <= 3 { // We need to have at least "&#.".
+			b[dst] = b[src]
+			return dst + 1, src + 1
+		}
+		i++
+		c := s[i]
+		hex := false
+		if c == 'x' || c == 'X' {
+			hex = true
+			i++
+		}
+
+		x := '\x00' // accumulates the numeric value of the reference
+		for i < len(s) {
+			c = s[i]
+			i++
+			if hex {
+				if '0' <= c && c <= '9' {
+					x = 16*x + rune(c) - '0'
+					continue
+				} else if 'a' <= c && c <= 'f' {
+					x = 16*x + rune(c) - 'a' + 10
+					continue
+				} else if 'A' <= c && c <= 'F' {
+					x = 16*x + rune(c) - 'A' + 10
+					continue
+				}
+			} else if '0' <= c && c <= '9' {
+				x = 10*x + rune(c) - '0'
+				continue
+			}
+			if c != ';' { // a terminating ';' is consumed; any other byte is put back
+				i--
+			}
+			break
+		}
+
+		if i <= 3 { // No characters matched.
+			b[dst] = b[src]
+			return dst + 1, src + 1
+		}
+
+		if 0x80 <= x && x <= 0x9F {
+			// Replace characters from Windows-1252 with UTF-8 equivalents.
+			x = replacementTable[x-0x80]
+		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
+			// Replace invalid characters with the replacement character.
+			x = '\uFFFD'
+		}
+
+		return dst + utf8.EncodeRune(b[dst:], x), src + i
+	}
+
+	// Consume the maximum number of characters possible, with the
+	// consumed characters matching one of the named references.
+
+	for i < len(s) {
+		c := s[i]
+		i++
+		// Lower-cased characters are more common in entities, so we check for them first.
+		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
+			continue
+		}
+		if c != ';' { // a terminating ';' is kept as part of the name; any other byte is put back
+			i--
+		}
+		break
+	}
+
+	entityName := string(s[1:i])
+	if entityName == "" {
+		// No-op.
+	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
+		// No-op: in an attribute value, a semicolon-less name followed by '=' stays literal.
+	} else if x := entity[entityName]; x != 0 {
+		return dst + utf8.EncodeRune(b[dst:], x), src + i
+	} else if x := entity2[entityName]; x[0] != 0 {
+		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
+		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
+	} else if !attribute { // outside attributes, fall back to the longest known name that is a proper prefix
+		maxLen := len(entityName) - 1
+		if maxLen > longestEntityWithoutSemicolon {
+			maxLen = longestEntityWithoutSemicolon
+		}
+		for j := maxLen; j > 1; j-- {
+			if x := entity[entityName[:j]]; x != 0 {
+				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 // +1 skips the leading '&'
+			}
+		}
+	}
+
+	dst1, src1 = dst+i, src+i // nothing recognized: copy the i scanned bytes through verbatim
+	copy(b[dst:dst1], b[src:src1])
+	return dst1, src1
+}
+
+// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
+func unescape(b []byte) []byte {
+	for i, c := range b {
+		if c == '&' { // bytes before the first '&' are already in their final position
+			dst, src := unescapeEntity(b, i, i, false)
+			for src < len(b) { // compact the rest of b: src reads ahead, dst marks the write position
+				c := b[src]
+				if c == '&' {
+					dst, src = unescapeEntity(b, dst, src, false)
+				} else {
+					b[dst] = c
+					dst, src = dst+1, src+1
+				}
+			}
+			return b[0:dst] // truncate to the bytes actually written
+		}
+	}
+	return b // no '&' found, so b is returned unmodified
+}
+
+// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
+func lower(b []byte) []byte {
+	for i, c := range b {
+		if 'A' <= c && c <= 'Z' { // only ASCII upper-case bytes are touched; all other bytes are left alone
+			b[i] = c + 'a' - 'A'
+		}
+	}
+	return b // the same slice that was passed in
+}
+
+const escapedChars = `&'<>"` // the five bytes that escape rewrites as entities
+
+func escape(w writer, s string) error { // writes s to w with each escapedChars byte replaced by its entity; returns the first write error
+	i := strings.IndexAny(s, escapedChars)
+	for i != -1 {
+		if _, err := w.WriteString(s[:i]); err != nil { // flush the unescaped run before the special byte
+			return err
+		}
+		var esc string
+		switch s[i] {
+		case '&':
+			esc = "&amp;"
+		case '\'':
+			esc = "&apos;"
+		case '<':
+			esc = "&lt;"
+		case '>':
+			esc = "&gt;"
+		case '"':
+			esc = "&quot;"
+		default: // not expected: i was returned by IndexAny for escapedChars, all of which are handled above
+			panic("unrecognized escape character")
+		}
+		s = s[i+1:] // drop the flushed run and the special byte just handled
+		if _, err := w.WriteString(esc); err != nil {
+			return err
+		}
+		i = strings.IndexAny(s, escapedChars)
+	}
+	_, err := w.WriteString(s) // remaining tail contains nothing to escape
+	return err
+}
+
+// EscapeString escapes special characters like "<" to become "&lt;". It
+// escapes only five such characters: amp, apos, lt, gt and quot.
+// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
+// always true.
+func EscapeString(s string) string {
+	if strings.IndexAny(s, escapedChars) == -1 { // nothing to escape: return s itself without building a buffer
+		return s
+	}
+	buf := bytes.NewBuffer(nil)
+	escape(buf, s) // error deliberately ignored: bytes.Buffer.WriteString is documented to always return a nil error
+	return buf.String()
+}
+
+// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
+// larger range of entities than EscapeString escapes. For example, "&aacute;"
+// unescapes to "á", as does "&#225;" and "&#xE1;".
+// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
+// always true.
+func UnescapeString(s string) string {
+	for _, c := range s {
+		if c == '&' { // convert to []byte only when there is a reference to decode
+			return string(unescape([]byte(s)))
+		}
+	}
+	return s // no '&' present: s is returned as-is
+}
--- a/src/pkg/exp/html/foreign.go
+++ b/src/pkg/exp/html/foreign.go
--- a/src/pkg/exp/html/node.go
+++ b/src/pkg/exp/html/node.go
--- a/src/pkg/exp/html/parse.go
+++ b/src/pkg/exp/html/parse.go
--- a/src/pkg/exp/html/parse_test.go
+++ b/src/pkg/exp/html/parse_test.go
--- a/src/pkg/exp/html/render.go
+++ b/src/pkg/exp/html/render.go
--- a/src/pkg/exp/html/render_test.go
+++ b/src/pkg/exp/html/render_test.go
--- a/src/pkg/exp/html/testdata/webkit/README
+++ b/src/pkg/exp/html/testdata/webkit/README
--- a/src/pkg/exp/html/testdata/webkit/adoption01.dat
+++ b/src/pkg/exp/html/testdata/webkit/adoption01.dat
--- a/src/pkg/exp/html/testdata/webkit/adoption02.dat
+++ b/src/pkg/exp/html/testdata/webkit/adoption02.dat
--- a/src/pkg/exp/html/testdata/webkit/comments01.dat
+++ b/src/pkg/exp/html/testdata/webkit/comments01.dat
--- a/src/pkg/exp/html/testdata/webkit/doctype01.dat
+++ b/src/pkg/exp/html/testdata/webkit/doctype01.dat
--- a/src/pkg/exp/html/testdata/webkit/entities01.dat
+++ b/src/pkg/exp/html/testdata/webkit/entities01.dat
--- a/src/pkg/exp/html/testdata/webkit/entities02.dat
+++ b/src/pkg/exp/html/testdata/webkit/entities02.dat
--- a/src/pkg/exp/html/testdata/webkit/html5test-com.dat
+++ b/src/pkg/exp/html/testdata/webkit/html5test-com.dat
--- a/src/pkg/exp/html/testdata/webkit/inbody01.dat
+++ b/src/pkg/exp/html/testdata/webkit/inbody01.dat
--- a/src/pkg/exp/html/testdata/webkit/isindex.dat
+++ b/src/pkg/exp/html/testdata/webkit/isindex.dat
--- a/src/pkg/exp/html/testdata/webkit/pending-spec-changes-plain-text-unsafe.dat
+++ b/src/pkg/exp/html/testdata/webkit/pending-spec-changes-plain-text-unsafe.dat
--- a/src/pkg/exp/html/testdata/webkit/pending-spec-changes.dat
+++ b/src/pkg/exp/html/testdata/webkit/pending-spec-changes.dat
--- a/src/pkg/exp/html/testdata/webkit/plain-text-unsafe.dat
+++ b/src/pkg/exp/html/testdata/webkit/plain-text-unsafe.dat
--- a/src/pkg/exp/html/testdata/webkit/scriptdata01.dat
+++ b/src/pkg/exp/html/testdata/webkit/scriptdata01.dat
--- a/src/pkg/exp/html/testdata/webkit/scripted/adoption01.dat
+++ b/src/pkg/exp/html/testdata/webkit/scripted/adoption01.dat
--- a/src/pkg/exp/html/testdata/webkit/scripted/webkit01.dat
+++ b/src/pkg/exp/html/testdata/webkit/scripted/webkit01.dat
--- a/src/pkg/exp/html/testdata/webkit/tables01.dat
+++ b/src/pkg/exp/html/testdata/webkit/tables01.dat
--- a/src/pkg/exp/html/testdata/webkit/tests1.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests1.dat
--- a/src/pkg/exp/html/testdata/webkit/tests10.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests10.dat
--- a/src/pkg/exp/html/testdata/webkit/tests11.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests11.dat
--- a/src/pkg/exp/html/testdata/webkit/tests12.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests12.dat
--- a/src/pkg/exp/html/testdata/webkit/tests14.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests14.dat
--- a/src/pkg/exp/html/testdata/webkit/tests15.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests15.dat
--- a/src/pkg/exp/html/testdata/webkit/tests16.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests16.dat
--- a/src/pkg/exp/html/testdata/webkit/tests17.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests17.dat
--- a/src/pkg/exp/html/testdata/webkit/tests18.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests18.dat
--- a/src/pkg/exp/html/testdata/webkit/tests19.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests19.dat
--- a/src/pkg/exp/html/testdata/webkit/tests2.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests2.dat
--- a/src/pkg/exp/html/testdata/webkit/tests20.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests20.dat
--- a/src/pkg/exp/html/testdata/webkit/tests21.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests21.dat
--- a/src/pkg/exp/html/testdata/webkit/tests22.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests22.dat
--- a/src/pkg/exp/html/testdata/webkit/tests23.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests23.dat
--- a/src/pkg/exp/html/testdata/webkit/tests24.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests24.dat
--- a/src/pkg/exp/html/testdata/webkit/tests25.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests25.dat
--- a/src/pkg/exp/html/testdata/webkit/tests26.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests26.dat
--- a/src/pkg/exp/html/testdata/webkit/tests3.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests3.dat
--- a/src/pkg/exp/html/testdata/webkit/tests4.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests4.dat
--- a/src/pkg/exp/html/testdata/webkit/tests5.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests5.dat
--- a/src/pkg/exp/html/testdata/webkit/tests6.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests6.dat
--- a/src/pkg/exp/html/testdata/webkit/tests7.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests7.dat
--- a/src/pkg/exp/html/testdata/webkit/tests8.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests8.dat
--- a/src/pkg/exp/html/testdata/webkit/tests9.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests9.dat
--- a/src/pkg/exp/html/testdata/webkit/tests_innerHTML_1.dat
+++ b/src/pkg/exp/html/testdata/webkit/tests_innerHTML_1.dat
--- a/src/pkg/exp/html/testdata/webkit/tricky01.dat
+++ b/src/pkg/exp/html/testdata/webkit/tricky01.dat
--- a/src/pkg/exp/html/testdata/webkit/webkit01.dat
+++ b/src/pkg/exp/html/testdata/webkit/webkit01.dat
--- a/src/pkg/exp/html/testdata/webkit/webkit02.dat
+++ b/src/pkg/exp/html/testdata/webkit/webkit02.dat
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
--- a/src/pkg/exp/html/token_test.go
+++ b/src/pkg/exp/html/token_test.go
--- a/src/pkg/html/Makefile
+++ b/src/pkg/html/Makefile
@@ -6,15 +6,7 @@ include ../../Make.inc

 TARG=html
 GOFILES=\
-	const.go\
-	doc.go\
-	doctype.go\
 	entity.go\
 	escape.go\
-	foreign.go\
-	node.go\
-	parse.go\
-	render.go\
-	token.go\

 include ../../Make.pkg
--- a/src/pkg/html/escape.go
+++ b/src/pkg/html/escape.go
@@ -10,6 +10,10 @@ import (
 	"unicode/utf8"
 )

+type writer interface { // NOTE(review): presumably the string sink taken by escape in this file — confirm against its signature
+	WriteString(string) (int, error)
+}
+
 // These replacements permit compatibility with old numeric entities that 
 // assumed Windows-1252 encoding.
 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference