1
0
mirror of https://github.com/golang/go synced 2024-11-24 10:10:07 -07:00

exp/template/html: Implement grammar for JS.

This transitions into a JS state when entering any attribute whose
name starts with "on".

It does not yet enter a JS on entry into a <script> element as script
element handling is introduced in another CL.

R=nigeltao
CC=golang-dev
https://golang.org/cl/4968052
This commit is contained in:
Mike Samuel 2011-09-01 12:03:40 +10:00 committed by Nigel Tao
parent ffe70eaa3c
commit 0253c688d0
6 changed files with 1254 additions and 60 deletions

View File

@ -8,5 +8,6 @@ TARG=exp/template/html
GOFILES=\
context.go\
escape.go\
js.go\
include ../../../../Make.pkg

View File

@ -19,13 +19,14 @@ type context struct {
state state
delim delim
urlPart urlPart
jsCtx jsCtx
errLine int
errStr string
}
// eq returns whether two contexts are equal.
func (c context) eq(d context) bool {
return c.state == d.state && c.delim == d.delim && c.urlPart == d.urlPart && c.errLine == d.errLine && c.errStr == d.errStr
return c.state == d.state && c.delim == d.delim && c.urlPart == d.urlPart && c.jsCtx == d.jsCtx && c.errLine == d.errLine && c.errStr == d.errStr
}
// state describes a high-level HTML parser state.
@ -50,17 +51,35 @@ const (
stateAttr
// stateURL occurs inside an HTML attribute whose content is a URL.
stateURL
// stateJS occurs inside an event handler or script element.
stateJS
// stateJSDqStr occurs inside a JavaScript double quoted string.
stateJSDqStr
// stateJSSqStr occurs inside a JavaScript single quoted string.
stateJSSqStr
// stateJSRegexp occurs inside a JavaScript regexp literal.
stateJSRegexp
// stateJSBlockCmt occurs inside a JavaScript /* block comment */.
stateJSBlockCmt
// stateJSLineCmt occurs inside a JavaScript // line comment.
stateJSLineCmt
// stateError is an infectious error state outside any valid
// HTML/CSS/JS construct.
stateError
)
var stateNames = [...]string{
stateText: "stateText",
stateTag: "stateTag",
stateAttr: "stateAttr",
stateURL: "stateURL",
stateError: "stateError",
stateText: "stateText",
stateTag: "stateTag",
stateAttr: "stateAttr",
stateURL: "stateURL",
stateJS: "stateJS",
stateJSDqStr: "stateJSDqStr",
stateJSSqStr: "stateJSSqStr",
stateJSRegexp: "stateJSRegexp",
stateJSBlockCmt: "stateJSBlockCmt",
stateJSLineCmt: "stateJSLineCmt",
stateError: "stateError",
}
func (s state) String() string {
@ -131,3 +150,24 @@ func (u urlPart) String() string {
}
return fmt.Sprintf("illegal urlPart %d", u)
}
// jsCtx determines whether a '/' starts a regular expression literal or a
// division operator.
type jsCtx uint8
const (
// jsCtxRegexp occurs where a '/' would start a regexp literal.
jsCtxRegexp jsCtx = iota
// jsCtxDivOp occurs where a '/' would start a division operator.
jsCtxDivOp
)
func (c jsCtx) String() string {
switch c {
case jsCtxRegexp:
return "jsCtxRegexp"
case jsCtxDivOp:
return "jsCtxDivOp"
}
return fmt.Sprintf("illegal jsCtx %d", c)
}

View File

@ -33,7 +33,10 @@ func Escape(t *template.Template) (*template.Template, os.Error) {
// funcMap maps command names to functions that render their inputs safe.
var funcMap = template.FuncMap{
"exp_template_html_urlfilter": urlFilter,
"exp_template_html_urlfilter": urlFilter,
"exp_template_html_jsvalescaper": jsValEscaper,
"exp_template_html_jsstrescaper": jsStrEscaper,
"exp_template_html_jsregexpescaper": jsRegexpEscaper,
}
// escape escapes a template node.
@ -58,15 +61,16 @@ func escape(c context, n parse.Node) context {
// escapeAction escapes an action template node.
func escapeAction(c context, n *parse.ActionNode) context {
sanitizer := "html"
if c.state == stateURL {
s := make([]string, 0, 2)
switch c.state {
case stateURL:
switch c.urlPart {
case urlPartNone:
sanitizer = "exp_template_html_urlfilter"
s = append(s, "exp_template_html_urlfilter")
case urlPartQueryOrFrag:
sanitizer = "urlquery"
s = append(s, "urlquery")
case urlPartPreQuery:
// The default "html" works here.
s = append(s, "html")
case urlPartUnknown:
return context{
state: stateError,
@ -76,21 +80,94 @@ func escapeAction(c context, n *parse.ActionNode) context {
default:
panic(c.urlPart.String())
}
case stateJS:
s = append(s, "exp_template_html_jsvalescaper")
if c.delim != delimNone {
s = append(s, "html")
}
case stateJSDqStr, stateJSSqStr:
s = append(s, "exp_template_html_jsstrescaper")
case stateJSRegexp:
s = append(s, "exp_template_html_jsregexpescaper")
case stateJSBlockCmt, stateJSLineCmt:
return context{
state: stateError,
errLine: n.Line,
errStr: fmt.Sprintf("%s appears inside a comment", n),
}
default:
s = append(s, "html")
}
// If the pipe already ends with the sanitizer, do not interfere.
if m := len(n.Pipe.Cmds); m != 0 {
if last := n.Pipe.Cmds[m-1]; len(last.Args) != 0 {
if i, ok := last.Args[0].(*parse.IdentifierNode); ok && i.Ident == sanitizer {
return c
ensurePipelineContains(n.Pipe, s)
return c
}
// ensurePipelineContains ensures that the pipeline has commands with
// the identifiers in s in order.
// If the pipeline already has some of the sanitizers, do not interfere.
// For example, if p is (.X | html) and s is ["escapeJSVal", "html"] then it
// has one matching, "html", and one to insert, "escapeJSVal", to produce
// (.X | escapeJSVal | html).
func ensurePipelineContains(p *parse.PipeNode, s []string) {
if len(s) == 0 {
return
}
n := len(p.Cmds)
// Find the identifiers at the end of the command chain.
idents := p.Cmds
for i := n - 1; i >= 0; i-- {
if cmd := p.Cmds[i]; len(cmd.Args) != 0 {
if _, ok := cmd.Args[0].(*parse.IdentifierNode); ok {
continue
}
}
idents = p.Cmds[i+1:]
}
dups := 0
for _, id := range idents {
if s[dups] == (id.Args[0].(*parse.IdentifierNode)).Ident {
dups++
if dups == len(s) {
return
}
}
}
// Otherwise, append the sanitizer.
n.Pipe.Cmds = append(n.Pipe.Cmds, &parse.CommandNode{
newCmds := make([]*parse.CommandNode, n-len(idents), n+len(s)-dups)
copy(newCmds, p.Cmds)
// Merge existing identifier commands with the sanitizers needed.
for _, id := range idents {
i := indexOfStr((id.Args[0].(*parse.IdentifierNode)).Ident, s)
if i != -1 {
for _, name := range s[:i] {
newCmds = append(newCmds, newIdentCmd(name))
}
s = s[i+1:]
}
newCmds = append(newCmds, id)
}
// Create any remaining sanitizers.
for _, name := range s {
newCmds = append(newCmds, newIdentCmd(name))
}
p.Cmds = newCmds
}
// indexOfStr is the least i such that strs[i] == s or -1 if s is not in strs.
func indexOfStr(s string, strs []string) int {
for i, t := range strs {
if s == t {
return i
}
}
return -1
}
// newIdentCmd produces a command containing a single identifier node.
func newIdentCmd(identifier string) *parse.CommandNode {
return &parse.CommandNode{
NodeType: parse.NodeCommand,
Args: []parse.Node{parse.NewIdentifier(sanitizer)},
})
return c
Args: []parse.Node{parse.NewIdentifier(identifier)},
}
}
// join joins the two contexts of a branch template node. The result is an
@ -203,11 +280,17 @@ func escapeText(c context, s []byte) context {
// A transition function takes a context and template text input, and returns
// the updated context and any unconsumed text.
var transitionFunc = [...]func(context, []byte) (context, []byte){
stateText: tText,
stateTag: tTag,
stateURL: tURL,
stateAttr: tAttr,
stateError: tError,
stateText: tText,
stateTag: tTag,
stateURL: tURL,
stateJS: tJS,
stateJSDqStr: tJSStr,
stateJSSqStr: tJSStr,
stateJSRegexp: tJSRegexp,
stateJSBlockCmt: tJSBlockCmt,
stateJSLineCmt: tJSLineCmt,
stateAttr: tAttr,
stateError: tError,
}
// tText is the context transition function for the text state.
@ -249,8 +332,11 @@ func tTag(c context, s []byte) (context, []byte) {
return context{state: stateTag}, nil
}
state := stateAttr
if urlAttr[strings.ToLower(string(s[attrStart:i]))] {
canonAttrName := strings.ToLower(string(s[attrStart:i]))
if urlAttr[canonAttrName] {
state = stateURL
} else if strings.HasPrefix(canonAttrName, "on") {
state = stateJS
}
// Look for the start of the value.
@ -268,16 +354,17 @@ func tTag(c context, s []byte) (context, []byte) {
i = eatWhiteSpace(s, i+1)
// Find the attribute delimiter.
delim := delimSpaceOrTagEnd
if i < len(s) {
switch s[i] {
case '\'':
return context{state: state, delim: delimSingleQuote}, s[i+1:]
delim, i = delimSingleQuote, i+1
case '"':
return context{state: state, delim: delimDoubleQuote}, s[i+1:]
delim, i = delimDoubleQuote, i+1
}
}
return context{state: state, delim: delimSpaceOrTagEnd}, s[i:]
return context{state: state, delim: delim}, s[i:]
}
// tAttr is the context transition function for the attribute state.
@ -295,6 +382,154 @@ func tURL(c context, s []byte) (context, []byte) {
return c, nil
}
// tJS is the context transition function for the JS state.
func tJS(c context, s []byte) (context, []byte) {
// TODO: delegate to tSpecialTagEnd to find any </script> once that CL
// has been merged.
i := bytes.IndexAny(s, `"'/`)
if i == -1 {
// Entire input is non string, comment, regexp tokens.
c.jsCtx = nextJSCtx(s, c.jsCtx)
return c, nil
}
c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
switch s[i] {
case '"':
c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
case '\'':
c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
case '/':
switch {
case i+1 < len(s) && s[i+1] == '/':
c.state = stateJSLineCmt
case i+1 < len(s) && s[i+1] == '*':
c.state = stateJSBlockCmt
case c.jsCtx == jsCtxRegexp:
c.state = stateJSRegexp
default:
c.jsCtx = jsCtxRegexp
}
default:
panic("unreachable")
}
return c, s[i+1:]
}
// tJSStr is the context transition function for the JS string states.
func tJSStr(c context, s []byte) (context, []byte) {
// TODO: delegate to tSpecialTagEnd to find any </script> once that CL
// has been merged.
quoteAndEsc := `\"`
if c.state == stateJSSqStr {
quoteAndEsc = `\'`
}
b := s
for {
i := bytes.IndexAny(b, quoteAndEsc)
if i == -1 {
return c, nil
}
if b[i] == '\\' {
i++
if i == len(b) {
return context{
state: stateError,
errStr: fmt.Sprintf("unfinished escape sequence in JS string: %q", s),
}, nil
}
} else {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, b[i+1:]
}
b = b[i+1:]
}
panic("unreachable")
}
// tJSRegexp is the context transition function for the /RegExp/ literal state.
func tJSRegexp(c context, s []byte) (context, []byte) {
// TODO: delegate to tSpecialTagEnd to find any </script> once that CL
// has been merged.
b := s
inCharset := false
for {
i := bytes.IndexAny(b, `/[\]`)
if i == -1 {
break
}
switch b[i] {
case '/':
if !inCharset {
c.state, c.jsCtx = stateJS, jsCtxDivOp
return c, b[i+1:]
}
case '\\':
i++
if i == len(b) {
return context{
state: stateError,
errStr: fmt.Sprintf("unfinished escape sequence in JS regexp: %q", s),
}, nil
}
case '[':
inCharset = true
case ']':
inCharset = false
default:
panic("unreachable")
}
b = b[i+1:]
}
if inCharset {
// This can be fixed by making context richer if interpolation
// into charsets is desired.
return context{
state: stateError,
errStr: fmt.Sprintf("unfinished JS regexp charset: %q", s),
}, nil
}
return c, nil
}
var blockCommentEnd = []byte("*/")
// tJSBlockCmt is the context transition function for the JS /*comment*/ state.
func tJSBlockCmt(c context, s []byte) (context, []byte) {
// TODO: delegate to tSpecialTagEnd to find any </script> once that CL
// has been merged.
i := bytes.Index(s, blockCommentEnd)
if i == -1 {
return c, nil
}
c.state = stateJS
return c, s[i+2:]
}
// tJSLineCmt is the context transition function for the JS //comment state.
func tJSLineCmt(c context, s []byte) (context, []byte) {
// TODO: delegate to tSpecialTagEnd to find any </script> once that CL
// has been merged.
i := bytes.IndexAny(s, "\r\n\u2028\u2029")
if i == -1 {
return c, nil
}
c.state = stateJS
// Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4
// "However, the LineTerminator at the end of the line is not
// considered to be part of the single-line comment; it is recognised
// separately by the lexical grammar and becomes part of the stream of
// input elements for the syntactic grammar."
return c, s[i:]
}
// tError is the context transition function for the error state.
func tError(c context, s []byte) (context, []byte) {
return c, nil

View File

@ -8,6 +8,7 @@ import (
"bytes"
"strings"
"template"
"template/parse"
"testing"
)
@ -16,6 +17,8 @@ func TestEscape(t *testing.T) {
F, T bool
C, G, H string
A, E []string
N int
Z *int
}{
F: false,
T: true,
@ -24,9 +27,11 @@ func TestEscape(t *testing.T) {
H: "<Hello>",
A: []string{"<a>", "<b>"},
E: []string{},
N: 42,
Z: nil,
}
var testCases = []struct {
tests := []struct {
name string
input string
output string
@ -141,29 +146,71 @@ func TestEscape(t *testing.T) {
`<a href="{{if .T}}/foo?a={{else}}/bar#{{end}}{{.C}}">`,
`<a href="/foo?a=%3CCincinatti%3E">`,
},
{
"jsStrValue",
"<button onclick='alert({{.H}})'>",
`<button onclick='alert(&#34;\u003cHello\u003e&#34;)'>`,
},
{
"jsNumericValue",
"<button onclick='alert({{.N}})'>",
`<button onclick='alert( 42 )'>`,
},
{
"jsBoolValue",
"<button onclick='alert({{.T}})'>",
`<button onclick='alert( true )'>`,
},
{
"jsNilValue",
"<button onclick='alert(typeof{{.Z}})'>",
`<button onclick='alert(typeof null )'>`,
},
{
"jsObjValue",
"<button onclick='alert({{.A}})'>",
`<button onclick='alert([&#34;\u003ca\u003e&#34;,&#34;\u003cb\u003e&#34;])'>`,
},
{
"jsObjValueNotOverEscaped",
"<button onclick='alert({{.A | html}})'>",
`<button onclick='alert([&#34;\u003ca\u003e&#34;,&#34;\u003cb\u003e&#34;])'>`,
},
{
"jsStr",
"<button onclick='alert(&quot;{{.H}}&quot;)'>",
`<button onclick='alert(&quot;\x3cHello\x3e&quot;)'>`,
},
{
"jsStrNotUnderEscaped",
"<button onclick='alert({{.C | urlquery}})'>",
// URL escaped, then quoted for JS.
`<button onclick='alert(&#34;%3CCincinatti%3E&#34;)'>`,
},
{
"jsRe",
"<button onclick='alert(&quot;{{.H}}&quot;)'>",
`<button onclick='alert(&quot;\x3cHello\x3e&quot;)'>`,
},
}
for _, tc := range testCases {
tmpl, err := template.New(tc.name).Parse(tc.input)
if err != nil {
t.Errorf("%s: template parsing failed: %s", tc.name, err)
continue
}
Escape(tmpl)
for _, test := range tests {
tmpl := template.Must(template.New(test.name).Parse(test.input))
tmpl, err := Escape(tmpl)
b := new(bytes.Buffer)
if err = tmpl.Execute(b, data); err != nil {
t.Errorf("%s: template execution failed: %s", tc.name, err)
t.Errorf("%s: template execution failed: %s", test.name, err)
continue
}
if w, g := tc.output, b.String(); w != g {
t.Errorf("%s: escaped output: want %q got %q", tc.name, w, g)
if w, g := test.output, b.String(); w != g {
t.Errorf("%s: escaped output: want\n\t%q\ngot\n\t%q", test.name, w, g)
continue
}
}
}
func TestErrors(t *testing.T) {
var testCases = []struct {
tests := []struct {
input string
err string
}{
@ -235,33 +282,53 @@ func TestErrors(t *testing.T) {
`<a href="{{if .F}}/foo?a={{else}}/bar/{{end}}{{.H}}">`,
"z:1: (action: [(command: [F=[H]])]) appears in an ambiguous URL context",
},
{
`<a onclick="alert('Hello \`,
`unfinished escape sequence in JS string: "Hello \\"`,
},
{
`<a onclick='alert("Hello\, World\`,
`unfinished escape sequence in JS string: "Hello\\, World\\"`,
},
{
`<a onclick='alert(/x+\`,
`unfinished escape sequence in JS regexp: "x+\\"`,
},
{
`<a onclick="/foo[\]/`,
`unfinished JS regexp charset: "foo[\\]/"`,
},
{
`<a onclick="/* alert({{.X}} */">`,
`z:1: (action: [(command: [F=[X]])]) appears inside a comment`,
},
{
`<a onclick="// alert({{.X}}">`,
`z:1: (action: [(command: [F=[X]])]) appears inside a comment`,
},
}
for _, tc := range testCases {
tmpl, err := template.New("z").Parse(tc.input)
if err != nil {
t.Errorf("input=%q: template parsing failed: %s", tc.input, err)
continue
}
for _, test := range tests {
tmpl := template.Must(template.New("z").Parse(test.input))
var got string
if _, err := Escape(tmpl); err != nil {
got = err.String()
}
if tc.err == "" {
if test.err == "" {
if got != "" {
t.Errorf("input=%q: unexpected error %q", tc.input, got)
t.Errorf("input=%q: unexpected error %q", test.input, got)
}
continue
}
if strings.Index(got, tc.err) == -1 {
t.Errorf("input=%q: error %q does not contain expected string %q", tc.input, got, tc.err)
if strings.Index(got, test.err) == -1 {
t.Errorf("input=%q: error %q does not contain expected string %q", test.input, got, test.err)
continue
}
}
}
func TestEscapeText(t *testing.T) {
var testCases = []struct {
tests := []struct {
input string
output context
}{
@ -378,18 +445,173 @@ func TestEscapeText(t *testing.T) {
`<input checked type="checkbox"`,
context{state: stateTag},
},
{
`<a onclick="`,
context{state: stateJS, delim: delimDoubleQuote},
},
{
`<a onclick="//foo`,
context{state: stateJSLineCmt, delim: delimDoubleQuote},
},
{
"<a onclick='//\n",
context{state: stateJS, delim: delimSingleQuote},
},
{
"<a onclick='//\r\n",
context{state: stateJS, delim: delimSingleQuote},
},
{
"<a onclick='//\u2028",
context{state: stateJS, delim: delimSingleQuote},
},
{
`<a onclick="/*`,
context{state: stateJSBlockCmt, delim: delimDoubleQuote},
},
{
`<a onkeypress="&quot;`,
context{state: stateJSDqStr, delim: delimDoubleQuote},
},
{
`<a onclick='&quot;foo&quot;`,
context{state: stateJS, delim: delimSingleQuote, jsCtx: jsCtxDivOp},
},
{
`<a onclick=&#39;foo&#39;`,
context{state: stateJS, delim: delimSpaceOrTagEnd, jsCtx: jsCtxDivOp},
},
{
`<a onclick=&#39;foo`,
context{state: stateJSSqStr, delim: delimSpaceOrTagEnd},
},
{
`<a onclick="&quot;foo'`,
context{state: stateJSDqStr, delim: delimDoubleQuote},
},
{
`<a onclick="'foo&quot;`,
context{state: stateJSSqStr, delim: delimDoubleQuote},
},
{
`<A ONCLICK="'`,
context{state: stateJSSqStr, delim: delimDoubleQuote},
},
{
`<a onclick="/`,
context{state: stateJSRegexp, delim: delimDoubleQuote},
},
{
`<a onclick="'foo'`,
context{state: stateJS, delim: delimDoubleQuote, jsCtx: jsCtxDivOp},
},
{
`<a onclick="'foo\'`,
context{state: stateJSSqStr, delim: delimDoubleQuote},
},
{
`<a onclick="'foo\'`,
context{state: stateJSSqStr, delim: delimDoubleQuote},
},
{
`<a onclick="/foo/`,
context{state: stateJS, delim: delimDoubleQuote, jsCtx: jsCtxDivOp},
},
{
`<a onclick="1 /foo`,
context{state: stateJS, delim: delimDoubleQuote, jsCtx: jsCtxDivOp},
},
{
`<a onclick="1 /*c*/ /foo`,
context{state: stateJS, delim: delimDoubleQuote, jsCtx: jsCtxDivOp},
},
{
`<a onclick="/foo[/]`,
context{state: stateJSRegexp, delim: delimDoubleQuote},
},
{
`<a onclick="/foo\/`,
context{state: stateJSRegexp, delim: delimDoubleQuote},
},
}
for _, tc := range testCases {
b := []byte(tc.input)
for _, test := range tests {
b := []byte(test.input)
c := escapeText(context{}, b)
if !tc.output.eq(c) {
t.Errorf("input %q: want context %v got %v", tc.input, tc.output, c)
if !test.output.eq(c) {
t.Errorf("input %q: want context\n\t%v\ngot\n\t%v", test.input, test.output, c)
continue
}
if tc.input != string(b) {
t.Errorf("input %q: text node was modified: want %q got %q", tc.input, tc.input, b)
if test.input != string(b) {
t.Errorf("input %q: text node was modified: want %q got %q", test.input, test.input, b)
continue
}
}
}
func TestEnsurePipelineContains(t *testing.T) {
tests := []struct {
input, output string
ids []string
}{
{
"{{.X}}",
"[(command: [F=[X]])]",
[]string{},
},
{
"{{.X | html}}",
"[(command: [F=[X]]) (command: [I=html])]",
[]string{},
},
{
"{{.X}}",
"[(command: [F=[X]]) (command: [I=html])]",
[]string{"html"},
},
{
"{{.X | html}}",
"[(command: [F=[X]]) (command: [I=html]) (command: [I=urlquery])]",
[]string{"urlquery"},
},
{
"{{.X | html | urlquery}}",
"[(command: [F=[X]]) (command: [I=html]) (command: [I=urlquery])]",
[]string{"urlquery"},
},
{
"{{.X | html | urlquery}}",
"[(command: [F=[X]]) (command: [I=html]) (command: [I=urlquery])]",
[]string{"html", "urlquery"},
},
{
"{{.X | html | urlquery}}",
"[(command: [F=[X]]) (command: [I=html]) (command: [I=urlquery])]",
[]string{"html"},
},
{
"{{.X | urlquery}}",
"[(command: [F=[X]]) (command: [I=html]) (command: [I=urlquery])]",
[]string{"html", "urlquery"},
},
{
"{{.X | html | print}}",
"[(command: [F=[X]]) (command: [I=urlquery]) (command: [I=html]) (command: [I=print])]",
[]string{"urlquery", "html"},
},
}
for _, test := range tests {
tmpl := template.Must(template.New("test").Parse(test.input))
action, ok := (tmpl.Tree.Root.Nodes[0].(*parse.ActionNode))
if !ok {
t.Errorf("First node is not an action: %s", test.input)
continue
}
pipe := action.Pipe
ensurePipelineContains(pipe, test.ids)
got := pipe.String()
if got != test.output {
t.Errorf("%s, %v: want\n\t%s\ngot\n\t%s", test.input, test.ids, test.output, got)
}
}
}

View File

@ -0,0 +1,344 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"fmt"
"json"
"strings"
"utf8"
)
// nextJSCtx returns the context that determines whether a slash after the
// given run of tokens tokens starts a regular expression instead of a division
// operator: / or /=.
//
// This assumes that the token run does not include any string tokens, comment
// tokens, regular expression literal tokens, or division operators.
//
// This fails on some valid but nonsensical JavaScript programs like
// "x = ++/foo/i" which is quite different than "x++/foo/i", but is not known to
// fail on any known useful programs. It is based on the draft
// JavaScript 2.0 lexical grammar and requires one token of lookbehind:
// http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
func nextJSCtx(s []byte, preceding jsCtx) jsCtx {
s = bytes.TrimRight(s, "\t\n\f\r \u2028\u2029")
if len(s) == 0 {
return preceding
}
// All cases below are in the single-byte UTF-8 group.
switch c, n := s[len(s)-1], len(s); c {
case '+', '-':
// ++ and -- are not regexp preceders, but + and - are whether
// they are used as infix or prefix operators.
start := n - 1
// Count the number of adjacent dashes or pluses.
for start > 0 && s[start-1] == c {
start--
}
if (n-start)&1 == 1 {
// Reached for trailing minus signs since "---" is the
// same as "-- -".
return jsCtxRegexp
}
return jsCtxDivOp
case '.':
// Handle "42."
if n != 1 && '0' <= s[n-2] && s[n-2] <= '9' {
return jsCtxDivOp
}
return jsCtxRegexp
// Suffixes for all punctuators from section 7.7 of the language spec
// that only end binary operators not handled above.
case ',', '<', '>', '=', '*', '%', '&', '|', '^', '?':
return jsCtxRegexp
// Suffixes for all punctuators from section 7.7 of the language spec
// that are prefix operators not handled above.
case '!', '~':
return jsCtxRegexp
// Matches all the punctuators from section 7.7 of the language spec
// that are open brackets not handled above.
case '(', '[':
return jsCtxRegexp
// Matches all the punctuators from section 7.7 of the language spec
// that precede expression starts.
case ':', ';', '{':
return jsCtxRegexp
// CAVEAT: the close punctuators ('}', ']', ')') precede div ops and
// are handled in the default except for '}' which can precede a
// division op as in
// ({ valueOf: function () { return 42 } } / 2
// which is valid, but, in practice, developers don't divide object
// literals, so our heuristic works well for code like
// function () { ... } /foo/.test(x) && sideEffect();
// The ')' punctuator can precede a regular expression as in
// if (b) /foo/.test(x) && ...
// but this is much less likely than
// (a + b) / c
case '}':
return jsCtxRegexp
default:
// Look for an IdentifierName and see if it is a keyword that
// can precede a regular expression.
j := n
for j > 0 && isJSIdentPart(int(s[j-1])) {
j--
}
if regexpPrecederKeywords[string(s[j:])] {
return jsCtxRegexp
}
}
// Otherwise is a punctuator not listed above, or
// a string which precedes a div op, or an identifier
// which precedes a div op.
return jsCtxDivOp
}
// regexPrecederKeywords is a set of reserved JS keywords that can precede a
// regular expression in JS source.
var regexpPrecederKeywords = map[string]bool{
"break": true,
"case": true,
"continue": true,
"delete": true,
"do": true,
"else": true,
"finally": true,
"in": true,
"instanceof": true,
"return": true,
"throw": true,
"try": true,
"typeof": true,
"void": true,
}
// jsValEscaper escapes its inputs to a JS Expression (section 11.14) that has
// nether side-effects nor free variables outside (NaN, Infinity).
func jsValEscaper(args ...interface{}) string {
var a interface{}
if len(args) == 1 {
a = args[0]
} else {
a = fmt.Sprint(args...)
}
// TODO: detect cycles before calling Marshal which loops infinitely on
// cyclic data. This may be an unnacceptable DoS risk.
// TODO: make sure that json.Marshal escapes codepoints U+2028 & U+2029
// so it falls within the subset of JSON which is valid JS and maybe
// post-process to prevent it from containing
// "<!--", "-->", "<![CDATA[", "]]>", or "</script"
// in case custom marshallers produce output containing those.
// TODO: Maybe abbreviate \u00ab to \xab to produce more compact output.
// TODO: JSON allows arbitrary unicode codepoints, but EcmaScript
// defines a SourceCharacter as either a UTF-16 or UCS-2 code-unit.
// Determine whether supplemental codepoints in UTF-8 encoded JS inside
// string literals are properly interpreted by major interpreters.
b, err := json.Marshal(a)
if err != nil {
// Put a space before comment so that if it is flush against
// a division operator it is not turned into a line comment:
// x/{{y}}
// turning into
// x//* error marshalling y:
// second line of error message */null
return fmt.Sprintf(" /* %s */null ", strings.Replace(err.String(), "*/", "* /", -1))
}
if len(b) != 0 {
first, _ := utf8.DecodeRune(b)
last, _ := utf8.DecodeLastRune(b)
if isJSIdentPart(first) || isJSIdentPart(last) {
return " " + string(b) + " "
}
}
return string(b)
}
// jsStrEscaper produces a string that can be included between quotes in
// JavaScript source, in JavaScript embedded in an HTML5 <script> element,
// or in an HTML5 event handler attribute such as onclick.
func jsStrEscaper(args ...interface{}) string {
ok := false
var s string
if len(args) == 1 {
s, ok = args[0].(string)
}
if !ok {
s = fmt.Sprint(args...)
}
var b bytes.Buffer
written := 0
for i, r := range s {
var repl string
switch r {
// All cases must appear in the IndexAny call above.
case 0:
repl = `\0`
case '\t':
repl = `\t`
case '\n':
repl = `\n`
case '\v':
// "\v" == "v" on IE 6.
repl = `\x0b`
case '\f':
repl = `\f`
case '\r':
repl = `\r`
// Encode HTML specials as hex so the output can be embedded
// in HTML attributes without further encoding.
case '"':
repl = `\x22`
case '&':
repl = `\x26`
case '\'':
repl = `\x27`
case '+':
repl = `\x2b`
case '/':
repl = `\/`
case '<':
repl = `\x3c`
case '>':
repl = `\x3e`
case '\\':
repl = `\\`
case '\u2028':
repl = `\u2028`
case '\u2029':
repl = `\u2029`
default:
continue
}
b.WriteString(s[written:i])
b.WriteString(repl)
written = i + utf8.RuneLen(r)
}
if b.Len() == 0 {
return s
}
b.WriteString(s[written:])
return b.String()
}
// jsRegexpEscaper behaves like jsStrEscaper but escapes regular expression
// specials so the result is treated literally when included in a regular
// expression literal. /foo{{.X}}bar/ matches the string "foo" followed by
// the literal text of {{.X}} followed by the string "bar".
func jsRegexpEscaper(args ...interface{}) string {
ok := false
var s string
if len(args) == 1 {
s, ok = args[0].(string)
}
if !ok {
s = fmt.Sprint(args...)
}
var b bytes.Buffer
written := 0
for i, r := range s {
var repl string
switch r {
// All cases must appear in the IndexAny call above.
case 0:
repl = `\0`
case '\t':
repl = `\t`
case '\n':
repl = `\n`
case '\v':
// "\v" == "v" on IE 6.
repl = `\x0b`
case '\f':
repl = `\f`
case '\r':
repl = `\r`
// Encode HTML specials as hex so the output can be embedded
// in HTML attributes without further encoding.
case '"':
repl = `\x22`
case '$':
repl = `\$`
case '&':
repl = `\x26`
case '\'':
repl = `\x27`
case '(':
repl = `\(`
case ')':
repl = `\)`
case '*':
repl = `\*`
case '+':
repl = `\x2b`
case '-':
repl = `\-`
case '.':
repl = `\.`
case '/':
repl = `\/`
case '<':
repl = `\x3c`
case '>':
repl = `\x3e`
case '?':
repl = `\?`
case '[':
repl = `\[`
case '\\':
repl = `\\`
case ']':
repl = `\]`
case '^':
repl = `\^`
case '{':
repl = `\{`
case '|':
repl = `\|`
case '}':
repl = `\}`
case '\u2028':
repl = `\u2028`
case '\u2029':
repl = `\u2029`
default:
continue
}
b.WriteString(s[written:i])
b.WriteString(repl)
written = i + utf8.RuneLen(r)
}
if b.Len() == 0 {
return s
}
b.WriteString(s[written:])
return b.String()
}
// isJSIdentPart is true if the given rune is a JS identifier part.
// It does not handle all the non-Latin letters, joiners, and combining marks,
// but it does handle every codepoint that can occur in a numeric literal or
// a keyword.
func isJSIdentPart(rune int) bool {
switch {
case '$' == rune:
return true
case '0' <= rune && rune <= '9':
return true
case 'A' <= rune && rune <= 'Z':
return true
case '_' == rune:
return true
case 'a' <= rune && rune <= 'z':
return true
}
return false
}

View File

@ -0,0 +1,352 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"math"
"strings"
"testing"
)
func TestNextJsCtx(t *testing.T) {
tests := []struct {
jsCtx jsCtx
s string
}{
// Statement terminators precede regexps.
{jsCtxRegexp, ";"},
// This is not airtight.
// ({ valueOf: function () { return 1 } } / 2)
// is valid JavaScript but in practice, devs do not do this.
// A block followed by a statement starting with a RegExp is
// much more common:
// while (x) {...} /foo/.test(x) || panic()
{jsCtxRegexp, "}"},
// But member, call, grouping, and array expression terminators
// precede div ops.
{jsCtxDivOp, ")"},
{jsCtxDivOp, "]"},
// At the start of a primary expression, array, or expression
// statement, expect a regexp.
{jsCtxRegexp, "("},
{jsCtxRegexp, "["},
{jsCtxRegexp, "{"},
// Assignment operators precede regexps as do all exclusively
// prefix and binary operators.
{jsCtxRegexp, "="},
{jsCtxRegexp, "+="},
{jsCtxRegexp, "*="},
{jsCtxRegexp, "*"},
{jsCtxRegexp, "!"},
// Whether the + or - is infix or prefix, it cannot precede a
// div op.
{jsCtxRegexp, "+"},
{jsCtxRegexp, "-"},
// An incr/decr op precedes a div operator.
// This is not airtight. In (g = ++/h/i) a regexp follows a
// pre-increment operator, but in practice devs do not try to
// increment or decrement regular expressions.
// (g++/h/i) where ++ is a postfix operator on g is much more
// common.
{jsCtxDivOp, "--"},
{jsCtxDivOp, "++"},
{jsCtxDivOp, "x--"},
// When we have many dashes or pluses, then they are grouped
// left to right.
{jsCtxRegexp, "x---"}, // A postfix -- then a -.
// return followed by a slash returns the regexp literal or the
// slash starts a regexp literal in an expression statement that
// is dead code.
{jsCtxRegexp, "return"},
{jsCtxRegexp, "return "},
{jsCtxRegexp, "return\t"},
{jsCtxRegexp, "return\n"},
{jsCtxRegexp, "return\u2028"},
// Identifiers can be divided and cannot validly be preceded by
// a regular expressions. Semicolon insertion cannot happen
// between an identifier and a regular expression on a new line
// because the one token lookahead for semicolon insertion has
// to conclude that it could be a div binary op and treat it as
// such.
{jsCtxDivOp, "x"},
{jsCtxDivOp, "x "},
{jsCtxDivOp, "x\t"},
{jsCtxDivOp, "x\n"},
{jsCtxDivOp, "x\u2028"},
{jsCtxDivOp, "preturn"},
// Numbers precede div ops.
{jsCtxDivOp, "0"},
// Dots that are part of a number are div preceders.
{jsCtxDivOp, "0."},
}
for _, test := range tests {
if nextJSCtx([]byte(test.s), jsCtxRegexp) != test.jsCtx {
t.Errorf("want %s got %q", test.jsCtx, test.s)
}
if nextJSCtx([]byte(test.s), jsCtxDivOp) != test.jsCtx {
t.Errorf("want %s got %q", test.jsCtx, test.s)
}
}
if nextJSCtx([]byte(" "), jsCtxRegexp) != jsCtxRegexp {
t.Error("Blank tokens")
}
if nextJSCtx([]byte(" "), jsCtxDivOp) != jsCtxDivOp {
t.Error("Blank tokens")
}
}
func TestJSValEscaper(t *testing.T) {
tests := []struct {
x interface{}
js string
}{
{int(42), " 42 "},
{uint(42), " 42 "},
{int16(42), " 42 "},
{uint16(42), " 42 "},
{int32(-42), " -42 "},
{uint32(42), " 42 "},
{int16(-42), " -42 "},
{uint16(42), " 42 "},
{int64(-42), " -42 "},
{uint64(42), " 42 "},
{uint64(1) << 53, " 9007199254740992 "},
// ulp(1 << 53) > 1 so this loses precision in JS
// but it is still a representable integer literal.
{uint64(1)<<53 + 1, " 9007199254740993 "},
{float32(1.0), " 1 "},
{float32(-1.0), " -1 "},
{float32(0.5), " 0.5 "},
{float32(-0.5), " -0.5 "},
{float32(1.0) / float32(256), " 0.00390625 "},
{float32(0), " 0 "},
{math.Copysign(0, -1), " -0 "},
{float64(1.0), " 1 "},
{float64(-1.0), " -1 "},
{float64(0.5), " 0.5 "},
{float64(-0.5), " -0.5 "},
{float64(0), " 0 "},
{math.Copysign(0, -1), " -0 "},
{"", `""`},
{"foo", `"foo"`},
// Newlines.
// {"\r\n\u2028\u2029", `"\r\n\u2028\u2029"`}, // TODO: FAILING. Maybe fix in json package.
// "\v" == "v" on IE 6 so use "\x0b" instead.
{"\t\x0b", `"\u0009\u000b"`},
{struct{ X, Y int }{1, 2}, `{"X":1,"Y":2}`},
{[]interface{}{}, "[]"},
{[]interface{}{42, "foo", nil}, `[42,"foo",null]`},
{"<!--", `"\u003c!--"`},
{"-->", `"--\u003e"`},
{"<![CDATA[", `"\u003c![CDATA["`},
{"]]>", `"]]\u003e"`},
{"</script", `"\u003c/script"`},
{"\U0001D11E", "\"\U0001D11E\""}, // or "\uD834\uDD1E"
}
for _, test := range tests {
if js := jsValEscaper(test.x); js != test.js {
t.Errorf("%+v: want\n\t%q\ngot\n\t%q", test.x, test.js, js)
}
// Make sure that escaping corner cases are not broken
// by nesting.
a := []interface{}{test.x}
want := "[" + strings.TrimSpace(test.js) + "]"
if js := jsValEscaper(a); js != want {
t.Errorf("%+v: want\n\t%q\ngot\n\t%q", a, want, js)
}
}
}
func TestJSStrEscaper(t *testing.T) {
tests := []struct {
x interface{}
esc string
}{
{"", ``},
{"foo", `foo`},
{"\u0000", `\0`},
{"\t", `\t`},
{"\n", `\n`},
{"\r", `\r`},
{"\u2028", `\u2028`},
{"\u2029", `\u2029`},
{"\\", `\\`},
{"\\n", `\\n`},
{"foo\r\nbar", `foo\r\nbar`},
// Preserve attribute boundaries.
{`"`, `\x22`},
{`'`, `\x27`},
// Allow embedding in HTML without further escaping.
{`&amp;`, `\x26amp;`},
// Prevent breaking out of text node and element boundaries.
{"</script>", `\x3c\/script\x3e`},
{"<![CDATA[", `\x3c![CDATA[`},
{"]]>", `]]\x3e`},
// http://dev.w3.org/html5/markup/aria/syntax.html#escaping-text-span
// "The text in style, script, title, and textarea elements
// must not have an escaping text span start that is not
// followed by an escaping text span end."
// Furthermore, spoofing an escaping text span end could lead
// to different interpretation of a </script> sequence otherwise
// masked by the escaping text span, and spoofing a start could
// allow regular text content to be interpreted as script
// allowing script execution via a combination of a JS string
// injection followed by an HTML text injection.
{"<!--", `\x3c!--`},
{"-->", `--\x3e`},
// From http://code.google.com/p/doctype/wiki/ArticleUtf7
{"+ADw-script+AD4-alert(1)+ADw-/script+AD4-",
`\x2bADw-script\x2bAD4-alert(1)\x2bADw-\/script\x2bAD4-`,
},
}
for _, test := range tests {
esc := jsStrEscaper(test.x)
if esc != test.esc {
t.Errorf("%q: want %q got %q", test.x, test.esc, esc)
}
}
}
func TestJSRegexpEscaper(t *testing.T) {
tests := []struct {
x interface{}
esc string
}{
{"", ``},
{"foo", `foo`},
{"\u0000", `\0`},
{"\t", `\t`},
{"\n", `\n`},
{"\r", `\r`},
{"\u2028", `\u2028`},
{"\u2029", `\u2029`},
{"\\", `\\`},
{"\\n", `\\n`},
{"foo\r\nbar", `foo\r\nbar`},
// Preserve attribute boundaries.
{`"`, `\x22`},
{`'`, `\x27`},
// Allow embedding in HTML without further escaping.
{`&amp;`, `\x26amp;`},
// Prevent breaking out of text node and element boundaries.
{"</script>", `\x3c\/script\x3e`},
{"<![CDATA[", `\x3c!\[CDATA\[`},
{"]]>", `\]\]\x3e`},
// Escaping text spans.
{"<!--", `\x3c!\-\-`},
{"-->", `\-\-\x3e`},
{"*", `\*`},
{"+", `\x2b`},
{"?", `\?`},
{"[](){}", `\[\]\(\)\{\}`},
{"$foo|x.y", `\$foo\|x\.y`},
{"x^y", `x\^y`},
}
for _, test := range tests {
esc := jsRegexpEscaper(test.x)
if esc != test.esc {
t.Errorf("%q: want %q got %q", test.x, test.esc, esc)
}
}
}
func TestEscapersOnLower7AndSelectHighCodepoints(t *testing.T) {
input := ("\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f" +
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !"#$%&'()*+,-./` +
`0123456789:;<=>?` +
`@ABCDEFGHIJKLMNO` +
`PQRSTUVWXYZ[\]^_` +
"`abcdefghijklmno" +
"pqrstuvwxyz{|}~\x7f" +
"\u00A0\u0100\u2028\u2029\ufeff\U0001D11E")
tests := []struct {
name string
escaper func(...interface{}) string
escaped string
}{
{
"jsStrEscaper",
jsStrEscaper,
"\\0\x01\x02\x03\x04\x05\x06\x07" +
"\x08\\t\\n\\x0b\\f\\r\x0E\x0F" +
"\x10\x11\x12\x13\x14\x15\x16\x17" +
"\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !\x22#$%\x26\x27()*\x2b,-.\/` +
`0123456789:;\x3c=\x3e?` +
`@ABCDEFGHIJKLMNO` +
`PQRSTUVWXYZ[\\]^_` +
"`abcdefghijklmno" +
"pqrstuvwxyz{|}~\x7f" +
"\u00A0\u0100\\u2028\\u2029\ufeff\U0001D11E",
},
{
"jsRegexpEscaper",
jsRegexpEscaper,
"\\0\x01\x02\x03\x04\x05\x06\x07" +
"\x08\\t\\n\\x0b\\f\\r\x0E\x0F" +
"\x10\x11\x12\x13\x14\x15\x16\x17" +
"\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !\x22#\$%\x26\x27\(\)\*\x2b,\-\.\/` +
`0123456789:;\x3c=\x3e\?` +
`@ABCDEFGHIJKLMNO` +
`PQRSTUVWXYZ\[\\\]\^_` +
"`abcdefghijklmno" +
`pqrstuvwxyz\{\|\}~` + "\u007f" +
"\u00A0\u0100\\u2028\\u2029\ufeff\U0001D11E",
},
}
for _, test := range tests {
if s := test.escaper(input); s != test.escaped {
t.Errorf("%s once: want\n\t%q\ngot\n\t%q", test.name, test.escaped, s)
continue
}
// Escape it rune by rune to make sure that any
// fast-path checking does not break escaping.
var buf bytes.Buffer
for _, c := range input {
buf.WriteString(test.escaper(string(c)))
}
if s := buf.String(); s != test.escaped {
t.Errorf("%s rune-wise: want\n\t%q\ngot\n\t%q", test.name, test.escaped, s)
continue
}
}
}
func BenchmarkJSStrEscaperNoSpecials(b *testing.B) {
for i := 0; i < b.N; i++ {
jsStrEscaper("The quick, brown fox jumps over the lazy dog.")
}
}
func BenchmarkJSStrEscaper(b *testing.B) {
for i := 0; i < b.N; i++ {
jsStrEscaper("The <i>quick</i>,\r\n<span style='color:brown'>brown</span> fox jumps\u2028over the <canine class=\"lazy\">dog</canine>")
}
}
func BenchmarkJSRegexpEscaperNoSpecials(b *testing.B) {
for i := 0; i < b.N; i++ {
jsRegexpEscaper("The quick, brown fox jumps over the lazy dog")
}
}
func BenchmarkJSRegexpEscaper(b *testing.B) {
for i := 0; i < b.N; i++ {
jsRegexpEscaper("The <i>quick</i>,\r\n<span style='color:brown'>brown</span> fox jumps\u2028over the <canine class=\"lazy\">dog</canine>")
}
}