// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package html import ( "bytes" "fmt" "json" "strings" "utf8" ) // nextJSCtx returns the context that determines whether a slash after the // given run of tokens tokens starts a regular expression instead of a division // operator: / or /=. // // This assumes that the token run does not include any string tokens, comment // tokens, regular expression literal tokens, or division operators. // // This fails on some valid but nonsensical JavaScript programs like // "x = ++/foo/i" which is quite different than "x++/foo/i", but is not known to // fail on any known useful programs. It is based on the draft // JavaScript 2.0 lexical grammar and requires one token of lookbehind: // http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html func nextJSCtx(s []byte, preceding jsCtx) jsCtx { s = bytes.TrimRight(s, "\t\n\f\r \u2028\u2029") if len(s) == 0 { return preceding } // All cases below are in the single-byte UTF-8 group. switch c, n := s[len(s)-1], len(s); c { case '+', '-': // ++ and -- are not regexp preceders, but + and - are whether // they are used as infix or prefix operators. start := n - 1 // Count the number of adjacent dashes or pluses. for start > 0 && s[start-1] == c { start-- } if (n-start)&1 == 1 { // Reached for trailing minus signs since "---" is the // same as "-- -". return jsCtxRegexp } return jsCtxDivOp case '.': // Handle "42." if n != 1 && '0' <= s[n-2] && s[n-2] <= '9' { return jsCtxDivOp } return jsCtxRegexp // Suffixes for all punctuators from section 7.7 of the language spec // that only end binary operators not handled above. case ',', '<', '>', '=', '*', '%', '&', '|', '^', '?': return jsCtxRegexp // Suffixes for all punctuators from section 7.7 of the language spec // that are prefix operators not handled above. case '!', '~': return jsCtxRegexp // Matches all the punctuators from section 7.7 of the language spec // that are open brackets not handled above. case '(', '[': return jsCtxRegexp // Matches all the punctuators from section 7.7 of the language spec // that precede expression starts. case ':', ';', '{': return jsCtxRegexp // CAVEAT: the close punctuators ('}', ']', ')') precede div ops and // are handled in the default except for '}' which can precede a // division op as in // ({ valueOf: function () { return 42 } } / 2 // which is valid, but, in practice, developers don't divide object // literals, so our heuristic works well for code like // function () { ... } /foo/.test(x) && sideEffect(); // The ')' punctuator can precede a regular expression as in // if (b) /foo/.test(x) && ... // but this is much less likely than // (a + b) / c case '}': return jsCtxRegexp default: // Look for an IdentifierName and see if it is a keyword that // can precede a regular expression. j := n for j > 0 && isJSIdentPart(int(s[j-1])) { j-- } if regexpPrecederKeywords[string(s[j:])] { return jsCtxRegexp } } // Otherwise is a punctuator not listed above, or // a string which precedes a div op, or an identifier // which precedes a div op. return jsCtxDivOp } // regexPrecederKeywords is a set of reserved JS keywords that can precede a // regular expression in JS source. var regexpPrecederKeywords = map[string]bool{ "break": true, "case": true, "continue": true, "delete": true, "do": true, "else": true, "finally": true, "in": true, "instanceof": true, "return": true, "throw": true, "try": true, "typeof": true, "void": true, } // jsValEscaper escapes its inputs to a JS Expression (section 11.14) that has // nether side-effects nor free variables outside (NaN, Infinity). func jsValEscaper(args ...interface{}) string { var a interface{} if len(args) == 1 { a = args[0] } else { a = fmt.Sprint(args...) } // TODO: detect cycles before calling Marshal which loops infinitely on // cyclic data. This may be an unnacceptable DoS risk. // TODO: make sure that json.Marshal escapes codepoints U+2028 & U+2029 // so it falls within the subset of JSON which is valid JS and maybe // post-process to prevent it from containing // "", "", or " element, // or in an HTML5 event handler attribute such as onclick. func jsStrEscaper(args ...interface{}) string { ok := false var s string if len(args) == 1 { s, ok = args[0].(string) } if !ok { s = fmt.Sprint(args...) } var b bytes.Buffer written := 0 for i, r := range s { var repl string switch r { case 0: repl = `\0` case '\t': repl = `\t` case '\n': repl = `\n` case '\v': // "\v" == "v" on IE 6. repl = `\x0b` case '\f': repl = `\f` case '\r': repl = `\r` // Encode HTML specials as hex so the output can be embedded // in HTML attributes without further encoding. case '"': repl = `\x22` case '&': repl = `\x26` case '\'': repl = `\x27` case '+': repl = `\x2b` case '/': repl = `\/` case '<': repl = `\x3c` case '>': repl = `\x3e` case '\\': repl = `\\` case '\u2028': repl = `\u2028` case '\u2029': repl = `\u2029` default: continue } b.WriteString(s[written:i]) b.WriteString(repl) written = i + utf8.RuneLen(r) } if written == 0 { return s } b.WriteString(s[written:]) return b.String() } // jsRegexpEscaper behaves like jsStrEscaper but escapes regular expression // specials so the result is treated literally when included in a regular // expression literal. /foo{{.X}}bar/ matches the string "foo" followed by // the literal text of {{.X}} followed by the string "bar". func jsRegexpEscaper(args ...interface{}) string { ok := false var s string if len(args) == 1 { s, ok = args[0].(string) } if !ok { s = fmt.Sprint(args...) } var b bytes.Buffer written := 0 for i, r := range s { var repl string switch r { case 0: repl = `\0` case '\t': repl = `\t` case '\n': repl = `\n` case '\v': // "\v" == "v" on IE 6. repl = `\x0b` case '\f': repl = `\f` case '\r': repl = `\r` // Encode HTML specials as hex so the output can be embedded // in HTML attributes without further encoding. case '"': repl = `\x22` case '$': repl = `\$` case '&': repl = `\x26` case '\'': repl = `\x27` case '(': repl = `\(` case ')': repl = `\)` case '*': repl = `\*` case '+': repl = `\x2b` case '-': repl = `\-` case '.': repl = `\.` case '/': repl = `\/` case '<': repl = `\x3c` case '>': repl = `\x3e` case '?': repl = `\?` case '[': repl = `\[` case '\\': repl = `\\` case ']': repl = `\]` case '^': repl = `\^` case '{': repl = `\{` case '|': repl = `\|` case '}': repl = `\}` case '\u2028': repl = `\u2028` case '\u2029': repl = `\u2029` default: continue } b.WriteString(s[written:i]) b.WriteString(repl) written = i + utf8.RuneLen(r) } if written == 0 { return s } b.WriteString(s[written:]) return b.String() } // isJSIdentPart is true if the given rune is a JS identifier part. // It does not handle all the non-Latin letters, joiners, and combining marks, // but it does handle every codepoint that can occur in a numeric literal or // a keyword. func isJSIdentPart(rune int) bool { switch { case '$' == rune: return true case '0' <= rune && rune <= '9': return true case 'A' <= rune && rune <= 'Z': return true case '_' == rune: return true case 'a' <= rune && rune <= 'z': return true } return false }