| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package html |
| |
| import ( |
| "bytes" |
| "fmt" |
| "json" |
| "strings" |
| "utf8" |
| ) |
| |
| // nextJSCtx returns the context that determines whether a slash after the |
| // given run of tokens tokens starts a regular expression instead of a division |
| // operator: / or /=. |
| // |
| // This assumes that the token run does not include any string tokens, comment |
| // tokens, regular expression literal tokens, or division operators. |
| // |
| // This fails on some valid but nonsensical JavaScript programs like |
| // "x = ++/foo/i" which is quite different than "x++/foo/i", but is not known to |
| // fail on any known useful programs. It is based on the draft |
| // JavaScript 2.0 lexical grammar and requires one token of lookbehind: |
| // http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html |
| func nextJSCtx(s []byte, preceding jsCtx) jsCtx { |
| s = bytes.TrimRight(s, "\t\n\f\r \u2028\u2029") |
| if len(s) == 0 { |
| return preceding |
| } |
| |
| // All cases below are in the single-byte UTF-8 group. |
| switch c, n := s[len(s)-1], len(s); c { |
| case '+', '-': |
| // ++ and -- are not regexp preceders, but + and - are whether |
| // they are used as infix or prefix operators. |
| start := n - 1 |
| // Count the number of adjacent dashes or pluses. |
| for start > 0 && s[start-1] == c { |
| start-- |
| } |
| if (n-start)&1 == 1 { |
| // Reached for trailing minus signs since "---" is the |
| // same as "-- -". |
| return jsCtxRegexp |
| } |
| return jsCtxDivOp |
| case '.': |
| // Handle "42." |
| if n != 1 && '0' <= s[n-2] && s[n-2] <= '9' { |
| return jsCtxDivOp |
| } |
| return jsCtxRegexp |
| // Suffixes for all punctuators from section 7.7 of the language spec |
| // that only end binary operators not handled above. |
| case ',', '<', '>', '=', '*', '%', '&', '|', '^', '?': |
| return jsCtxRegexp |
| // Suffixes for all punctuators from section 7.7 of the language spec |
| // that are prefix operators not handled above. |
| case '!', '~': |
| return jsCtxRegexp |
| // Matches all the punctuators from section 7.7 of the language spec |
| // that are open brackets not handled above. |
| case '(', '[': |
| return jsCtxRegexp |
| // Matches all the punctuators from section 7.7 of the language spec |
| // that precede expression starts. |
| case ':', ';', '{': |
| return jsCtxRegexp |
| // CAVEAT: the close punctuators ('}', ']', ')') precede div ops and |
| // are handled in the default except for '}' which can precede a |
| // division op as in |
| // ({ valueOf: function () { return 42 } } / 2 |
| // which is valid, but, in practice, developers don't divide object |
| // literals, so our heuristic works well for code like |
| // function () { ... } /foo/.test(x) && sideEffect(); |
| // The ')' punctuator can precede a regular expression as in |
| // if (b) /foo/.test(x) && ... |
| // but this is much less likely than |
| // (a + b) / c |
| case '}': |
| return jsCtxRegexp |
| default: |
| // Look for an IdentifierName and see if it is a keyword that |
| // can precede a regular expression. |
| j := n |
| for j > 0 && isJSIdentPart(int(s[j-1])) { |
| j-- |
| } |
| if regexpPrecederKeywords[string(s[j:])] { |
| return jsCtxRegexp |
| } |
| } |
| // Otherwise is a punctuator not listed above, or |
| // a string which precedes a div op, or an identifier |
| // which precedes a div op. |
| return jsCtxDivOp |
| } |
| |
// regexpPrecederKeywords is the set of reserved JS keywords after which a
// slash starts a regular expression rather than a division operator.
var regexpPrecederKeywords = map[string]bool{
	// Keyword operators.
	"delete":     true,
	"in":         true,
	"instanceof": true,
	"typeof":     true,
	"void":       true,

	// Statement keywords.
	"break":    true,
	"case":     true,
	"continue": true,
	"return":   true,
	"throw":    true,

	// Block and branch introducers.
	"do":      true,
	"else":    true,
	"finally": true,
	"try":     true,
}
| |
| // jsValEscaper escapes its inputs to a JS Expression (section 11.14) that has |
| // nether side-effects nor free variables outside (NaN, Infinity). |
| func jsValEscaper(args ...interface{}) string { |
| var a interface{} |
| if len(args) == 1 { |
| a = args[0] |
| } else { |
| a = fmt.Sprint(args...) |
| } |
| // TODO: detect cycles before calling Marshal which loops infinitely on |
| // cyclic data. This may be an unnacceptable DoS risk. |
| |
| // TODO: make sure that json.Marshal escapes codepoints U+2028 & U+2029 |
| // so it falls within the subset of JSON which is valid JS and maybe |
| // post-process to prevent it from containing |
| // "<!--", "-->", "<![CDATA[", "]]>", or "</script" |
| // in case custom marshallers produce output containing those. |
| |
| // TODO: Maybe abbreviate \u00ab to \xab to produce more compact output. |
| |
| // TODO: JSON allows arbitrary unicode codepoints, but EcmaScript |
| // defines a SourceCharacter as either a UTF-16 or UCS-2 code-unit. |
| // Determine whether supplemental codepoints in UTF-8 encoded JS inside |
| // string literals are properly interpreted by major interpreters. |
| |
| b, err := json.Marshal(a) |
| if err != nil { |
| // Put a space before comment so that if it is flush against |
| // a division operator it is not turned into a line comment: |
| // x/{{y}} |
| // turning into |
| // x//* error marshalling y: |
| // second line of error message */null |
| return fmt.Sprintf(" /* %s */null ", strings.Replace(err.String(), "*/", "* /", -1)) |
| } |
| if len(b) != 0 { |
| first, _ := utf8.DecodeRune(b) |
| last, _ := utf8.DecodeLastRune(b) |
| if isJSIdentPart(first) || isJSIdentPart(last) { |
| return " " + string(b) + " " |
| } |
| } |
| return string(b) |
| } |
| |
// jsStrEscaper produces a string that can be included between quotes in
// JavaScript source, in JavaScript embedded in an HTML5 <script> element,
// or in an HTML5 event handler attribute such as onclick.
//
// A single string argument is escaped directly; any other argument list is
// first formatted with fmt.Sprint. When nothing needs escaping the input
// string is returned as is.
func jsStrEscaper(args ...interface{}) string {
	ok := false
	var s string
	if len(args) == 1 {
		s, ok = args[0].(string)
	}
	if !ok {
		s = fmt.Sprint(args...)
	}
	var b bytes.Buffer
	// written is the index in s just past the last byte copied into b.
	written := 0
	for i, r := range s {
		var repl string
		switch r {
		case 0:
			// Not `\0`: followed by a digit that would be read as a
			// legacy octal escape ("\x00" + "1" would decode as U+0001).
			repl = `\x00`
		case '\t':
			repl = `\t`
		case '\n':
			repl = `\n`
		case '\v':
			// "\v" == "v" on IE 6.
			repl = `\x0b`
		case '\f':
			repl = `\f`
		case '\r':
			repl = `\r`
		// Encode HTML specials as hex so the output can be embedded
		// in HTML attributes without further encoding.
		case '"':
			repl = `\x22`
		case '&':
			repl = `\x26`
		case '\'':
			repl = `\x27`
		case '+':
			repl = `\x2b`
		case '/':
			repl = `\/`
		case '<':
			repl = `\x3c`
		case '>':
			repl = `\x3e`
		case '\\':
			repl = `\\`
		// U+2028 and U+2029 are JavaScript line terminators.
		case '\u2028':
			repl = `\u2028`
		case '\u2029':
			repl = `\u2029`
		default:
			continue
		}
		// Flush the untouched text before this rune, then its escape.
		b.WriteString(s[written:i])
		b.WriteString(repl)
		written = i + utf8.RuneLen(r)
	}
	if b.Len() == 0 {
		// No rune was replaced.
		return s
	}
	b.WriteString(s[written:])
	return b.String()
}
| |
// jsRegexpEscaper behaves like jsStrEscaper but escapes regular expression
// specials so the result is treated literally when included in a regular
// expression literal. /foo{{.X}}bar/ matches the string "foo" followed by
// the literal text of {{.X}} followed by the string "bar".
//
// A single string argument is escaped directly; any other argument list is
// first formatted with fmt.Sprint. Empty input yields the empty group "(?:)"
// so that /{{.X}}/ cannot collapse into the line comment "//".
func jsRegexpEscaper(args ...interface{}) string {
	ok := false
	var s string
	if len(args) == 1 {
		s, ok = args[0].(string)
	}
	if !ok {
		s = fmt.Sprint(args...)
	}
	if s == "" {
		// /{{.X}}/ must not produce a line comment when .X is empty.
		return "(?:)"
	}
	var b bytes.Buffer
	// written is the index in s just past the last byte copied into b.
	written := 0
	for i, r := range s {
		var repl string
		switch r {
		case 0:
			// Not `\0`: followed by a digit that would be read as a
			// legacy octal escape rather than NUL and a digit.
			repl = `\x00`
		case '\t':
			repl = `\t`
		case '\n':
			repl = `\n`
		case '\v':
			// "\v" == "v" on IE 6.
			repl = `\x0b`
		case '\f':
			repl = `\f`
		case '\r':
			repl = `\r`
		// Encode HTML specials as hex so the output can be embedded
		// in HTML attributes without further encoding.
		case '"':
			repl = `\x22`
		case '$':
			repl = `\$`
		case '&':
			repl = `\x26`
		case '\'':
			repl = `\x27`
		case '(':
			repl = `\(`
		case ')':
			repl = `\)`
		case '*':
			repl = `\*`
		case '+':
			repl = `\x2b`
		case '-':
			repl = `\-`
		case '.':
			repl = `\.`
		case '/':
			repl = `\/`
		case '<':
			repl = `\x3c`
		case '>':
			repl = `\x3e`
		case '?':
			repl = `\?`
		case '[':
			repl = `\[`
		case '\\':
			repl = `\\`
		case ']':
			repl = `\]`
		case '^':
			repl = `\^`
		case '{':
			repl = `\{`
		case '|':
			repl = `\|`
		case '}':
			repl = `\}`
		// U+2028 and U+2029 are JavaScript line terminators.
		case '\u2028':
			repl = `\u2028`
		case '\u2029':
			repl = `\u2029`
		default:
			continue
		}
		// Flush the untouched text before this rune, then its escape.
		b.WriteString(s[written:i])
		b.WriteString(repl)
		written = i + utf8.RuneLen(r)
	}
	if b.Len() == 0 {
		// No rune was replaced.
		return s
	}
	b.WriteString(s[written:])
	return b.String()
}
| |
// isJSIdentPart reports whether the given rune is a JS identifier part.
// Only '$', '_', the ASCII digits and the ASCII letters are recognized: the
// non-Latin letters, joiners, and combining marks are not handled, but every
// codepoint that can occur in a numeric literal or a keyword is.
func isJSIdentPart(rune int) bool {
	return rune == '$' || rune == '_' ||
		('0' <= rune && rune <= '9') ||
		('A' <= rune && rune <= 'Z') ||
		('a' <= rune && rune <= 'z')
}