blob: d881328c93cb2e78aa0874b0616f4adcb299ba8c [file] [log] [blame]
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"fmt"
"unicode"
"utf8"
)
// endsWithCSSKeyword returns whether b ends with an ident that
// case-insensitively matches the lower-case kw.
func endsWithCSSKeyword(b []byte, kw string) bool {
i := len(b) - len(kw)
if i < 0 {
// Too short.
return false
}
if i != 0 {
r, _ := utf8.DecodeLastRune(b[:i])
if isCSSNmchar(r) {
// Too long.
return false
}
}
// Many CSS keywords, such as "!important" can have characters encoded,
// but the URI production does not allow that according to
// http://www.w3.org/TR/css3-syntax/#TOK-URI
// This does not attempt to recognize encoded keywords. For example,
// given "\75\72\6c" and "url" this return false.
return string(bytes.ToLower(b[i:])) == kw
}
// isCSSNmchar returns whether rune is allowed anywhere in a CSS identifier.
func isCSSNmchar(rune int) bool {
// Based on the CSS3 nmchar production but ignores multi-rune escape
// sequences.
// http://www.w3.org/TR/css3-syntax/#SUBTOK-nmchar
return 'a' <= rune && rune <= 'z' ||
'A' <= rune && rune <= 'Z' ||
'0' <= rune && rune <= '9' ||
'-' == rune ||
'_' == rune ||
// Non-ASCII cases below.
0x80 <= rune && rune <= 0xd7ff ||
0xe000 <= rune && rune <= 0xfffd ||
0x10000 <= rune && rune <= 0x10ffff
}
// decodeCSS decodes CSS3 escapes given a sequence of stringchars.
// If there is no change, it returns the input, otherwise it returns a slice
// backed by a new array.
// http://www.w3.org/TR/css3-syntax/#SUBTOK-stringchar defines stringchar.
func decodeCSS(s []byte) []byte {
i := bytes.IndexByte(s, '\\')
if i == -1 {
return s
}
// The UTF-8 sequence for a codepoint is never longer than 1 + the
// number hex digits need to represent that codepoint, so len(s) is an
// upper bound on the output length.
b := make([]byte, 0, len(s))
for len(s) != 0 {
i := bytes.IndexByte(s, '\\')
if i == -1 {
i = len(s)
}
b, s = append(b, s[:i]...), s[i:]
if len(s) < 2 {
break
}
// http://www.w3.org/TR/css3-syntax/#SUBTOK-escape
// escape ::= unicode | '\' [#x20-#x7E#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF]
if isHex(s[1]) {
// http://www.w3.org/TR/css3-syntax/#SUBTOK-unicode
// unicode ::= '\' [0-9a-fA-F]{1,6} wc?
j := 2
for j < len(s) && j < 7 && isHex(s[j]) {
j++
}
rune := hexDecode(s[1:j])
if rune > unicode.MaxRune {
rune, j = rune/16, j-1
}
n := utf8.EncodeRune(b[len(b):cap(b)], rune)
// The optional space at the end allows a hex
// sequence to be followed by a literal hex.
// string(decodeCSS([]byte(`\A B`))) == "\nB"
b, s = b[:len(b)+n], skipCSSSpace(s[j:])
} else {
// `\\` decodes to `\` and `\"` to `"`.
_, n := utf8.DecodeRune(s[1:])
b, s = append(b, s[1:1+n]...), s[1+n:]
}
}
return b
}
// isHex returns whether the given character is a hex digit.
func isHex(c byte) bool {
return '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F'
}
// hexDecode decodes a short hex digit sequence: "10" -> 16.
func hexDecode(s []byte) int {
n := 0
for _, c := range s {
n <<= 4
switch {
case '0' <= c && c <= '9':
n |= int(c - '0')
case 'a' <= c && c <= 'f':
n |= int(c-'a') + 10
case 'A' <= c && c <= 'F':
n |= int(c-'A') + 10
default:
panic(fmt.Sprintf("Bad hex digit in %q", s))
}
}
return n
}
// skipCSSSpace returns a suffix of c, skipping over a single space.
func skipCSSSpace(c []byte) []byte {
if len(c) == 0 {
return c
}
// wc ::= #x9 | #xA | #xC | #xD | #x20
switch c[0] {
case '\t', '\n', '\f', ' ':
return c[1:]
case '\r':
// This differs from CSS3's wc production because it contains a
// probable spec error whereby wc contains all the single byte
// sequences in nl (newline) but not CRLF.
if len(c) >= 2 && c[1] == '\n' {
return c[2:]
}
return c[1:]
}
return c
}
// cssEscaper escapes HTML and CSS special characters using \<hex>+ escapes.
func cssEscaper(args ...interface{}) string {
s, _ := stringify(args...)
var b bytes.Buffer
written := 0
for i, r := range s {
var repl string
switch r {
case 0:
repl = `\0`
case '\t':
repl = `\9`
case '\n':
repl = `\a`
case '\f':
repl = `\c`
case '\r':
repl = `\d`
// Encode HTML specials as hex so the output can be embedded
// in HTML attributes without further encoding.
case '"':
repl = `\22`
case '&':
repl = `\26`
case '\'':
repl = `\27`
case '(':
repl = `\28`
case ')':
repl = `\29`
case '+':
repl = `\2b`
case '/':
repl = `\2f`
case ':':
repl = `\3a`
case ';':
repl = `\3b`
case '<':
repl = `\3c`
case '>':
repl = `\3e`
case '\\':
repl = `\\`
case '{':
repl = `\7b`
case '}':
repl = `\7d`
default:
continue
}
b.WriteString(s[written:i])
b.WriteString(repl)
written = i + utf8.RuneLen(r)
if repl != `\\` && (written == len(s) || isHex(s[written])) {
b.WriteByte(' ')
}
}
if written == 0 {
return s
}
b.WriteString(s[written:])
return b.String()
}
var expressionBytes = []byte("expression")
var mozBindingBytes = []byte("mozbinding")
// cssValueFilter allows innocuous CSS values in the output including CSS
// quantities (10px or 25%), ID or class literals (#foo, .bar), keyword values
// (inherit, blue), and colors (#888).
// It filters out unsafe values, such as those that affect token boundaries,
// and anything that might execute scripts.
func cssValueFilter(args ...interface{}) string {
s, t := stringify(args...)
if t == contentTypeCSS {
return s
}
b, id := decodeCSS([]byte(s)), make([]byte, 0, 64)
// CSS3 error handling is specified as honoring string boundaries per
// http://www.w3.org/TR/css3-syntax/#error-handling :
// Malformed declarations. User agents must handle unexpected
// tokens encountered while parsing a declaration by reading until
// the end of the declaration, while observing the rules for
// matching pairs of (), [], {}, "", and '', and correctly handling
// escapes. For example, a malformed declaration may be missing a
// property, colon (:) or value.
// So we need to make sure that values do not have mismatched bracket
// or quote characters to prevent the browser from restarting parsing
// inside a string that might embed JavaScript source.
for i, c := range b {
switch c {
case 0, '"', '\'', '(', ')', '/', ';', '@', '[', '\\', ']', '`', '{', '}':
return filterFailsafe
case '-':
// Disallow <!-- or -->.
// -- should not appear in valid identifiers.
if i != 0 && '-' == b[i-1] {
return filterFailsafe
}
default:
if c < 0x80 && isCSSNmchar(int(c)) {
id = append(id, c)
}
}
}
id = bytes.ToLower(id)
if bytes.Index(id, expressionBytes) != -1 || bytes.Index(id, mozBindingBytes) != -1 {
return filterFailsafe
}
return string(b)
}