| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package html |
| |
| import ( |
| "bytes" |
| "fmt" |
| "os" |
| "strings" |
| ) |
| |
| // transitionFunc is the array of context transition functions for text nodes. |
| // A transition function takes a context and template text input, and returns |
| // the updated context and any unconsumed text. |
| var transitionFunc = [...]func(context, []byte) (context, []byte){ |
| stateText: tText, |
| stateTag: tTag, |
| stateComment: tComment, |
| stateRCDATA: tSpecialTagEnd, |
| stateAttr: tAttr, |
| stateURL: tURL, |
| stateJS: tJS, |
| stateJSDqStr: tJSStr, |
| stateJSSqStr: tJSStr, |
| stateJSRegexp: tJSRegexp, |
| stateJSBlockCmt: tBlockCmt, |
| stateJSLineCmt: tLineCmt, |
| stateCSS: tCSS, |
| stateCSSDqStr: tCSSStr, |
| stateCSSSqStr: tCSSStr, |
| stateCSSDqURL: tCSSStr, |
| stateCSSSqURL: tCSSStr, |
| stateCSSURL: tCSSStr, |
| stateCSSBlockCmt: tBlockCmt, |
| stateCSSLineCmt: tLineCmt, |
| stateError: tError, |
| } |
| |
// Delimiters that open and close an HTML comment.
var (
	commentStart = []byte("<!--")
	commentEnd   = []byte("-->")
)
| |
| // tText is the context transition function for the text state. |
| func tText(c context, s []byte) (context, []byte) { |
| for { |
| i := bytes.IndexByte(s, '<') |
| if i == -1 || i+1 == len(s) { |
| return c, nil |
| } else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) { |
| return context{state: stateComment}, s[i+4:] |
| } |
| i++ |
| if s[i] == '/' { |
| if i+1 == len(s) { |
| return c, nil |
| } |
| i++ |
| } |
| j, e := eatTagName(s, i) |
| if j != i { |
| // We've found an HTML tag. |
| return context{state: stateTag, element: e}, s[j:] |
| } |
| s = s[j:] |
| } |
| panic("unreachable") |
| } |
| |
| var elementContentType = [...]state{ |
| elementNone: stateText, |
| elementScript: stateJS, |
| elementStyle: stateCSS, |
| elementTextarea: stateRCDATA, |
| elementTitle: stateRCDATA, |
| } |
| |
| // tTag is the context transition function for the tag state. |
| func tTag(c context, s []byte) (context, []byte) { |
| // Find the attribute name. |
| attrStart := eatWhiteSpace(s, 0) |
| i, err := eatAttrName(s, attrStart) |
| if err != nil { |
| return context{ |
| state: stateError, |
| errStr: err.String(), |
| }, nil |
| } |
| if i == len(s) { |
| return c, nil |
| } |
| state := stateAttr |
| canonAttrName := strings.ToLower(string(s[attrStart:i])) |
| if urlAttr[canonAttrName] { |
| state = stateURL |
| } else if strings.HasPrefix(canonAttrName, "on") { |
| state = stateJS |
| } else if canonAttrName == "style" { |
| state = stateCSS |
| } |
| |
| // Look for the start of the value. |
| i = eatWhiteSpace(s, i) |
| if i == len(s) { |
| return c, s[i:] |
| } |
| if s[i] == '>' { |
| state = elementContentType[c.element] |
| return context{state: state, element: c.element}, s[i+1:] |
| } else if s[i] != '=' { |
| // Possible due to a valueless attribute or '/' in "<input />". |
| return c, s[i:] |
| } |
| // Consume the "=". |
| i = eatWhiteSpace(s, i+1) |
| |
| // Find the attribute delimiter. |
| delim := delimSpaceOrTagEnd |
| if i < len(s) { |
| switch s[i] { |
| case '\'': |
| delim, i = delimSingleQuote, i+1 |
| case '"': |
| delim, i = delimDoubleQuote, i+1 |
| } |
| } |
| |
| return context{state: state, delim: delim, element: c.element}, s[i:] |
| } |
| |
| // tComment is the context transition function for stateComment. |
| func tComment(c context, s []byte) (context, []byte) { |
| i := bytes.Index(s, commentEnd) |
| if i != -1 { |
| return context{}, s[i+3:] |
| } |
| return c, nil |
| } |
| |
| // specialTagEndMarkers maps element types to the character sequence that |
| // case-insensitively signals the end of the special tag body. |
| var specialTagEndMarkers = [...]string{ |
| elementScript: "</script", |
| elementStyle: "</style", |
| elementTextarea: "</textarea", |
| elementTitle: "</title", |
| } |
| |
| // tSpecialTagEnd is the context transition function for raw text and RCDATA |
| // element states. |
| func tSpecialTagEnd(c context, s []byte) (context, []byte) { |
| if c.element != elementNone { |
| end := specialTagEndMarkers[c.element] |
| i := strings.Index(strings.ToLower(string(s)), end) |
| if i != -1 { |
| return context{state: stateTag}, s[i+len(end):] |
| } |
| } |
| return c, nil |
| } |
| |
| // tAttr is the context transition function for the attribute state. |
| func tAttr(c context, s []byte) (context, []byte) { |
| return c, nil |
| } |
| |
| // tURL is the context transition function for the URL state. |
| func tURL(c context, s []byte) (context, []byte) { |
| if bytes.IndexAny(s, "#?") >= 0 { |
| c.urlPart = urlPartQueryOrFrag |
| } else if len(s) != 0 && c.urlPart == urlPartNone { |
| c.urlPart = urlPartPreQuery |
| } |
| return c, nil |
| } |
| |
| // tJS is the context transition function for the JS state. |
| func tJS(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| |
| i := bytes.IndexAny(s, `"'/`) |
| if i == -1 { |
| // Entire input is non string, comment, regexp tokens. |
| c.jsCtx = nextJSCtx(s, c.jsCtx) |
| return c, nil |
| } |
| c.jsCtx = nextJSCtx(s[:i], c.jsCtx) |
| switch s[i] { |
| case '"': |
| c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp |
| case '\'': |
| c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp |
| case '/': |
| switch { |
| case i+1 < len(s) && s[i+1] == '/': |
| c.state, i = stateJSLineCmt, i+1 |
| case i+1 < len(s) && s[i+1] == '*': |
| c.state, i = stateJSBlockCmt, i+1 |
| case c.jsCtx == jsCtxRegexp: |
| c.state = stateJSRegexp |
| case c.jsCtx == jsCtxDivOp: |
| c.jsCtx = jsCtxRegexp |
| default: |
| return context{ |
| state: stateError, |
| errStr: fmt.Sprintf("'/' could start div or regexp: %.32q", s[i:]), |
| }, nil |
| } |
| default: |
| panic("unreachable") |
| } |
| return c, s[i+1:] |
| } |
| |
| // tJSStr is the context transition function for the JS string states. |
| func tJSStr(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| |
| quoteAndEsc := `\"` |
| if c.state == stateJSSqStr { |
| quoteAndEsc = `\'` |
| } |
| |
| b := s |
| for { |
| i := bytes.IndexAny(b, quoteAndEsc) |
| if i == -1 { |
| return c, nil |
| } |
| if b[i] == '\\' { |
| i++ |
| if i == len(b) { |
| return context{ |
| state: stateError, |
| errStr: fmt.Sprintf("unfinished escape sequence in JS string: %q", s), |
| }, nil |
| } |
| } else { |
| c.state, c.jsCtx = stateJS, jsCtxDivOp |
| return c, b[i+1:] |
| } |
| b = b[i+1:] |
| } |
| panic("unreachable") |
| } |
| |
| // tJSRegexp is the context transition function for the /RegExp/ literal state. |
| func tJSRegexp(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| |
| b := s |
| inCharset := false |
| for { |
| i := bytes.IndexAny(b, `/[\]`) |
| if i == -1 { |
| break |
| } |
| switch b[i] { |
| case '/': |
| if !inCharset { |
| c.state, c.jsCtx = stateJS, jsCtxDivOp |
| return c, b[i+1:] |
| } |
| case '\\': |
| i++ |
| if i == len(b) { |
| return context{ |
| state: stateError, |
| errStr: fmt.Sprintf("unfinished escape sequence in JS regexp: %q", s), |
| }, nil |
| } |
| case '[': |
| inCharset = true |
| case ']': |
| inCharset = false |
| default: |
| panic("unreachable") |
| } |
| b = b[i+1:] |
| } |
| |
| if inCharset { |
| // This can be fixed by making context richer if interpolation |
| // into charsets is desired. |
| return context{ |
| state: stateError, |
| errStr: fmt.Sprintf("unfinished JS regexp charset: %q", s), |
| }, nil |
| } |
| |
| return c, nil |
| } |
| |
| var blockCommentEnd = []byte("*/") |
| |
| // tBlockCmt is the context transition function for /*comment*/ states. |
| func tBlockCmt(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| i := bytes.Index(s, blockCommentEnd) |
| if i == -1 { |
| return c, nil |
| } |
| switch c.state { |
| case stateJSBlockCmt: |
| c.state = stateJS |
| case stateCSSBlockCmt: |
| c.state = stateCSS |
| default: |
| panic(c.state.String()) |
| } |
| return c, s[i+2:] |
| } |
| |
| // tLineCmt is the context transition function for //comment states. |
| func tLineCmt(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| var lineTerminators string |
| var endState state |
| switch c.state { |
| case stateJSLineCmt: |
| lineTerminators, endState = "\n\r\u2028\u2029", stateJS |
| case stateCSSLineCmt: |
| lineTerminators, endState = "\n\f\r", stateCSS |
| // Line comments are not part of any published CSS standard but |
| // are supported by the 4 major browsers. |
| // This defines line comments as |
| // LINECOMMENT ::= "//" [^\n\f\d]* |
| // since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines |
| // newlines: |
| // nl ::= #xA | #xD #xA | #xD | #xC |
| default: |
| panic(c.state.String()) |
| } |
| |
| i := bytes.IndexAny(s, lineTerminators) |
| if i == -1 { |
| return c, nil |
| } |
| c.state = endState |
| // Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4 |
| // "However, the LineTerminator at the end of the line is not |
| // considered to be part of the single-line comment; it is recognised |
| // separately by the lexical grammar and becomes part of the stream of |
| // input elements for the syntactic grammar." |
| return c, s[i:] |
| } |
| |
| // tCSS is the context transition function for the CSS state. |
| func tCSS(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| |
| // CSS quoted strings are almost never used except for: |
| // (1) URLs as in background: "/foo.png" |
| // (2) Multiword font-names as in font-family: "Times New Roman" |
| // (3) List separators in content values as in inline-lists: |
| // <style> |
| // ul.inlineList { list-style: none; padding:0 } |
| // ul.inlineList > li { display: inline } |
| // ul.inlineList > li:before { content: ", " } |
| // ul.inlineList > li:first-child:before { content: "" } |
| // </style> |
| // <ul class=inlineList><li>One<li>Two<li>Three</ul> |
| // (4) Attribute value selectors as in a[href="http://example.com/"] |
| // |
| // We conservatively treat all strings as URLs, but make some |
| // allowances to avoid confusion. |
| // |
| // In (1), our conservative assumption is justified. |
| // In (2), valid font names do not contain ':', '?', or '#', so our |
| // conservative assumption is fine since we will never transition past |
| // urlPartPreQuery. |
| // In (3), our protocol heuristic should not be tripped, and there |
| // should not be non-space content after a '?' or '#', so as long as |
| // we only %-encode RFC 3986 reserved characters we are ok. |
| // In (4), we should URL escape for URL attributes, and for others we |
| // have the attribute name available if our conservative assumption |
| // proves problematic for real code. |
| |
| for { |
| i := bytes.IndexAny(s, `("'/`) |
| if i == -1 { |
| return c, nil |
| } |
| switch s[i] { |
| case '(': |
| // Look for url to the left. |
| p := bytes.TrimRight(s[:i], "\t\n\f\r ") |
| if endsWithCSSKeyword(p, "url") { |
| q := bytes.TrimLeft(s[i+1:], "\t\n\f\r ") |
| switch { |
| case len(q) != 0 && q[0] == '"': |
| c.state, s = stateCSSDqURL, q[1:] |
| case len(q) != 0 && q[0] == '\'': |
| c.state, s = stateCSSSqURL, q[1:] |
| |
| default: |
| c.state, s = stateCSSURL, q |
| } |
| return c, s |
| } |
| case '/': |
| if i+1 < len(s) { |
| switch s[i+1] { |
| case '/': |
| c.state = stateCSSLineCmt |
| return c, s[i+2:] |
| case '*': |
| c.state = stateCSSBlockCmt |
| return c, s[i+2:] |
| } |
| } |
| case '"': |
| c.state = stateCSSDqStr |
| return c, s[i+1:] |
| case '\'': |
| c.state = stateCSSSqStr |
| return c, s[i+1:] |
| } |
| s = s[i+1:] |
| } |
| panic("unreachable") |
| } |
| |
| // tCSSStr is the context transition function for the CSS string and URL states. |
| func tCSSStr(c context, s []byte) (context, []byte) { |
| if d, t := tSpecialTagEnd(c, s); t != nil { |
| return d, t |
| } |
| |
| var endAndEsc string |
| switch c.state { |
| case stateCSSDqStr, stateCSSDqURL: |
| endAndEsc = `\"` |
| case stateCSSSqStr, stateCSSSqURL: |
| endAndEsc = `\'` |
| case stateCSSURL: |
| // Unquoted URLs end with a newline or close parenthesis. |
| // The below includes the wc (whitespace character) and nl. |
| endAndEsc = "\\\t\n\f\r )" |
| default: |
| panic(c.state.String()) |
| } |
| |
| b := s |
| for { |
| i := bytes.IndexAny(b, endAndEsc) |
| if i == -1 { |
| return tURL(c, decodeCSS(b)) |
| } |
| if b[i] == '\\' { |
| i++ |
| if i == len(b) { |
| return context{ |
| state: stateError, |
| errStr: fmt.Sprintf("unfinished escape sequence in CSS string: %q", s), |
| }, nil |
| } |
| } else { |
| c.state = stateCSS |
| return c, b[i+1:] |
| } |
| c, _ = tURL(c, decodeCSS(b[:i+1])) |
| b = b[i+1:] |
| } |
| panic("unreachable") |
| } |
| |
| // tError is the context transition function for the error state. |
| func tError(c context, s []byte) (context, []byte) { |
| return c, nil |
| } |
| |
| // eatAttrName returns the largest j such that s[i:j] is an attribute name. |
| // It returns an error if s[i:] does not look like it begins with an |
| // attribute name, such as encountering a quote mark without a preceding |
| // equals sign. |
| func eatAttrName(s []byte, i int) (int, os.Error) { |
| for j := i; j < len(s); j++ { |
| switch s[j] { |
| case ' ', '\t', '\n', '\f', '\r', '=', '>': |
| return j, nil |
| case '\'', '"', '<': |
| // These result in a parse warning in HTML5 and are |
| // indicative of serious problems if seen in an attr |
| // name in a template. |
| return 0, fmt.Errorf("%q in attribute name: %.32q", s[j:j+1], s) |
| default: |
| // No-op. |
| } |
| } |
| return len(s), nil |
| } |
| |
| var elementNameMap = map[string]element{ |
| "script": elementScript, |
| "style": elementStyle, |
| "textarea": elementTextarea, |
| "title": elementTitle, |
| } |
| |
| // eatTagName returns the largest j such that s[i:j] is a tag name and the tag type. |
| func eatTagName(s []byte, i int) (int, element) { |
| j := i |
| for ; j < len(s); j++ { |
| x := s[j] |
| if !(('a' <= x && x <= 'z') || |
| ('A' <= x && x <= 'Z') || |
| ('0' <= x && x <= '9' && i != j)) { |
| break |
| } |
| } |
| return j, elementNameMap[strings.ToLower(string(s[i:j]))] |
| } |
| |
// eatWhiteSpace returns the largest j such that s[i:j] is white space,
// where white space is any of space, tab, newline, form feed and carriage
// return. It returns len(s) when everything from i onwards is white space.
func eatWhiteSpace(s []byte, i int) int {
	for j := i; j < len(s); j++ {
		if b := s[j]; b != ' ' && b != '\t' && b != '\n' && b != '\f' && b != '\r' {
			return j
		}
	}
	return len(s)
}
| |
// urlAttr is the set of lower-case attribute names whose values are URLs.
// It is made up of every "%URI"-typed attribute listed at
//     http://www.w3.org/TR/html4/index/attributes.html
// plus the attributes listed at
//     http://dev.w3.org/html5/spec/index.html#attributes-1
// whose Value column in that table matches
// "Valid [non-empty] URL potentially surrounded by spaces".
var urlAttr = map[string]bool{
	"action":     true,
	"archive":    true,
	"background": true,
	"cite":       true,
	"classid":    true,
	"codebase":   true,
	"data":       true,
	"formaction": true,
	"href":       true,
	"icon":       true,
	"longdesc":   true,
	"manifest":   true,
	"poster":     true,
	"profile":    true,
	"src":        true,
	"usemap":     true,
}