|  | // Copyright 2010 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | package html | 
|  |  | 
|  | import ( | 
|  | "bytes" | 
|  | "strings" | 
|  | "unicode/utf8" | 
|  | ) | 
|  |  | 
|  | // These replacements permit compatibility with old numeric entities that | 
|  | // assumed Windows-1252 encoding. | 
|  | // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference | 
|  | var replacementTable = [...]rune{ | 
|  | '\u20AC', // First entry is what 0x80 should be replaced with. | 
|  | '\u0081', | 
|  | '\u201A', | 
|  | '\u0192', | 
|  | '\u201E', | 
|  | '\u2026', | 
|  | '\u2020', | 
|  | '\u2021', | 
|  | '\u02C6', | 
|  | '\u2030', | 
|  | '\u0160', | 
|  | '\u2039', | 
|  | '\u0152', | 
|  | '\u008D', | 
|  | '\u017D', | 
|  | '\u008F', | 
|  | '\u0090', | 
|  | '\u2018', | 
|  | '\u2019', | 
|  | '\u201C', | 
|  | '\u201D', | 
|  | '\u2022', | 
|  | '\u2013', | 
|  | '\u2014', | 
|  | '\u02DC', | 
|  | '\u2122', | 
|  | '\u0161', | 
|  | '\u203A', | 
|  | '\u0153', | 
|  | '\u009D', | 
|  | '\u017E', | 
|  | '\u0178', // Last entry is 0x9F. | 
|  | // 0x00->'\uFFFD' is handled programmatically. | 
|  | // 0x0D->'\u000D' is a no-op. | 
|  | } | 
|  |  | 
|  | // unescapeEntity reads an entity like "<" from b[src:] and writes the | 
|  | // corresponding "<" to b[dst:], returning the incremented dst and src cursors. | 
|  | // Precondition: b[src] == '&' && dst <= src. | 
|  | // attribute should be true if parsing an attribute value. | 
|  | func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { | 
|  | // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference | 
|  |  | 
|  | // i starts at 1 because we already know that s[0] == '&'. | 
|  | i, s := 1, b[src:] | 
|  |  | 
|  | if len(s) <= 1 { | 
|  | b[dst] = b[src] | 
|  | return dst + 1, src + 1 | 
|  | } | 
|  |  | 
|  | if s[i] == '#' { | 
|  | if len(s) <= 3 { // We need to have at least "&#.". | 
|  | b[dst] = b[src] | 
|  | return dst + 1, src + 1 | 
|  | } | 
|  | i++ | 
|  | c := s[i] | 
|  | hex := false | 
|  | if c == 'x' || c == 'X' { | 
|  | hex = true | 
|  | i++ | 
|  | } | 
|  |  | 
|  | x := '\x00' | 
|  | for i < len(s) { | 
|  | c = s[i] | 
|  | i++ | 
|  | if hex { | 
|  | if '0' <= c && c <= '9' { | 
|  | x = 16*x + rune(c) - '0' | 
|  | continue | 
|  | } else if 'a' <= c && c <= 'f' { | 
|  | x = 16*x + rune(c) - 'a' + 10 | 
|  | continue | 
|  | } else if 'A' <= c && c <= 'F' { | 
|  | x = 16*x + rune(c) - 'A' + 10 | 
|  | continue | 
|  | } | 
|  | } else if '0' <= c && c <= '9' { | 
|  | x = 10*x + rune(c) - '0' | 
|  | continue | 
|  | } | 
|  | if c != ';' { | 
|  | i-- | 
|  | } | 
|  | break | 
|  | } | 
|  |  | 
|  | if i <= 3 { // No characters matched. | 
|  | b[dst] = b[src] | 
|  | return dst + 1, src + 1 | 
|  | } | 
|  |  | 
|  | if 0x80 <= x && x <= 0x9F { | 
|  | // Replace characters from Windows-1252 with UTF-8 equivalents. | 
|  | x = replacementTable[x-0x80] | 
|  | } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { | 
|  | // Replace invalid characters with the replacement character. | 
|  | x = '\uFFFD' | 
|  | } | 
|  |  | 
|  | return dst + utf8.EncodeRune(b[dst:], x), src + i | 
|  | } | 
|  |  | 
|  | // Consume the maximum number of characters possible, with the | 
|  | // consumed characters matching one of the named references. | 
|  |  | 
|  | for i < len(s) { | 
|  | c := s[i] | 
|  | i++ | 
|  | // Lower-cased characters are more common in entities, so we check for them first. | 
|  | if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { | 
|  | continue | 
|  | } | 
|  | if c != ';' { | 
|  | i-- | 
|  | } | 
|  | break | 
|  | } | 
|  |  | 
|  | entityName := string(s[1:i]) | 
|  | if entityName == "" { | 
|  | // No-op. | 
|  | } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { | 
|  | // No-op. | 
|  | } else if x := entity[entityName]; x != 0 { | 
|  | return dst + utf8.EncodeRune(b[dst:], x), src + i | 
|  | } else if x := entity2[entityName]; x[0] != 0 { | 
|  | dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) | 
|  | return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i | 
|  | } else if !attribute { | 
|  | maxLen := len(entityName) - 1 | 
|  | if maxLen > longestEntityWithoutSemicolon { | 
|  | maxLen = longestEntityWithoutSemicolon | 
|  | } | 
|  | for j := maxLen; j > 1; j-- { | 
|  | if x := entity[entityName[:j]]; x != 0 { | 
|  | return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | dst1, src1 = dst+i, src+i | 
|  | copy(b[dst:dst1], b[src:src1]) | 
|  | return dst1, src1 | 
|  | } | 
|  |  | 
|  | // unescape unescapes b's entities in-place, so that "a<b" becomes "a<b". | 
|  | // attribute should be true if parsing an attribute value. | 
|  | func unescape(b []byte, attribute bool) []byte { | 
|  | for i, c := range b { | 
|  | if c == '&' { | 
|  | dst, src := unescapeEntity(b, i, i, attribute) | 
|  | for src < len(b) { | 
|  | c := b[src] | 
|  | if c == '&' { | 
|  | dst, src = unescapeEntity(b, dst, src, attribute) | 
|  | } else { | 
|  | b[dst] = c | 
|  | dst, src = dst+1, src+1 | 
|  | } | 
|  | } | 
|  | return b[0:dst] | 
|  | } | 
|  | } | 
|  | return b | 
|  | } | 
|  |  | 
|  | // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc". | 
|  | func lower(b []byte) []byte { | 
|  | for i, c := range b { | 
|  | if 'A' <= c && c <= 'Z' { | 
|  | b[i] = c + 'a' - 'A' | 
|  | } | 
|  | } | 
|  | return b | 
|  | } | 
|  |  | 
|  | const escapedChars = "&'<>\"\r" | 
|  |  | 
|  | func escape(w writer, s string) error { | 
|  | i := strings.IndexAny(s, escapedChars) | 
|  | for i != -1 { | 
|  | if _, err := w.WriteString(s[:i]); err != nil { | 
|  | return err | 
|  | } | 
|  | var esc string | 
|  | switch s[i] { | 
|  | case '&': | 
|  | esc = "&" | 
|  | case '\'': | 
|  | // "'" is shorter than "'" and apos was not in HTML until HTML5. | 
|  | esc = "'" | 
|  | case '<': | 
|  | esc = "<" | 
|  | case '>': | 
|  | esc = ">" | 
|  | case '"': | 
|  | // """ is shorter than """. | 
|  | esc = """ | 
|  | case '\r': | 
|  | esc = "
" | 
|  | default: | 
|  | panic("unrecognized escape character") | 
|  | } | 
|  | s = s[i+1:] | 
|  | if _, err := w.WriteString(esc); err != nil { | 
|  | return err | 
|  | } | 
|  | i = strings.IndexAny(s, escapedChars) | 
|  | } | 
|  | _, err := w.WriteString(s) | 
|  | return err | 
|  | } | 
|  |  | 
|  | // EscapeString escapes special characters like "<" to become "<". It | 
|  | // escapes only five such characters: <, >, &, ' and ". | 
|  | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't | 
|  | // always true. | 
|  | func EscapeString(s string) string { | 
|  | if strings.IndexAny(s, escapedChars) == -1 { | 
|  | return s | 
|  | } | 
|  | var buf bytes.Buffer | 
|  | escape(&buf, s) | 
|  | return buf.String() | 
|  | } | 
|  |  | 
|  | // UnescapeString unescapes entities like "<" to become "<". It unescapes a | 
|  | // larger range of entities than EscapeString escapes. For example, "á" | 
|  | // unescapes to "รก", as does "á" and "&xE1;". | 
|  | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't | 
|  | // always true. | 
|  | func UnescapeString(s string) string { | 
|  | for _, c := range s { | 
|  | if c == '&' { | 
|  | return string(unescape([]byte(s), false)) | 
|  | } | 
|  | } | 
|  | return s | 
|  | } |