|  | // Copyright 2010 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // Package html provides functions for escaping and unescaping HTML text. | 
|  | package html | 
|  |  | 
|  | import ( | 
|  | "strings" | 
|  | "unicode/utf8" | 
|  | ) | 
|  |  | 
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they should be replaced with. It is indexed by (reference - 0x80).
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two related replacements are not stored here: 0x00->'\uFFFD' is handled
// programmatically, and 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D
	0x017D, // 0x8E
	0x008F, // 0x8F
	0x0090, // 0x90
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D
	0x017E, // 0x9E
	0x0178, // 0x9F
}
|  |  | 
|  | // unescapeEntity reads an entity like "<" from b[src:] and writes the | 
|  | // corresponding "<" to b[dst:], returning the incremented dst and src cursors. | 
|  | // Precondition: b[src] == '&' && dst <= src. | 
|  | func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { | 
|  | const attribute = false | 
|  |  | 
|  | // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference | 
|  |  | 
|  | // i starts at 1 because we already know that s[0] == '&'. | 
|  | i, s := 1, b[src:] | 
|  |  | 
|  | if len(s) <= 1 { | 
|  | b[dst] = b[src] | 
|  | return dst + 1, src + 1 | 
|  | } | 
|  |  | 
|  | if s[i] == '#' { | 
|  | if len(s) <= 3 { // We need to have at least "&#.". | 
|  | b[dst] = b[src] | 
|  | return dst + 1, src + 1 | 
|  | } | 
|  | i++ | 
|  | c := s[i] | 
|  | hex := false | 
|  | if c == 'x' || c == 'X' { | 
|  | hex = true | 
|  | i++ | 
|  | } | 
|  |  | 
|  | x := '\x00' | 
|  | for i < len(s) { | 
|  | c = s[i] | 
|  | i++ | 
|  | if hex { | 
|  | if '0' <= c && c <= '9' { | 
|  | x = 16*x + rune(c) - '0' | 
|  | continue | 
|  | } else if 'a' <= c && c <= 'f' { | 
|  | x = 16*x + rune(c) - 'a' + 10 | 
|  | continue | 
|  | } else if 'A' <= c && c <= 'F' { | 
|  | x = 16*x + rune(c) - 'A' + 10 | 
|  | continue | 
|  | } | 
|  | } else if '0' <= c && c <= '9' { | 
|  | x = 10*x + rune(c) - '0' | 
|  | continue | 
|  | } | 
|  | if c != ';' { | 
|  | i-- | 
|  | } | 
|  | break | 
|  | } | 
|  |  | 
|  | if i <= 3 { // No characters matched. | 
|  | b[dst] = b[src] | 
|  | return dst + 1, src + 1 | 
|  | } | 
|  |  | 
|  | if 0x80 <= x && x <= 0x9F { | 
|  | // Replace characters from Windows-1252 with UTF-8 equivalents. | 
|  | x = replacementTable[x-0x80] | 
|  | } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { | 
|  | // Replace invalid characters with the replacement character. | 
|  | x = '\uFFFD' | 
|  | } | 
|  |  | 
|  | return dst + utf8.EncodeRune(b[dst:], x), src + i | 
|  | } | 
|  |  | 
|  | // Consume the maximum number of characters possible, with the | 
|  | // consumed characters matching one of the named references. | 
|  |  | 
|  | for i < len(s) { | 
|  | c := s[i] | 
|  | i++ | 
|  | // Lower-cased characters are more common in entities, so we check for them first. | 
|  | if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { | 
|  | continue | 
|  | } | 
|  | if c != ';' { | 
|  | i-- | 
|  | } | 
|  | break | 
|  | } | 
|  |  | 
|  | entityName := s[1:i] | 
|  | if len(entityName) == 0 { | 
|  | // No-op. | 
|  | } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { | 
|  | // No-op. | 
|  | } else if x := entity[string(entityName)]; x != 0 { | 
|  | return dst + utf8.EncodeRune(b[dst:], x), src + i | 
|  | } else if x := entity2[string(entityName)]; x[0] != 0 { | 
|  | dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) | 
|  | return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i | 
|  | } else if !attribute { | 
|  | maxLen := len(entityName) - 1 | 
|  | if maxLen > longestEntityWithoutSemicolon { | 
|  | maxLen = longestEntityWithoutSemicolon | 
|  | } | 
|  | for j := maxLen; j > 1; j-- { | 
|  | if x := entity[string(entityName[:j])]; x != 0 { | 
|  | return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | dst1, src1 = dst+i, src+i | 
|  | copy(b[dst:dst1], b[src:src1]) | 
|  | return dst1, src1 | 
|  | } | 
|  |  | 
// htmlEscaper replaces each of the five characters that EscapeString escapes
// with its entity form.
var htmlEscaper = strings.NewReplacer(
	`&`, "&amp;",
	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
	`<`, "&lt;",
	`>`, "&gt;",
	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
)

// EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
	return htmlEscaper.Replace(s)
}
|  |  | 
|  | // UnescapeString unescapes entities like "<" to become "<". It unescapes a | 
|  | // larger range of entities than EscapeString escapes. For example, "á" | 
|  | // unescapes to "รก", as does "á" and "á". | 
|  | // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't | 
|  | // always true. | 
|  | func UnescapeString(s string) string { | 
|  | populateMapsOnce.Do(populateMaps) | 
|  | i := strings.IndexByte(s, '&') | 
|  |  | 
|  | if i < 0 { | 
|  | return s | 
|  | } | 
|  |  | 
|  | b := []byte(s) | 
|  | dst, src := unescapeEntity(b, i, i) | 
|  | for len(s[src:]) > 0 { | 
|  | if s[src] == '&' { | 
|  | i = 0 | 
|  | } else { | 
|  | i = strings.IndexByte(s[src:], '&') | 
|  | } | 
|  | if i < 0 { | 
|  | dst += copy(b[dst:], s[src:]) | 
|  | break | 
|  | } | 
|  |  | 
|  | if i > 0 { | 
|  | copy(b[dst:], s[src:src+i]) | 
|  | } | 
|  | dst, src = unescapeEntity(b, dst+i, src+i) | 
|  | } | 
|  | return string(b[:dst]) | 
|  | } |