| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package html provides functions for escaping and unescaping HTML text. |
| package html |
| |
| import ( |
| "strings" |
| "unicode/utf8" |
| ) |
| |
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents used assuming the Windows-1252 encoding, to the code
// points they are replaced with. It is indexed by the reference value minus
// 0x80.
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
var replacementTable = [...]rune{
	0x20AC, // 0x80
	0x0081, // 0x81
	0x201A, // 0x82
	0x0192, // 0x83
	0x201E, // 0x84
	0x2026, // 0x85
	0x2020, // 0x86
	0x2021, // 0x87
	0x02C6, // 0x88
	0x2030, // 0x89
	0x0160, // 0x8A
	0x2039, // 0x8B
	0x0152, // 0x8C
	0x008D, // 0x8D
	0x017D, // 0x8E
	0x008F, // 0x8F
	0x0090, // 0x90
	0x2018, // 0x91
	0x2019, // 0x92
	0x201C, // 0x93
	0x201D, // 0x94
	0x2022, // 0x95
	0x2013, // 0x96
	0x2014, // 0x97
	0x02DC, // 0x98
	0x2122, // 0x99
	0x0161, // 0x9A
	0x203A, // 0x9B
	0x0153, // 0x9C
	0x009D, // 0x9D
	0x017E, // 0x9E
	0x0178, // 0x9F
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
| |
| // unescapeEntity reads an entity like "<" from b[src:] and writes the |
| // corresponding "<" to b[dst:], returning the incremented dst and src cursors. |
| // Precondition: b[src] == '&' && dst <= src. |
| func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { |
| const attribute = false |
| |
| // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference |
| |
| // i starts at 1 because we already know that s[0] == '&'. |
| i, s := 1, b[src:] |
| |
| if len(s) <= 1 { |
| b[dst] = b[src] |
| return dst + 1, src + 1 |
| } |
| |
| if s[i] == '#' { |
| if len(s) <= 3 { // We need to have at least "&#.". |
| b[dst] = b[src] |
| return dst + 1, src + 1 |
| } |
| i++ |
| c := s[i] |
| hex := false |
| if c == 'x' || c == 'X' { |
| hex = true |
| i++ |
| } |
| |
| x := '\x00' |
| for i < len(s) { |
| c = s[i] |
| i++ |
| if hex { |
| if '0' <= c && c <= '9' { |
| x = 16*x + rune(c) - '0' |
| continue |
| } else if 'a' <= c && c <= 'f' { |
| x = 16*x + rune(c) - 'a' + 10 |
| continue |
| } else if 'A' <= c && c <= 'F' { |
| x = 16*x + rune(c) - 'A' + 10 |
| continue |
| } |
| } else if '0' <= c && c <= '9' { |
| x = 10*x + rune(c) - '0' |
| continue |
| } |
| if c != ';' { |
| i-- |
| } |
| break |
| } |
| |
| if i <= 3 { // No characters matched. |
| b[dst] = b[src] |
| return dst + 1, src + 1 |
| } |
| |
| if 0x80 <= x && x <= 0x9F { |
| // Replace characters from Windows-1252 with UTF-8 equivalents. |
| x = replacementTable[x-0x80] |
| } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { |
| // Replace invalid characters with the replacement character. |
| x = '\uFFFD' |
| } |
| |
| return dst + utf8.EncodeRune(b[dst:], x), src + i |
| } |
| |
| // Consume the maximum number of characters possible, with the |
| // consumed characters matching one of the named references. |
| |
| for i < len(s) { |
| c := s[i] |
| i++ |
| // Lower-cased characters are more common in entities, so we check for them first. |
| if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { |
| continue |
| } |
| if c != ';' { |
| i-- |
| } |
| break |
| } |
| |
| entityName := s[1:i] |
| if len(entityName) == 0 { |
| // No-op. |
| } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { |
| // No-op. |
| } else if x := entity[string(entityName)]; x != 0 { |
| return dst + utf8.EncodeRune(b[dst:], x), src + i |
| } else if x := entity2[string(entityName)]; x[0] != 0 { |
| dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) |
| return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i |
| } else if !attribute { |
| maxLen := len(entityName) - 1 |
| if maxLen > longestEntityWithoutSemicolon { |
| maxLen = longestEntityWithoutSemicolon |
| } |
| for j := maxLen; j > 1; j-- { |
| if x := entity[string(entityName[:j])]; x != 0 { |
| return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 |
| } |
| } |
| } |
| |
| dst1, src1 = dst+i, src+i |
| copy(b[dst:dst1], b[src:src1]) |
| return dst1, src1 |
| } |
| |
// htmlEscaper replaces each of the five characters &, ', <, > and " with a
// character reference.
var htmlEscaper = strings.NewReplacer(
	`&`, "&amp;",
	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
	`<`, "&lt;",
	`>`, "&gt;",
	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
)

// EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
	return htmlEscaper.Replace(s)
}
| |
| // UnescapeString unescapes entities like "<" to become "<". It unescapes a |
| // larger range of entities than EscapeString escapes. For example, "á" |
| // unescapes to "รก", as does "á" and "á". |
| // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't |
| // always true. |
| func UnescapeString(s string) string { |
| populateMapsOnce.Do(populateMaps) |
| i := strings.IndexByte(s, '&') |
| |
| if i < 0 { |
| return s |
| } |
| |
| b := []byte(s) |
| dst, src := unescapeEntity(b, i, i) |
| for len(s[src:]) > 0 { |
| if s[src] == '&' { |
| i = 0 |
| } else { |
| i = strings.IndexByte(s[src:], '&') |
| } |
| if i < 0 { |
| dst += copy(b[dst:], s[src:]) |
| break |
| } |
| |
| if i > 0 { |
| copy(b[dst:], s[src:src+i]) |
| } |
| dst, src = unescapeEntity(b, dst+i, src+i) |
| } |
| return string(b[:dst]) |
| } |