// Source blob: dc126b788a7ed402da177668df52bf0ade791c70
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package markdown
import (
"bytes"
"strconv"
"strings"
"unicode"
)
// An HTMLBlock is a block made of raw HTML lines.
// The htmlBuilder in this file constructs it from the lines it accumulated.
type HTMLBlock struct {
Position
// Text holds the lines of the block, one string per line and without
// line terminators; PrintHTML adds a newline after each when printing.
Text []string
}
// PrintHTML writes every line of the block to buf, each followed by
// a single newline. The lines themselves are emitted verbatim.
func (b *HTMLBlock) PrintHTML(buf *bytes.Buffer) {
	for i := range b.Text {
		buf.WriteString(b.Text[i])
		buf.WriteByte('\n')
	}
}
// printMarkdown writes the block as Markdown: one leading prefix taken
// from s (prefix1 when it is non-empty, otherwise prefix), followed by
// the same output PrintHTML produces.
func (b *HTMLBlock) printMarkdown(buf *bytes.Buffer, s mdState) {
	lead := s.prefix1
	if lead == "" {
		lead = s.prefix
	}
	buf.WriteString(lead)
	b.PrintHTML(buf)
}
// An htmlBuilder accumulates the lines of an HTML block while it is open.
type htmlBuilder struct {
// endBlank, when set, makes extend stop at the first blank line
// (the blank line itself is not added to text).
endBlank bool
// text is the list of lines collected so far.
text []string
// endFunc, when non-nil, is called with each line added by extend;
// a true result ends the block after that line.
endFunc func(string) bool
}
// extend offers the line s to the block being built.
//
// If the builder ends on blank lines and s is blank, s is handed back
// untouched and the result is false. Otherwise the line's text is
// appended to the block and an empty line is returned; the boolean is
// then false only when endFunc is set and reports that this line
// contains the block's terminator.
func (c *htmlBuilder) extend(p *parseState, s line) (line, bool) {
	if c.endBlank {
		if s.isBlank() {
			return s, false
		}
	}
	text := s.string()
	c.text = append(c.text, text)
	stillOpen := c.endFunc == nil || !c.endFunc(text)
	return line{}, stillOpen
}
// build turns the accumulated lines into an *HTMLBlock positioned
// at p.pos().
func (c *htmlBuilder) build(p buildState) Block {
	return &HTMLBlock{
		Position: p.pos(),
		Text:     c.text,
	}
}
// newHTML tries to start an HTML block at line s.
// On success the whole line has been consumed, so an empty line and true
// are returned; otherwise s is returned unchanged together with false.
func newHTML(p *parseState, s line) (line, bool) {
	probe := s // startHTML takes a pointer; give it a copy, not s itself.
	if !p.startHTML(&probe) {
		return s, false
	}
	return line{}, true
}
// startHTML reports whether the line *s starts an HTML block. When it does,
// startHTML opens an htmlBuilder with p.addBlock, records the line as the
// builder's first text line, and returns true; otherwise it returns false
// without opening anything.
//
// The checks run in this order: the fixed-delimiter forms (comment,
// processing instruction, CDATA section, declaration), then the checks
// the code labels case 6 (a tag name found in htmlTags), case 1
// (pre, script, style, textarea), and case 7 (a complete open or closing
// tag alone on the line).
//
// p.corner is set for inputs where, according to the inline notes,
// Goldmark or the Dingus behave differently.
func (p *parseState) startHTML(s *line) bool {
// Inspect a trimmed copy; the text stored in the block below always comes
// from *s itself, which this function only reads.
// NOTE(review): trimSpace(0, 3, false) presumably strips up to three
// leading spaces — confirm against its definition.
tt := *s
tt.trimSpace(0, 3, false)
if tt.peek() != '<' {
return false
}
t := tt.string()
// Forms that run until a fixed terminator string appears on some line.
var end string
switch {
case strings.HasPrefix(t, "<!--"):
end = "-->"
case strings.HasPrefix(t, "<?"):
end = "?>"
case strings.HasPrefix(t, "<![CDATA["):
end = "]]>"
case strings.HasPrefix(t, "<!") && len(t) >= 3 && isLetter(t[2]):
if 'a' <= t[2] && t[2] <= 'z' {
// Goldmark and the Dingus only accept <!UPPER> not <!lower>.
p.corner = true
}
end = ">"
}
if end != "" {
b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }}
p.addBlock(b)
b.text = append(b.text, s.string())
// The terminator may already be present on the opening line.
if b.endFunc(t) {
p.closeBlock()
}
return true
}
// case 6
// Scan an optional '/' and then a tag name: up to 16 ASCII letters and
// digits, lower-cased into buf. i is left at the first byte after the name.
i := 1
if i < len(t) && t[i] == '/' {
i++
}
buf := make([]byte, 0, 16)
for ; i < len(t) && len(buf) < 16; i++ {
c := t[i]
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
}
if !('a' <= c && c <= 'z') && !('0' <= c && c <= '9') {
break
}
buf = append(buf, c)
}
// The name must be followed by end of line, a space, a tab, '>', or "/>".
// sep remembers a space/tab/'>' delimiter; it stays zero for end of line and "/>".
var sep byte
if i < len(t) {
switch t[i] {
default:
goto Next
case ' ', '\t', '>':
// ok
sep = t[i]
case '/':
if i+1 >= len(t) || t[i+1] != '>' {
goto Next
}
}
}
if len(buf) == 0 {
goto Next
}
// The braces below scope the local declarations so that the gotos above
// do not jump over a variable declaration in the function's own scope.
{
c := buf[0]
var ok bool
for _, name := range htmlTags {
if name[0] == c && len(name) == len(buf) && name == string(buf) {
if sep == '\t' {
// Goldmark recognizes space here but not tab.
// testdata/extra.txt 143.md
p.corner = true
}
ok = true
break
}
}
if !ok {
goto Next
}
}
{
// Blocks started by a known tag name run until a blank line.
b := &htmlBuilder{endBlank: true}
p.addBlock(b)
b.text = append(b.text, s.string())
return true
}
Next:
// case 1
// Opening tags only (no '/' after '<'); buf and i still describe the name
// scanned above. The block runs until a line for which hasEndPre is true,
// which may be the opening line itself.
if len(t) > 1 && t[1] != '/' && (i >= len(t) || t[i] == ' ' || t[i] == '\t' || t[i] == '>') {
switch string(buf) {
case "pre", "script", "style", "textarea":
b := &htmlBuilder{endFunc: hasEndPre}
p.addBlock(b)
b.text = append(b.text, s.string())
if hasEndPre(t) {
p.closeBlock()
}
return true
}
}
// case 7
// A complete open or closing tag followed only by whitespace, ending at a
// blank line. It is only tried when p.para() is nil.
// NOTE(review): p.para() presumably returns the currently open paragraph,
// so that this form cannot interrupt one — confirm.
if p.para() == nil {
if _, e, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
if e != len(t) {
// Goldmark disallows trailing space
p.corner = true
}
b := &htmlBuilder{endBlank: true}
p.addBlock(b)
b.text = append(b.text, s.string())
return true
}
if _, e, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, e) == len(t) {
b := &htmlBuilder{endBlank: true}
p.addBlock(b)
b.text = append(b.text, s.string())
return true
}
}
return false
}
// hasEndPre reports whether s contains one of the closing tags
// </pre>, </script>, </style>, or </textarea>.
//
// The tag name is matched case-insensitively and must be followed
// immediately by '>'; no whitespace is permitted inside the tag.
func hasEndPre(s string) bool {
	for i := 0; i+1 < len(s); i++ {
		if s[i] != '<' || s[i+1] != '/' {
			continue
		}
		// Scan the candidate name with its own index. Reusing i here would
		// make the outer loop step over the byte that stopped the scan, and
		// if that byte is the '<' of the real closing tag (as in "</</pre>")
		// the tag would be missed.
		var name [8]byte // "textarea" is the longest name of interest
		n := 0
		j := i + 2
		for ; j < len(s) && n < len(name); j++ {
			c := s[j]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c < 'a' || 'z' < c {
				break
			}
			name[n] = c
			n++
		}
		if j < len(s) && s[j] == '>' {
			switch string(name[:n]) {
			case "pre", "script", "style", "textarea":
				return true
			}
		}
	}
	return false
}
// parseHTMLTag parses an inline HTML tag beginning at s[i].
//
// “An HTML tag consists of an open tag, a closing tag, an HTML comment,
// a processing instruction, a declaration, or a CDATA section.”
//
// At least three bytes must remain and s[i] must be '<'; the following
// one or two bytes select which specific parser is tried.
func parseHTMLTag(p *parseState, s string, i int) (Inline, int, bool) {
	if i+3 > len(s) || s[i] != '<' {
		return nil, 0, false
	}
	switch s[i+1] {
	case '/':
		return parseHTMLClosingTag(p, s, i)
	case '?':
		return parseHTMLProcInst(s, i)
	case '!':
		switch s[i+2] {
		case '-':
			return parseHTMLComment(s, i)
		case '[':
			return parseHTMLCDATA(s, i)
		}
		return parseHTMLDecl(p, s, i)
	}
	return parseHTMLOpenTag(p, s, i)
}
// parseHTMLOpenTag parses an HTML open tag beginning at s[i].
//
// “An open tag consists of a < character, a tag name, zero or more attributes,
// optional spaces, tabs, and up to one line ending, an optional / character, and a > character.”
//
// On success it returns the tag as an *HTMLTag, the index just past the
// closing '>', and true. p.corner is set for the inputs noted below.
func parseHTMLOpenTag(p *parseState, s string, i int) (Inline, int, bool) {
	if i >= len(s) || s[i] != '<' {
		return nil, 0, false
	}
	name, pos, ok := parseTagName(s, i+1)
	if !ok {
		return nil, 0, false
	}
	if name == "pre" || name == "script" || name == "style" || name == "textarea" {
		// Goldmark treats these as starting a new HTMLBlock
		// and ending the paragraph they appear in.
		p.corner = true
	}

	// Consume attributes. Before each attempt the next byte must be one
	// that can legally follow a tag name or an attribute.
	for {
		if pos >= len(s) {
			return nil, 0, false
		}
		switch s[pos] {
		case ' ', '\t', '\n', '/', '>':
			// acceptable
		default:
			return nil, 0, false
		}
		_, next, ok := parseAttr(p, s, pos)
		if !ok {
			break // no more attributes; leaves the for loop
		}
		pos = next
	}

	afterSpace := skipSpace(s, pos)
	if afterSpace != pos {
		// Goldmark mishandles spaces before >.
		p.corner = true
	}
	pos = afterSpace
	if pos < len(s) && s[pos] == '/' {
		pos++
	}
	if pos >= len(s) || s[pos] != '>' {
		return nil, 0, false
	}
	return &HTMLTag{s[i : pos+1]}, pos + 1, true
}
// parseHTMLClosingTag parses an HTML closing tag beginning at s[i].
//
// “A closing tag consists of the string </, a tag name,
// optional spaces, tabs, and up to one line ending, and the character >.”
//
// On success it returns the tag as an *HTMLTag, the index just past the
// closing '>', and true.
func parseHTMLClosingTag(p *parseState, s string, i int) (Inline, int, bool) {
	nameStart := i + 2
	if nameStart >= len(s) || s[i] != '<' || s[i+1] != '/' {
		return nil, 0, false
	}
	if skipSpace(s, nameStart) != nameStart {
		// Goldmark allows spaces here but the spec and the Dingus do not.
		p.corner = true
	}
	_, pos, ok := parseTagName(s, nameStart)
	if !ok {
		return nil, 0, false
	}
	pos = skipSpace(s, pos)
	if pos >= len(s) || s[pos] != '>' {
		return nil, 0, false
	}
	return &HTMLTag{s[i : pos+1]}, pos + 1, true
}
// parseTagName parses a tag name beginning at s[i] and returns the name,
// the index just past it, and true. It fails if s[i] is not a letter.
//
// “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).”
func parseTagName(s string, i int) (string, int, bool) {
	if i >= len(s) || !isLetter(s[i]) {
		return "", 0, false
	}
	end := i + 1
	for ; end < len(s); end++ {
		if !isLDH(s[end]) {
			break
		}
	}
	return s[i:end], end, true
}
// parseAttr parses one attribute starting at s[i], skipping leading
// whitespace first. The returned string covers the attribute name and,
// when present, its value specification; the returned index is just
// past the attribute.
//
// “An attribute consists of spaces, tabs, and up to one line ending,
// an attribute name, and an optional attribute value specification.”
func parseAttr(p *parseState, s string, i int) (string, int, bool) {
	i = skipSpace(s, i)
	_, end, ok := parseAttrName(s, i)
	if !ok {
		return "", 0, false
	}
	// The value specification is optional: extend the span only if one parses.
	if _, valueEnd, ok := parseAttrValueSpec(p, s, end); ok {
		end = valueEnd
	}
	return s[i:end], end, true
}
// parseAttrName parses an attribute name beginning at s[i] and returns
// the name, the index just past it, and true.
//
// “An attribute name consists of an ASCII letter, _, or :,
// followed by zero or more ASCII letters, digits, _, ., :, or -.”
//
// Note that a name is only recognized when at least one more byte
// follows its first character (i+1 < len(s)).
func parseAttrName(s string, i int) (string, int, bool) {
	if i+1 >= len(s) {
		return "", 0, false
	}
	if first := s[i]; !isLetter(first) && first != '_' && first != ':' {
		return "", 0, false
	}
	end := i + 1
	for end < len(s) {
		c := s[end]
		if !isLDH(c) && c != '_' && c != '.' && c != ':' {
			break
		}
		end++
	}
	return s[i:end], end, true
}
// parseAttrValueSpec parses "= value" starting at s[i], allowing
// whitespace before and after the '='. It returns the value alone
// (without the '=' or surrounding space), the index just past it, and true.
//
// “An attribute value specification consists of
// optional spaces, tabs, and up to one line ending,
// a = character,
// optional spaces, tabs, and up to one line ending,
// and an attribute value.”
//
// p.corner is set when the value contains U+FFFD.
func parseAttrValueSpec(p *parseState, s string, i int) (string, int, bool) {
	i = skipSpace(s, i)
	if i+1 >= len(s) || s[i] != '=' {
		return "", 0, false
	}
	i = skipSpace(s, i+1)
	_, end, ok := parseAttrValue(s, i)
	if !ok {
		return "", 0, false
	}
	value := s[i:end]
	if strings.Contains(value, "\ufffd") {
		p.corner = true
	}
	return value, end, true
}
// parseAttrValue parses an attribute value beginning at s[i] and returns
// the value (including any surrounding quotes), the index just past it,
// and true.
//
// “An attribute value consists of
// an unquoted attribute value,
// a single-quoted attribute value,
// or a double-quoted attribute value.”
func parseAttrValue(s string, i int) (string, int, bool) {
	// TODO: No escaping???

	// “A single-quoted attribute value consists of ',
	// zero or more characters not including ', and a final '.”
	// “A double-quoted attribute value consists of ",
	// zero or more characters not including ", and a final ".”
	if i < len(s) {
		if quote := s[i]; quote == '\'' || quote == '"' {
			if n := strings.IndexByte(s[i+1:], quote); n >= 0 {
				end := i + n + 2 // opening quote + n bytes + closing quote
				return s[i:end], end, true
			}
			// Unterminated quote: fall through. The unquoted form cannot
			// start with a quote, so the scan below will reject it.
		}
	}

	// “An unquoted attribute value is a nonempty string of characters
	// not including spaces, tabs, line endings, ", ', =, <, >, or `.”
	const stop = " \t\n\"'=<>`"
	end := i
	for ; end < len(s); end++ {
		if strings.IndexByte(stop, s[end]) >= 0 {
			break
		}
	}
	if end == i {
		return "", 0, false
	}
	return s[i:end], end, true
}
// parseHTMLComment parses an HTML comment beginning at s[i].
//
// “An HTML comment consists of <!-- + text + -->,
// where text does not start with > or ->,
// does not end with -, and does not contain --.”
func parseHTMLComment(s string, i int) (Inline, int, bool) {
	rest := s[i:]
	if strings.HasPrefix(rest, "<!-->") || strings.HasPrefix(rest, "<!--->") {
		return nil, 0, false
	}
	x, end, ok := parseHTMLMarker(s, i, "<!--", "-->")
	if !ok {
		return nil, 0, false
	}
	text := x.(*HTMLTag).Text
	// Examine the comment text plus the first '-' of the closing "-->":
	// a "--" in that span means the text either contains "--" or ends with '-'.
	inner := text[len("<!--") : len(text)-len("->")]
	if strings.Contains(inner, "--") {
		return nil, 0, false
	}
	return x, end, true
}
// parseHTMLCDATA parses a CDATA section beginning at s[i].
//
// “A CDATA section consists of the string <![CDATA[,
// a string of characters not including the string ]]>, and the string ]]>.”
func parseHTMLCDATA(s string, i int) (Inline, int, bool) {
	const (
		opener = "<![CDATA["
		closer = "]]>"
	)
	return parseHTMLMarker(s, i, opener, closer)
}
// parseHTMLDecl parses a declaration beginning at s[i].
// It only checks the byte at s[i+2] itself; the "<!" prefix is verified
// by parseHTMLMarker.
//
// “A declaration consists of the string <!, an ASCII letter,
// zero or more characters not including the character >, and the character >.”
func parseHTMLDecl(p *parseState, s string, i int) (Inline, int, bool) {
	if i+2 >= len(s) || !isLetter(s[i+2]) {
		return nil, 0, false
	}
	if c := s[i+2]; 'a' <= c && c <= 'z' {
		p.corner = true // goldmark requires uppercase
	}
	return parseHTMLMarker(s, i, "<!", ">")
}
// parseHTMLProcInst parses a processing instruction beginning at s[i].
//
// “A processing instruction consists of the string <?,
// a string of characters not including the string ?>, and the string ?>.”
func parseHTMLProcInst(s string, i int) (Inline, int, bool) {
	const (
		opener = "<?"
		closer = "?>"
	)
	return parseHTMLMarker(s, i, opener, closer)
}
// parseHTMLMarker parses a construct that begins with prefix at s[i] and
// extends through the first occurrence of suffix after the prefix.
// On success it returns the whole span as an *HTMLTag, the index just
// past the suffix, and true.
func parseHTMLMarker(s string, i int, prefix, suffix string) (Inline, int, bool) {
	if !strings.HasPrefix(s[i:], prefix) {
		return nil, 0, false
	}
	bodyStart := i + len(prefix)
	off := strings.Index(s[bodyStart:], suffix)
	if off < 0 {
		return nil, 0, false
	}
	end := bodyStart + off + len(suffix)
	return &HTMLTag{s[i:end]}, end, true
}
// parseHTMLEntity parses a character reference beginning at s[i].
// s[i] itself is not examined here.
// NOTE(review): callers presumably only invoke this with s[i] == '&' — confirm.
//
// Three forms are recognized: hexadecimal numeric (&#x…; or &#X…; with
// 1–6 hex digits), decimal numeric (&#…; with 1–7 digits), and named
// references looked up in htmlEntity. A numeric value of zero or one
// above unicode.MaxRune yields U+FFFD.
//
// On success it returns a *Plain holding the decoded text, the start
// index i, the index just past the ';', and true.
func parseHTMLEntity(_ *parseState, s string, i int) (Inline, int, int, bool) {
	if i+1 < len(s) && s[i+1] == '#' {
		digits := i + 2
		hex := digits < len(s) && (s[digits] == 'x' || s[digits] == 'X')
		base, maxDigits := 10, 7
		if hex {
			digits++
			base, maxDigits = 16, 6
		}
		j := digits
		if hex {
			for j < len(s) && isHexDigit(s[j]) {
				j++
			}
		} else {
			for j < len(s) && isDigit(s[j]) {
				j++
			}
		}
		if n := j - digits; n < 1 || n > maxDigits || j >= len(s) || s[j] != ';' {
			return nil, 0, 0, false
		}
		// The error is ignored: the digits were validated above and the
		// length limit keeps the value well within range.
		r64, _ := strconv.ParseInt(s[digits:j], base, 0)
		r := int(r64)
		if r == 0 || r > unicode.MaxRune {
			r = unicode.ReplacementChar
		}
		return &Plain{string(rune(r))}, i, j + 1, true
	}

	// Max name in list is 32 bytes. Try for 64 for good measure.
	for j := i + 1; j < len(s) && j-i < 64; j++ {
		switch s[j] {
		case '&':
			// Stop possible quadratic search on &&&&&&&.
			return nil, 0, 0, false
		case ';':
			if r, ok := htmlEntity[s[i:j+1]]; ok {
				return &Plain{r}, i, j + 1, true
			}
			return nil, 0, 0, false
		}
	}
	return nil, 0, 0, false
}
// An HTMLTag is an inline element holding a piece of raw inline HTML.
// The parse functions in this file produce it for open and closing tags,
// comments, processing instructions, declarations, and CDATA sections.
type HTMLTag struct {
// Text is the exact source text of the construct, from its leading '<'
// through its terminator.
Text string
}
// Inline does nothing.
// NOTE(review): presumably a marker method that lets *HTMLTag satisfy the
// Inline interface returned by the parse functions in this file — confirm.
func (*HTMLTag) Inline() {}
// PrintHTML writes the tag's raw text to buf unchanged.
func (x *HTMLTag) PrintHTML(buf *bytes.Buffer) {
buf.WriteString(x.Text)
}
// printMarkdown writes the tag as Markdown, which is the same raw text
// that PrintHTML emits.
func (x *HTMLTag) printMarkdown(buf *bytes.Buffer) {
x.PrintHTML(buf)
}
// PrintText writes nothing to buf: the body is empty, so an HTML tag
// contributes no plain text.
func (x *HTMLTag) PrintText(buf *bytes.Buffer) {}