blob: da953e8454b068afce3e38f5292f55c4aca6d8c7 [file] [log] [blame]
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package markdown
import (
"bytes"
"fmt"
"strings"
"unicode"
"unicode/utf8"
)
/*
text node can be
- other literal text
- run of * or _ characters
- [
- ![
keep delimiter stack pointing at non-other literal text
each node contains
- type of delimiter [ ![ _ *
- number of delimiters
- active or not
- potential opener, potential closer, or obth
when a ] is hit, call look for link or image
when end is hit, call process emphasis
look for link or image:
find topmost [ or ![
if none, emit literal ]
if its inactive, remove and emit literal ]
parse ahead to look for rest of link; if none, remove and emit literal ]
run process emphasis on the interior,
remove opener
if this was a link (not an image), set all [ before opener to inactive, to avoid links inside links
process emphasis
walk forward in list to find a closer.
walk back to find first potential matching opener.
if found:
strong for length >= 2
insert node
drop delimiters between opener and closer
remove 1 or 2 from open/close count, removing if now empty
if closing has some left, go around again on this node
if not:
set openers bottom for this kind of element to before current_position
if the closer at current pos is not an opener, remove it
seems needlessly complex. two passes
scan and find ` ` first.
pass 1. scan and find [ and ]() and leave the rest alone.
each completed one invokes emphasis on inner text and then on the overall list.
*/
type Inline interface {
PrintHTML(*bytes.Buffer)
PrintText(*bytes.Buffer)
printMarkdown(*bytes.Buffer)
}
type Plain struct {
Text string
}
func (*Plain) Inline() {}
func (x *Plain) PrintHTML(buf *bytes.Buffer) {
htmlEscaper.WriteString(buf, x.Text)
}
func (x *Plain) printMarkdown(buf *bytes.Buffer) {
buf.WriteString(x.Text)
}
func (x *Plain) PrintText(buf *bytes.Buffer) {
htmlEscaper.WriteString(buf, x.Text)
}
type openPlain struct {
Plain
i int // position in input where bracket is
}
type emphPlain struct {
Plain
canOpen bool
canClose bool
i int // position in output where emph is
n int // length of original span
}
type Escaped struct {
Plain
}
func (x *Escaped) printMarkdown(buf *bytes.Buffer) {
buf.WriteByte('\\')
x.Plain.printMarkdown(buf)
}
type Code struct {
Text string
numTicks int
}
func (*Code) Inline() {}
func (x *Code) PrintHTML(buf *bytes.Buffer) {
fmt.Fprintf(buf, "<code>%s</code>", htmlEscaper.Replace(x.Text))
}
func (x *Code) printMarkdown(buf *bytes.Buffer) {
ticks := strings.Repeat("`", x.numTicks)
buf.WriteString(ticks)
buf.WriteString(x.Text)
buf.WriteString(ticks)
}
func (x *Code) PrintText(buf *bytes.Buffer) {
htmlEscaper.WriteString(buf, x.Text)
}
type Strong struct {
Marker string
Inner []Inline
}
func (x *Strong) Inline() {
}
func (x *Strong) PrintHTML(buf *bytes.Buffer) {
buf.WriteString("<strong>")
for _, c := range x.Inner {
c.PrintHTML(buf)
}
buf.WriteString("</strong>")
}
func (x *Strong) printMarkdown(buf *bytes.Buffer) {
buf.WriteString(x.Marker)
for _, c := range x.Inner {
c.printMarkdown(buf)
}
buf.WriteString(x.Marker)
}
func (x *Strong) PrintText(buf *bytes.Buffer) {
for _, c := range x.Inner {
c.PrintText(buf)
}
}
type Del struct {
Marker string
Inner []Inline
}
func (x *Del) Inline() {
}
func (x *Del) PrintHTML(buf *bytes.Buffer) {
buf.WriteString("<del>")
for _, c := range x.Inner {
c.PrintHTML(buf)
}
buf.WriteString("</del>")
}
func (x *Del) printMarkdown(buf *bytes.Buffer) {
buf.WriteString(x.Marker)
for _, c := range x.Inner {
c.printMarkdown(buf)
}
buf.WriteString(x.Marker)
}
func (x *Del) PrintText(buf *bytes.Buffer) {
for _, c := range x.Inner {
c.PrintText(buf)
}
}
type Emph struct {
Marker string
Inner []Inline
}
func (*Emph) Inline() {}
func (x *Emph) PrintHTML(buf *bytes.Buffer) {
buf.WriteString("<em>")
for _, c := range x.Inner {
c.PrintHTML(buf)
}
buf.WriteString("</em>")
}
func (x *Emph) printMarkdown(buf *bytes.Buffer) {
buf.WriteString(x.Marker)
for _, c := range x.Inner {
c.printMarkdown(buf)
}
buf.WriteString(x.Marker)
}
func (x *Emph) PrintText(buf *bytes.Buffer) {
for _, c := range x.Inner {
c.PrintText(buf)
}
}
func (p *parseState) emit(i int) {
if p.emitted < i {
p.list = append(p.list, &Plain{p.s[p.emitted:i]})
p.emitted = i
}
}
func (p *parseState) skip(i int) {
p.emitted = i
}
func (p *parseState) inline(s string) []Inline {
s = trimSpaceTab(s)
// Scan text looking for inlines.
// Leaf inlines are converted immediately.
// Non-leaf inlines have potential starts pushed on a stack while we await completion.
// Links take priority over other emphasis, so the emphasis must be delayed.
p.s = s
p.list = nil
p.emitted = 0
var opens []int // indexes of open ![ and [ Plains in p.list
var lastLinkOpen int
backticks := false
i := 0
for i < len(s) {
var parser func(*parseState, string, int) (Inline, int, int, bool)
switch s[i] {
case '\\':
parser = parseEscape
case '`':
if !backticks {
backticks = true
p.backticks.reset()
}
parser = p.backticks.parseCodeSpan
case '<':
parser = parseAutoLinkOrHTML
case '[':
parser = parseLinkOpen
case '!':
parser = parseImageOpen
case '_', '*':
parser = parseEmph
case '.':
if p.SmartDot {
parser = parseDot
}
case '-':
if p.SmartDash {
parser = parseDash
}
case '"', '\'':
if p.SmartQuote {
parser = parseEmph
}
case '~':
if p.Strikethrough {
parser = parseEmph
}
case '\n': // TODO what about eof
parser = parseBreak
case '&':
parser = parseHTMLEntity
case ':':
if p.Emoji {
parser = parseEmoji
}
}
if parser != nil {
if x, start, end, ok := parser(p, s, i); ok {
p.emit(start)
if _, ok := x.(*openPlain); ok {
opens = append(opens, len(p.list))
}
p.list = append(p.list, x)
i = end
p.skip(i)
continue
}
}
if s[i] == ']' && len(opens) > 0 {
oi := opens[len(opens)-1]
open := p.list[oi].(*openPlain)
opens = opens[:len(opens)-1]
if open.Text[0] == '!' || lastLinkOpen <= open.i {
if x, end, ok := p.parseLinkClose(s, i, open); ok {
p.corner = p.corner || x.corner || linkCorner(x.URL)
p.emit(i)
x.Inner = p.emph(nil, p.list[oi+1:])
if open.Text[0] == '!' {
p.list[oi] = (*Image)(x)
} else {
p.list[oi] = x
}
p.list = p.list[:oi+1]
p.skip(end)
i = end
if open.Text[0] == '[' {
// No links around links.
lastLinkOpen = open.i
}
continue
}
}
}
i++
}
p.emit(len(s))
p.list = p.emph(p.list[:0], p.list)
p.list = p.mergePlain(p.list)
p.list = p.autoLinkText(p.list)
return p.list
}
func (ps *parseState) emph(dst, src []Inline) []Inline {
const chars = "_*~\"'"
var stack [len(chars)][]*emphPlain
stackOf := func(c byte) int {
return strings.IndexByte(chars, c)
}
trimStack := func() {
for i := range stack {
stk := &stack[i]
for len(*stk) > 0 && (*stk)[len(*stk)-1].i >= len(dst) {
*stk = (*stk)[:len(*stk)-1]
}
}
}
Src:
for i := 0; i < len(src); i++ {
if open, ok := src[i].(*openPlain); ok {
// Convert unused link/image open marker to plain text.
dst = append(dst, &open.Plain)
continue
}
p, ok := src[i].(*emphPlain)
if !ok {
dst = append(dst, src[i])
continue
}
if p.canClose {
stk := &stack[stackOf(p.Text[0])]
Loop:
for p.Text != "" {
// Looking for same symbol and compatible with p.Text.
for i := len(*stk) - 1; i >= 0; i-- {
start := (*stk)[i]
if (p.Text[0] == '*' || p.Text[0] == '_') && (p.canOpen && p.canClose || start.canOpen && start.canClose) && (p.n+start.n)%3 == 0 && (p.n%3 != 0 || start.n%3 != 0) {
continue
}
if p.Text[0] == '~' && len(p.Text) != len(start.Text) { // ~ matches ~, ~~ matches ~~
continue
}
if p.Text[0] == '"' {
dst[start.i].(*emphPlain).Text = "“"
p.Text = "”"
dst = append(dst, p)
*stk = (*stk)[:i]
// no trimStack
continue Src
}
if p.Text[0] == '\'' {
dst[start.i].(*emphPlain).Text = "‘"
p.Text = "’"
dst = append(dst, p)
*stk = (*stk)[:i]
// no trimStack
continue Src
}
var d int
if len(p.Text) >= 2 && len(start.Text) >= 2 {
// strong
d = 2
} else {
// emph
d = 1
}
del := p.Text[0] == '~'
x := &Emph{Marker: p.Text[:d], Inner: append([]Inline(nil), dst[start.i+1:]...)}
start.Text = start.Text[:len(start.Text)-d]
p.Text = p.Text[d:]
if start.Text == "" {
dst = dst[:start.i]
} else {
dst = dst[:start.i+1]
}
trimStack()
if del {
dst = append(dst, (*Del)(x))
} else if d == 2 {
dst = append(dst, (*Strong)(x))
} else {
dst = append(dst, x)
}
continue Loop
}
break
}
}
if p.Text != "" {
stk := &stack[stackOf(p.Text[0])]
if p.Text == "'" {
p.Text = "’"
}
if p.Text == "\"" {
if p.canClose {
p.Text = "”"
} else {
p.Text = "“"
}
}
if p.canOpen {
p.i = len(dst)
dst = append(dst, p)
*stk = append(*stk, p)
} else {
dst = append(dst, &p.Plain)
}
}
}
return dst
}
func mdUnescape(s string) string {
if !strings.Contains(s, `\`) && !strings.Contains(s, `&`) {
return s
}
return mdUnescaper.Replace(s)
}
var mdUnescaper = func() *strings.Replacer {
var list = []string{
`\!`, `!`,
`\"`, `"`,
`\#`, `#`,
`\$`, `$`,
`\%`, `%`,
`\&`, `&`,
`\'`, `'`,
`\(`, `(`,
`\)`, `)`,
`\*`, `*`,
`\+`, `+`,
`\,`, `,`,
`\-`, `-`,
`\.`, `.`,
`\/`, `/`,
`\:`, `:`,
`\;`, `;`,
`\<`, `<`,
`\=`, `=`,
`\>`, `>`,
`\?`, `?`,
`\@`, `@`,
`\[`, `[`,
`\\`, `\`,
`\]`, `]`,
`\^`, `^`,
`\_`, `_`,
"\\`", "`",
`\{`, `{`,
`\|`, `|`,
`\}`, `}`,
`\~`, `~`,
}
for name, repl := range htmlEntity {
list = append(list, name, repl)
}
return strings.NewReplacer(list...)
}()
func isPunct(c byte) bool {
return '!' <= c && c <= '/' || ':' <= c && c <= '@' || '[' <= c && c <= '`' || '{' <= c && c <= '~'
}
func parseEscape(p *parseState, s string, i int) (Inline, int, int, bool) {
if i+1 < len(s) {
c := s[i+1]
if isPunct(c) {
return &Escaped{Plain{s[i+1 : i+2]}}, i, i + 2, true
}
if c == '\n' { // TODO what about eof
if i > 0 && s[i-1] == '\\' {
p.corner = true // goldmark mishandles \\\ newline
}
end := i + 2
for end < len(s) && (s[end] == ' ' || s[end] == '\t') {
end++
}
return &HardBreak{}, i, end, true
}
}
return nil, 0, 0, false
}
func parseDot(p *parseState, s string, i int) (Inline, int, int, bool) {
if i+2 < len(s) && s[i+1] == '.' && s[i+2] == '.' {
return &Plain{"…"}, i, i + 3, true
}
return nil, 0, 0, false
}
func parseDash(p *parseState, s string, i int) (Inline, int, int, bool) {
if i+1 >= len(s) || s[i+1] != '-' {
return nil, 0, 0, false
}
n := 2
for i+n < len(s) && s[i+n] == '-' {
n++
}
// Mimic cmark-gfm. Can't make this stuff up.
em, en := 0, 0
switch {
case n%3 == 0:
em = n / 3
case n%2 == 0:
en = n / 2
case n%3 == 2:
em = (n - 2) / 3
en = 1
case n%3 == 1:
em = (n - 4) / 3
en = 2
}
return &Plain{strings.Repeat("—", em) + strings.Repeat("–", en)}, i, i + n, true
}
// Inline code span markers must fit on punched cards, to match cmark-gfm.
const maxBackticks = 80
type backtickParser struct {
last [maxBackticks]int
scanned bool
}
func (b *backtickParser) reset() {
*b = backtickParser{}
}
func (b *backtickParser) parseCodeSpan(p *parseState, s string, i int) (Inline, int, int, bool) {
start := i
// Count leading backticks. Need to find that many again.
n := 1
for i+n < len(s) && s[i+n] == '`' {
n++
}
// If we've already scanned the whole string (for a different count),
// we can skip a failed scan by checking whether we saw this count.
// To enable this optimization, following cmark-gfm, we declare by fiat
// that more than maxBackticks backquotes is too many.
if n > len(b.last) || b.scanned && b.last[n-1] < i+n {
goto NoMatch
}
for end := i + n; end < len(s); {
if s[end] != '`' {
end++
continue
}
estart := end
for end < len(s) && s[end] == '`' {
end++
}
m := end - estart
if !b.scanned && m < len(b.last) {
b.last[m-1] = estart
}
if m == n {
// Match.
// Line endings are converted to single spaces.
text := s[i+n : estart]
text = strings.ReplaceAll(text, "\n", " ")
// If enclosed text starts and ends with a space and is not all spaces,
// one space is removed from start and end, to allow `` ` `` to quote a single backquote.
if len(text) >= 2 && text[0] == ' ' && text[len(text)-1] == ' ' && trimSpace(text) != "" {
text = text[1 : len(text)-1]
}
return &Code{text, n}, start, end, true
}
}
b.scanned = true
NoMatch:
// No match, so none of these backticks count: skip them all.
// For example ``x` is not a single backtick followed by a code span.
// Returning nil, 0, false would advance to the second backtick and try again.
return &Plain{s[i : i+n]}, start, i + n, true
}
func parseAutoLinkOrHTML(p *parseState, s string, i int) (Inline, int, int, bool) {
if x, end, ok := parseAutoLinkURI(s, i); ok {
return x, i, end, true
}
if x, end, ok := parseAutoLinkEmail(s, i); ok {
return x, i, end, true
}
if x, end, ok := parseHTMLTag(p, s, i); ok {
return x, i, end, true
}
return nil, 0, 0, false
}
func isLetter(c byte) bool {
return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
}
func isLDH(c byte) bool {
return isLetterDigit(c) || c == '-'
}
func isLetterDigit(c byte) bool {
return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9'
}
func parseLinkOpen(_ *parseState, s string, i int) (Inline, int, int, bool) {
return &openPlain{Plain{s[i : i+1]}, i + 1}, i, i + 1, true
}
func parseImageOpen(_ *parseState, s string, i int) (Inline, int, int, bool) {
if i+1 < len(s) && s[i+1] == '[' {
return &openPlain{Plain{s[i : i+2]}, i + 2}, i, i + 2, true
}
return nil, 0, 0, false
}
func parseEmph(p *parseState, s string, i int) (Inline, int, int, bool) {
c := s[i]
j := i + 1
if c == '*' || c == '~' || c == '_' {
for j < len(s) && s[j] == c {
j++
}
}
if c == '~' && j-i != 2 {
// Goldmark does not accept ~text~
// and incorrectly accepts ~~~text~~~.
// Only ~~ is correct.
p.corner = true
}
if c == '~' && j-i > 2 {
return &Plain{s[i:j]}, i, j, true
}
var before, after rune
if i == 0 {
before = ' '
} else {
before, _ = utf8.DecodeLastRuneInString(s[:i])
}
if j >= len(s) {
after = ' '
} else {
after, _ = utf8.DecodeRuneInString(s[j:])
}
// “A left-flanking delimiter run is a delimiter run that is
// (1) not followed by Unicode whitespace, and either
// (2a) not followed by a Unicode punctuation character, or
// (2b) followed by a Unicode punctuation character
// and preceded by Unicode whitespace or a Unicode punctuation character.
// For purposes of this definition, the beginning and the end
// of the line count as Unicode whitespace.”
leftFlank := !isUnicodeSpace(after) &&
(!isUnicodePunct(after) || isUnicodeSpace(before) || isUnicodePunct(before))
// “A right-flanking delimiter run is a delimiter run that is
// (1) not preceded by Unicode whitespace, and either
// (2a) not preceded by a Unicode punctuation character, or
// (2b) preceded by a Unicode punctuation character
// and followed by Unicode whitespace or a Unicode punctuation character.
// For purposes of this definition, the beginning and the end
// of the line count as Unicode whitespace.”
rightFlank := !isUnicodeSpace(before) &&
(!isUnicodePunct(before) || isUnicodeSpace(after) || isUnicodePunct(after))
var canOpen, canClose bool
switch c {
case '\'', '"':
canOpen = leftFlank && !rightFlank && before != ']' && before != ')'
canClose = rightFlank
case '*', '~':
// “A single * character can open emphasis iff
// it is part of a left-flanking delimiter run.”
// “A double ** can open strong emphasis iff
// it is part of a left-flanking delimiter run.”
canOpen = leftFlank
// “A single * character can close emphasis iff
// it is part of a right-flanking delimiter run.”
// “A double ** can close strong emphasis iff
// it is part of a right-flanking delimiter run.”
canClose = rightFlank
case '_':
// “A single _ character can open emphasis iff
// it is part of a left-flanking delimiter run and either
// (a) not part of a right-flanking delimiter run or
// (b) part of a right-flanking delimiter run preceded by a Unicode punctuation character.”
// “A double __ can open strong emphasis iff
// it is part of a left-flanking delimiter run and either
// (a) not part of a right-flanking delimiter run or
// (b) part of a right-flanking delimiter run preceded by a Unicode punctuation character.”
canOpen = leftFlank && (!rightFlank || isUnicodePunct(before))
// “A single _ character can close emphasis iff
// it is part of a right-flanking delimiter run and either
// (a) not part of a left-flanking delimiter run or
// (b) part of a left-flanking delimiter run followed by a Unicode punctuation character.”
// “A double __ can close strong emphasis iff
// it is part of a right-flanking delimiter run and either
// (a) not part of a left-flanking delimiter run or
// (b) part of a left-flanking delimiter run followed by a Unicode punctuation character.”
canClose = rightFlank && (!leftFlank || isUnicodePunct(after))
}
return &emphPlain{Plain: Plain{s[i:j]}, canOpen: canOpen, canClose: canClose, n: j - i}, i, j, true
}
func isUnicodeSpace(r rune) bool {
if r < 0x80 {
return r == ' ' || r == '\t' || r == '\f' || r == '\n'
}
return unicode.In(r, unicode.Zs)
}
func isUnicodePunct(r rune) bool {
if r < 0x80 {
return isPunct(byte(r))
}
return unicode.In(r, unicode.Punct)
}
func (p *parseState) parseLinkClose(s string, i int, open *openPlain) (*Link, int, bool) {
if i+1 < len(s) {
switch s[i+1] {
case '(':
// Inline link - [Text](Dest Title), with Title omitted or both Dest and Title omitted.
i := skipSpace(s, i+2)
var dest, title string
var titleChar byte
var corner bool
if i < len(s) && s[i] != ')' {
var ok bool
dest, i, ok = parseLinkDest(s, i)
if !ok {
break
}
i = skipSpace(s, i)
if i < len(s) && s[i] != ')' {
title, titleChar, i, ok = parseLinkTitle(s, i)
if title == "" {
corner = true
}
if !ok {
break
}
i = skipSpace(s, i)
}
}
if i < len(s) && s[i] == ')' {
return &Link{URL: dest, Title: title, TitleChar: titleChar, corner: corner}, i + 1, true
}
// NOTE: Test malformed ( ) with shortcut reference
// TODO fall back on syntax error?
case '[':
// Full reference link - [Text][Label]
label, i, ok := parseLinkLabel(p, s, i+1)
if !ok {
break
}
if link, ok := p.links[normalizeLabel(label)]; ok {
return &Link{URL: link.URL, Title: link.Title, corner: link.corner}, i, true
}
// Note: Could break here, but CommonMark dingus does not
// fall back to trying Text for [Text][Label] when Label is unknown.
// Unclear from spec what the correct answer is.
return nil, 0, false
}
}
// Collapsed or shortcut reference link: [Text][] or [Text].
end := i + 1
if strings.HasPrefix(s[end:], "[]") {
end += 2
}
if link, ok := p.links[normalizeLabel(s[open.i:i])]; ok {
return &Link{URL: link.URL, Title: link.Title, corner: link.corner}, end, true
}
return nil, 0, false
}
func skipSpace(s string, i int) int {
// Note: Blank lines have already been removed.
for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') {
i++
}
return i
}
func linkCorner(url string) bool {
for i := 0; i < len(url); i++ {
if url[i] == '%' {
if i+2 >= len(url) || !isHexDigit(url[i+1]) || !isHexDigit(url[i+2]) {
// Goldmark and the Dingus re-escape such percents as %25,
// but the spec does not seem to require this behavior.
return true
}
}
}
return false
}
func (p *parseState) mergePlain(list []Inline) []Inline {
out := list[:0]
start := 0
for i := 0; ; i++ {
if i < len(list) && toPlain(list[i]) != nil {
continue
}
// Non-Plain or end of list.
if start < i {
out = append(out, mergePlain1(list[start:i]))
}
if i >= len(list) {
break
}
out = append(out, list[i])
start = i + 1
}
return out
}
func toPlain(x Inline) *Plain {
// TODO what about Escaped?
switch x := x.(type) {
case *Plain:
return x
case *emphPlain:
return &x.Plain
case *openPlain:
return &x.Plain
}
return nil
}
func mergePlain1(list []Inline) *Plain {
if len(list) == 1 {
return toPlain(list[0])
}
var all []string
for _, pl := range list {
all = append(all, toPlain(pl).Text)
}
return &Plain{Text: strings.Join(all, "")}
}
func parseEmoji(p *parseState, s string, i int) (Inline, int, int, bool) {
for j := i + 1; ; j++ {
if j >= len(s) || j-i > 2+maxEmojiLen {
break
}
if s[j] == ':' {
name := s[i+1 : j]
if utf, ok := emoji[name]; ok {
return &Emoji{s[i : j+1], utf}, i, j + 1, true
}
break
}
}
return nil, 0, 0, false
}
type Emoji struct {
Name string // emoji :name:, including colons
Text string // Unicode for emoji sequence
}
func (*Emoji) Inline() {}
func (x *Emoji) PrintHTML(buf *bytes.Buffer) {
htmlEscaper.WriteString(buf, x.Text)
}
func (x *Emoji) printMarkdown(buf *bytes.Buffer) {
buf.WriteString(x.Text)
}
func (x *Emoji) PrintText(buf *bytes.Buffer) {
htmlEscaper.WriteString(buf, x.Text)
}