internal/frontend: change id generation to use parsed markdown text
To produce heading ids that match between the goldmark version of the
code and the rsc.io/markdown version of the code, use the markdown
parser to parse the markdown and then extract the text from it. We do
this because rsc.io/markdown doesn't provide the raw markdown for us
to generate the ids with. This will change the ids that are generated
for some headings.
For golang/go#61399
Change-Id: Id0f26b311b59e848ff1753e058d413ed3168926d
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/548255
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
kokoro-CI: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
diff --git a/internal/frontend/goldmark.go b/internal/frontend/goldmark.go
index deb39f5..22f6e2c 100644
--- a/internal/frontend/goldmark.go
+++ b/internal/frontend/goldmark.go
@@ -10,7 +10,6 @@
"bytes"
"context"
"fmt"
- "regexp"
"strings"
"github.com/yuin/goldmark/ast"
@@ -22,6 +21,7 @@
"golang.org/x/pkgsite/internal"
"golang.org/x/pkgsite/internal/log"
"golang.org/x/pkgsite/internal/source"
+ "rsc.io/markdown"
)
// astTransformer is a default transformer of the goldmark tree. We pass in
@@ -185,20 +185,35 @@
// unit page. Duplicated heading ids are given an incremental suffix. See
// readme_test.go for examples.
func (s *ids) Generate(value []byte, kind ast.NodeKind) []byte {
- // Matches strings like `<tag attr="value">Text</tag>` or `[![Text](file.svg)](link.html)`.
- r := regexp.MustCompile(`(<[^<>]+>|\[\!\[[^\]]+]\([^\)]+\)\]\([^\)]+\))`)
- str := r.ReplaceAllString(string(value), "")
+ var defaultID string
+ if kind == ast.KindHeading {
+ defaultID = "heading"
+ } else {
+ defaultID = "id"
+ }
+
+ parser := &markdown.Parser{}
+ doc := parser.Parse("# " + string(value))
+ return []byte(s.generateID(doc, defaultID))
+}
+
+func (s *ids) generateID(block markdown.Block, defaultID string) string {
+ var buf bytes.Buffer
+ walkBlocks([]markdown.Block{block}, func(b markdown.Block) error {
+ if t, ok := b.(*markdown.Text); ok {
+ for _, inl := range t.Inline {
+ inl.PrintText(&buf)
+ }
+ }
+ return nil
+ })
f := func(c rune) bool {
return !('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z') && !('0' <= c && c <= '9')
}
- str = strings.Join(strings.FieldsFunc(str, f), "-")
+ str := strings.Join(strings.FieldsFunc(buf.String(), f), "-")
str = strings.ToLower(str)
if len(str) == 0 {
- if kind == ast.KindHeading {
- str = "heading"
- } else {
- str = "id"
- }
+ str = defaultID
}
key := str
for i := 1; ; i++ {
@@ -208,7 +223,7 @@
}
key = fmt.Sprintf("%s-%d", str, i)
}
- return []byte("readme-" + key)
+ return "readme-" + key
}
// Put implements Put from the goldmark parser IDs interface.
diff --git a/internal/frontend/markdown.go b/internal/frontend/markdown.go
index 8f2da6c..98072e3 100644
--- a/internal/frontend/markdown.go
+++ b/internal/frontend/markdown.go
@@ -12,7 +12,6 @@
"strings"
"github.com/google/safehtml/template"
- "github.com/yuin/goldmark/ast"
"golang.org/x/pkgsite/internal"
"golang.org/x/pkgsite/internal/derrors"
"golang.org/x/pkgsite/internal/log"
@@ -118,6 +117,8 @@
err = nil
switch x := b.(type) {
+ case *markdown.Document:
+ err = walkBlocks(x.Blocks, walkFunc)
case *markdown.Text:
case *markdown.Paragraph:
err = walkBlocks([]markdown.Block{x.Text}, walkFunc)
@@ -130,7 +131,9 @@
case *markdown.Quote:
err = walkBlocks(x.Blocks, walkFunc)
case *markdown.HTMLBlock:
- continue
+ case *markdown.CodeBlock:
+ case *markdown.Empty:
+ case *markdown.ThematicBreak:
default:
return fmt.Errorf("unhandled block type %T", x)
}
@@ -287,6 +290,12 @@
rewriteHeadingsBlocks = func(blocks []markdown.Block) {
for i, b := range blocks {
switch x := b.(type) {
+ case *markdown.Paragraph:
+ rewriteHeadingsBlocks([]markdown.Block{x.Text})
+ case *markdown.List:
+ rewriteHeadingsBlocks(x.Items)
+ case *markdown.Item:
+ rewriteHeadingsBlocks(x.Blocks)
case *markdown.Quote:
rewriteHeadingsBlocks(x.Blocks)
case *markdown.Heading:
@@ -338,19 +347,12 @@
// function, but we don't have the raw markdown anymore, so we use the
// text instead.
func rewriteHeadingIDs(doc *markdown.Document) {
- ids := newIDs()
+ ids := &ids{
+ values: map[string]bool{},
+ }
walkBlocks(doc.Blocks, func(b markdown.Block) error {
if heading, ok := b.(*markdown.Heading); ok {
- var buf bytes.Buffer
- for _, inl := range heading.Text.Inline {
- // Hack: use HTML because ids strips out html tags.
- // TODO(matloob): change the goldmark code to not use
- // raw markdown text and instead depend on the text of the
- // nodes.
- inl.PrintHTML(&buf)
- }
-
- id := ids.Generate(buf.Bytes(), ast.KindHeading)
+ id := ids.generateID(heading, "heading")
heading.ID = string(id)
}
return nil
diff --git a/internal/frontend/readme_test.go b/internal/frontend/readme_test.go
index e7abd1c..df162c4 100644
--- a/internal/frontend/readme_test.go
+++ b/internal/frontend/readme_test.go
@@ -414,9 +414,9 @@
Contents: `# [![Image Text](file.svg)](link.html)
`,
},
- wantHTML: `<h3 class="h1" id="readme-heading"><a href="https://github.com/valid/module_name/blob/v1.0.0/link.html" rel="nofollow"><img src="https://github.com/valid/module_name/raw/v1.0.0/file.svg" alt="Image Text"/></a></h3>`,
+ wantHTML: `<h3 class="h1" id="readme-image-text"><a href="https://github.com/valid/module_name/blob/v1.0.0/link.html" rel="nofollow"><img src="https://github.com/valid/module_name/raw/v1.0.0/file.svg" alt="Image Text"/></a></h3>`,
wantOutline: []*Heading{
- {Level: 1, Text: "Image Text", ID: "readme-heading"},
+ {Level: 1, Text: "Image Text", ID: "readme-image-text"},
},
},
{
@@ -457,6 +457,18 @@
{Level: 1, Text: "Heading", ID: "readme-heading-3"},
},
},
+ {
+ name: "tag in heading",
+ unit: unit,
+ readme: &internal.Readme{
+ Filepath: "README.md",
+ Contents: `# A link <a href="link">link</a>`,
+ },
+ wantHTML: `<h3 class="h1" id="readme-a-link-link">A link <a href="link" rel="nofollow">link</a></h3>`,
+ wantOutline: []*Heading{
+ {Level: 1, Text: "A link link", ID: "readme-a-link-link"},
+ },
+ },
} {
t.Run(test.name, func(t *testing.T) {
processReadmes := map[string]func(ctx context.Context, u *internal.Unit) (frontendReadme *Readme, err error){