internal/frontend: change id generation to use parsed markdown text

To produce heading ids that match between the goldmark version of the
code and the rsc.io/markdown version of the code, use the markdown
parser to parse the markdown and then extract the text from it. We do
this because rsc.io/markdown doesn't provide the raw markdown for us
to generate the ids with. This will change the ids that are generated
for some headings.

For golang/go#61399

Change-Id: Id0f26b311b59e848ff1753e058d413ed3168926d
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/548255
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
kokoro-CI: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
diff --git a/internal/frontend/goldmark.go b/internal/frontend/goldmark.go
index deb39f5..22f6e2c 100644
--- a/internal/frontend/goldmark.go
+++ b/internal/frontend/goldmark.go
@@ -10,7 +10,6 @@
 	"bytes"
 	"context"
 	"fmt"
-	"regexp"
 	"strings"
 
 	"github.com/yuin/goldmark/ast"
@@ -22,6 +21,7 @@
 	"golang.org/x/pkgsite/internal"
 	"golang.org/x/pkgsite/internal/log"
 	"golang.org/x/pkgsite/internal/source"
+	"rsc.io/markdown"
 )
 
 // astTransformer is a default transformer of the goldmark tree. We pass in
@@ -185,20 +185,35 @@
 // unit page. Duplicated heading ids are given an incremental suffix. See
 // readme_test.go for examples.
 func (s *ids) Generate(value []byte, kind ast.NodeKind) []byte {
-	// Matches strings like `<tag attr="value">Text</tag>` or `[![Text](file.svg)](link.html)`.
-	r := regexp.MustCompile(`(<[^<>]+>|\[\!\[[^\]]+]\([^\)]+\)\]\([^\)]+\))`)
-	str := r.ReplaceAllString(string(value), "")
+	var defaultID string
+	if kind == ast.KindHeading {
+		defaultID = "heading"
+	} else {
+		defaultID = "id"
+	}
+
+	parser := &markdown.Parser{}
+	doc := parser.Parse("# " + string(value))
+	return []byte(s.generateID(doc, defaultID))
+}
+
+func (s *ids) generateID(block markdown.Block, defaultID string) string {
+	var buf bytes.Buffer
+	walkBlocks([]markdown.Block{block}, func(b markdown.Block) error {
+		if t, ok := b.(*markdown.Text); ok {
+			for _, inl := range t.Inline {
+				inl.PrintText(&buf)
+			}
+		}
+		return nil
+	})
 	f := func(c rune) bool {
 		return !('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z') && !('0' <= c && c <= '9')
 	}
-	str = strings.Join(strings.FieldsFunc(str, f), "-")
+	str := strings.Join(strings.FieldsFunc(buf.String(), f), "-")
 	str = strings.ToLower(str)
 	if len(str) == 0 {
-		if kind == ast.KindHeading {
-			str = "heading"
-		} else {
-			str = "id"
-		}
+		str = defaultID
 	}
 	key := str
 	for i := 1; ; i++ {
@@ -208,7 +223,7 @@
 		}
 		key = fmt.Sprintf("%s-%d", str, i)
 	}
-	return []byte("readme-" + key)
+	return "readme-" + key
 }
 
 // Put implements Put from the goldmark parser IDs interface.
diff --git a/internal/frontend/markdown.go b/internal/frontend/markdown.go
index 8f2da6c..98072e3 100644
--- a/internal/frontend/markdown.go
+++ b/internal/frontend/markdown.go
@@ -12,7 +12,6 @@
 	"strings"
 
 	"github.com/google/safehtml/template"
-	"github.com/yuin/goldmark/ast"
 	"golang.org/x/pkgsite/internal"
 	"golang.org/x/pkgsite/internal/derrors"
 	"golang.org/x/pkgsite/internal/log"
@@ -118,6 +117,8 @@
 
 		err = nil
 		switch x := b.(type) {
+		case *markdown.Document:
+			err = walkBlocks(x.Blocks, walkFunc)
 		case *markdown.Text:
 		case *markdown.Paragraph:
 			err = walkBlocks([]markdown.Block{x.Text}, walkFunc)
@@ -130,7 +131,9 @@
 		case *markdown.Quote:
 			err = walkBlocks(x.Blocks, walkFunc)
 		case *markdown.HTMLBlock:
-			continue
+		case *markdown.CodeBlock:
+		case *markdown.Empty:
+		case *markdown.ThematicBreak:
 		default:
 			return fmt.Errorf("unhandled block type %T", x)
 		}
@@ -287,6 +290,12 @@
 	rewriteHeadingsBlocks = func(blocks []markdown.Block) {
 		for i, b := range blocks {
 			switch x := b.(type) {
+			case *markdown.Paragraph:
+				rewriteHeadingsBlocks([]markdown.Block{x.Text})
+			case *markdown.List:
+				rewriteHeadingsBlocks(x.Items)
+			case *markdown.Item:
+				rewriteHeadingsBlocks(x.Blocks)
 			case *markdown.Quote:
 				rewriteHeadingsBlocks(x.Blocks)
 			case *markdown.Heading:
@@ -338,19 +347,12 @@
 // function, but we don't have the raw markdown anymore, so we use the
 // text instead.
 func rewriteHeadingIDs(doc *markdown.Document) {
-	ids := newIDs()
+	ids := &ids{
+		values: map[string]bool{},
+	}
 	walkBlocks(doc.Blocks, func(b markdown.Block) error {
 		if heading, ok := b.(*markdown.Heading); ok {
-			var buf bytes.Buffer
-			for _, inl := range heading.Text.Inline {
-				// Hack: use HTML because ids strips out html tags.
-				// TODO(matloob): change the goldmark code to not use
-				// raw markdown text and instead depend on the text of the
-				// nodes.
-				inl.PrintHTML(&buf)
-			}
-
-			id := ids.Generate(buf.Bytes(), ast.KindHeading)
+			id := ids.generateID(heading, "heading")
 			heading.ID = string(id)
 		}
 		return nil
diff --git a/internal/frontend/readme_test.go b/internal/frontend/readme_test.go
index e7abd1c..df162c4 100644
--- a/internal/frontend/readme_test.go
+++ b/internal/frontend/readme_test.go
@@ -414,9 +414,9 @@
 				Contents: `# [![Image Text](file.svg)](link.html)
 				`,
 			},
-			wantHTML: `<h3 class="h1" id="readme-heading"><a href="https://github.com/valid/module_name/blob/v1.0.0/link.html" rel="nofollow"><img src="https://github.com/valid/module_name/raw/v1.0.0/file.svg" alt="Image Text"/></a></h3>`,
+			wantHTML: `<h3 class="h1" id="readme-image-text"><a href="https://github.com/valid/module_name/blob/v1.0.0/link.html" rel="nofollow"><img src="https://github.com/valid/module_name/raw/v1.0.0/file.svg" alt="Image Text"/></a></h3>`,
 			wantOutline: []*Heading{
-				{Level: 1, Text: "Image Text", ID: "readme-heading"},
+				{Level: 1, Text: "Image Text", ID: "readme-image-text"},
 			},
 		},
 		{
@@ -457,6 +457,18 @@
 				{Level: 1, Text: "Heading", ID: "readme-heading-3"},
 			},
 		},
+		{
+			name: "tag in heading",
+			unit: unit,
+			readme: &internal.Readme{
+				Filepath: "README.md",
+				Contents: `# A link <a href="link">link</a>`,
+			},
+			wantHTML: `<h3 class="h1" id="readme-a-link-link">A link <a href="link" rel="nofollow">link</a></h3>`,
+			wantOutline: []*Heading{
+				{Level: 1, Text: "A link link", ID: "readme-a-link-link"},
+			},
+		},
 	} {
 		t.Run(test.name, func(t *testing.T) {
 			processReadmes := map[string]func(ctx context.Context, u *internal.Unit) (frontendReadme *Readme, err error){