| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package frontend |
| |
| import ( |
| "bytes" |
| "context" |
| "errors" |
| "fmt" |
| "strings" |
| |
| "github.com/google/safehtml/template" |
| "golang.org/x/pkgsite/internal" |
| "golang.org/x/pkgsite/internal/derrors" |
| "golang.org/x/pkgsite/internal/log" |
| "golang.org/x/pkgsite/internal/source" |
| "rsc.io/markdown" |
| ) |
| |
| // ProcessReadme processes the README of unit u, if it has one. |
| // Processing includes rendering and sanitizing the HTML or Markdown, |
| // and extracting headings and links. |
| // |
| // Heading IDs are prefixed with "readme-" and heading levels are adjusted to |
| // start at h3 in order to nest them properly within the rest of the page. The |
| // readme's original styling is preserved in the HTML by giving each heading a |
| // CSS class that matches its original heading level. |
| // |
| // The extracted links are for display outside of the readme contents. |
| // |
| // This function is exported for use by external tools. |
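| // |
| // A call from an external tool might look like the following (ctx and unit |
| // are assumed to be supplied by the caller): |
| // |
| //	readme, err := frontend.ProcessReadme(ctx, unit) |
| //	if err != nil { |
| //		// handle the error |
| //	} |
| //	// readme.HTML, readme.Outline, and readme.Links are ready for display. |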
| func ProcessReadme(ctx context.Context, u *internal.Unit) (_ *Readme, err error) { |
| defer derrors.WrapAndReport(&err, "ProcessReadme(%q, %q, %q)", u.Path, u.ModulePath, u.Version) |
| return processReadme(ctx, u.Readme, u.SourceInfo) |
| } |
| |
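| // processReadme renders the given README contents. Plain-text READMEs are |
| // wrapped in a <pre> block; Markdown READMEs are parsed, their links, image |
| // sources and heading IDs rewritten, and the resulting HTML sanitized. |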
| func processReadme(ctx context.Context, readme *internal.Readme, info *source.Info) (frontendReadme *Readme, err error) { |
| if readme == nil || readme.Contents == "" { |
| return &Readme{}, nil |
| } |
| if !isMarkdown(readme.Filepath) { |
| t := template.Must(template.New("").Parse(`<pre class="readme">{{.}}</pre>`)) |
| h, err := t.ExecuteToHTML(readme.Contents) |
| if err != nil { |
| return nil, err |
| } |
| return &Readme{HTML: h}, nil |
| } |
| |
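| // Parse the Markdown with the extensions commonly needed by READMEs: |
| // heading IDs, strikethrough, task list items, autolinked text, tables, |
| // and emoji. |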
| p := markdown.Parser{ |
| HeadingIDs: true, |
| Strikethrough: true, |
| TaskListItems: true, |
| AutoLinkText: true, |
| Table: true, |
| Emoji: true, |
| } |
| doc := p.Parse(readme.Contents) |
| (&linkRewriter{info, readme}).rewriteLinks(doc) |
| rewriteImgSrc(doc, info, readme) |
| rewriteHeadingIDs(doc) // rewrite heading ids before extractTOC extracts them |
| et := &extractTOC{ctx: ctx, removeTitle: true} |
| et.extract(doc) |
| el := &extractLinks{ctx: ctx} |
| el.extract(doc) |
| transformHeadingsToHTML(doc) |
| var buf bytes.Buffer |
| doc.PrintHTML(&buf) |
| return &Readme{ |
| HTML: sanitizeHTML(&buf), |
| Outline: et.Headings, |
| Links: el.links, |
| }, nil |
| } |
| |
| // rewriteImgSrc rewrites the HTML in the markdown document to replace img |
| // src attributes with values that point at the image's location in the |
| // repo. |
| func rewriteImgSrc(doc *markdown.Document, info *source.Info, readme *internal.Readme) { |
| walkBlocks(doc.Blocks, func(b markdown.Block) error { |
| switch x := b.(type) { |
| case *markdown.HTMLBlock: |
| htmlBlock := x |
| for i := range htmlBlock.Text { |
| translated, err := translateHTML([]byte(htmlBlock.Text[i]), info, readme) |
| if err != nil { |
| continue |
| } |
| htmlBlock.Text[i] = string(translated) |
| } |
| case *markdown.Text: |
| rewriteHtmlInline(x.Inline, info, readme) |
| } |
| return nil |
| }) |
| } |
| |
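| // rewriteHtmlInline translates inline HTML tags (such as <img>) within a |
| // text block so that their references resolve against the repo source. |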
| func rewriteHtmlInline(inlines []markdown.Inline, info *source.Info, readme *internal.Readme) { |
| for _, inl := range inlines { |
| if htmlTag, ok := inl.(*markdown.HTMLTag); ok { |
| translated, err := translateHTML([]byte(htmlTag.Text), info, readme) |
| if err != nil { |
| continue |
| } |
| htmlTag.Text = string(translated) |
| } |
| } |
| } |
| |
| var errSkipChildren = errors.New("skip children") |
| |
| // walkBlocks calls walkFunc on all the blocks in the markdown document. If the |
| // walkFunc returns the errSkipChildren error the children of that block will be skipped. |
| func walkBlocks(blocks []markdown.Block, walkFunc func(b markdown.Block) error) error { |
| for _, b := range blocks { |
| err := walkFunc(b) |
| if err == errSkipChildren { |
| continue |
| } else if err != nil { |
| return err |
| } |
| |
| err = nil |
| switch x := b.(type) { |
| case *markdown.Document: |
| err = walkBlocks(x.Blocks, walkFunc) |
| case *markdown.Text: |
| case *markdown.Paragraph: |
| err = walkBlocks([]markdown.Block{x.Text}, walkFunc) |
| case *markdown.Heading: |
| err = walkBlocks([]markdown.Block{x.Text}, walkFunc) |
| case *markdown.List: |
| err = walkBlocks(x.Items, walkFunc) |
| case *markdown.Item: |
| err = walkBlocks(x.Blocks, walkFunc) |
| case *markdown.Quote: |
| err = walkBlocks(x.Blocks, walkFunc) |
| case *markdown.HTMLBlock: |
| case *markdown.CodeBlock: |
| case *markdown.Empty: |
| case *markdown.Table: |
| for _, t := range x.Header { |
| walkBlocks([]markdown.Block{t}, walkFunc) |
| } |
| for _, r := range x.Rows { |
| for _, t := range r { |
| walkBlocks([]markdown.Block{t}, walkFunc) |
| } |
| } |
| case *markdown.ThematicBreak: |
| default: |
| return fmt.Errorf("unhandled block type %T", x) |
| } |
| if err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
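| // extractTOC collects the headings of a markdown document into a nested |
| // outline. |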
| type extractTOC struct { |
| ctx context.Context |
| Headings []*Heading |
| removeTitle bool // omit title from TOC |
| } |
| |
| // extract collects the headings from a readme into an outline |
| // of the document. It nests the headings based on the h-level hierarchy. |
| // See tests for heading levels in TestReadme for behavior. |
| func (e *extractTOC) extract(doc *markdown.Document) { |
| var headings []*Heading |
| err := walkBlocks(doc.Blocks, func(b markdown.Block) error { |
| if heading, ok := b.(*markdown.Heading); ok { |
| var textbuf bytes.Buffer |
| for _, t := range heading.Text.Inline { |
| t.PrintText(&textbuf) |
| } |
| section := &Heading{ |
| Level: heading.Level, |
| Text: textbuf.String(), |
| } |
| section.ID = heading.ID |
| headings = append(headings, section) |
| return errSkipChildren |
| } |
| return nil |
| }) |
| if err != nil { |
| log.Errorf(e.ctx, "extractTOC.extract: %v", err) |
| } |
| |
| // We nest the headings by walking through the list we extracted and |
| // establishing parent-child relationships based on heading levels. |
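| // For example, the heading sequence h2, h3, h2 yields two top-level |
| // entries, with the h3 nested under the first h2. |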
| var nested []*Heading |
| for i, h := range headings { |
| if i == 0 { |
| nested = append(nested, h) |
| continue |
| } |
| parent := headings[i-1] |
| for parent != nil && parent.Level >= h.Level { |
| parent = parent.parent |
| } |
| if parent == nil { |
| nested = append(nested, h) |
| } else { |
| h.parent = parent |
| parent.Children = append(parent.Children, h) |
| } |
| } |
| if e.removeTitle { |
| // If there is only one top level heading with one or more children we |
| // assume it is the title of the document and remove it from the TOC. |
| if len(nested) == 1 && len(nested[0].Children) > 0 { |
| nested = nested[0].Children |
| } |
| } |
| e.Headings = nested |
| } |
| |
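| // extractLinks collects the links listed under a README's "Links" heading |
| // for display outside of the readme contents. |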
| type extractLinks struct { |
| ctx context.Context |
| inLinksHeading bool |
| links []link |
| } |
| |
| // The name of the heading from which we extract links. |
| const linkHeadingText = "Links" |
| |
| var linkHeadingBytes = []byte(linkHeadingText) // for faster comparison to node contents |
| |
| // extract extracts links from the "Links" section of a README. |
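| // The section is expected to be a heading whose text is exactly "Links", |
| // followed by list items that each contain a single link, for example: |
| // |
| //	## Links |
| // |
| //	- [pkg.go.dev](https://pkg.go.dev) |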
| func (e *extractLinks) extract(doc *markdown.Document) { |
| var seenLinksHeading bool |
| err := walkBlocks(doc.Blocks, func(b markdown.Block) error { |
| switch x := b.(type) { |
| case *markdown.Heading: |
| // We are in the links heading from the time we see a heading whose text |
| // is linkHeadingText until we see the next heading. |
| if e.inLinksHeading { |
| e.inLinksHeading = false |
| } |
| var headingText bytes.Buffer |
| for _, t := range x.Text.Inline { |
| t.PrintText(&headingText) |
| } |
| if !seenLinksHeading && bytes.Equal(headingText.Bytes(), linkHeadingBytes) { |
| seenLinksHeading = true |
| e.inLinksHeading = true |
| } |
| case *markdown.Item: |
| // When in the links heading, extract links from list items. |
| if !e.inLinksHeading { |
| return errSkipChildren |
| } |
| // We expect the pattern: Item -> Text -> Link, with no |
| // other children. |
| if len(x.Blocks) == 0 { |
| return errSkipChildren |
| } |
| if tb, ok := x.Blocks[0].(*markdown.Text); ok { |
| if len(tb.Inline) != 1 { |
| return errSkipChildren |
| } |
| if l, ok := tb.Inline[0].(*markdown.Link); ok { |
| // Record the link. |
| var linkText bytes.Buffer |
| for _, t := range l.Inner { |
| t.PrintText(&linkText) |
| } |
| e.links = append(e.links, link{ |
| Href: l.URL, |
| Body: linkText.String(), |
| }) |
| } |
| } |
| return errSkipChildren |
| } |
| return nil |
| }) |
| if err != nil { |
| log.Errorf(e.ctx, "extractLinks.extract: %v", err) |
| } |
| } |
| |
| // linkRewriter rewrites links and image targets in a markdown document |
| // using translateLink. |
| type linkRewriter struct { |
| info *source.Info |
| readme *internal.Readme |
| } |
| |
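| // rewriteLinks rewrites the URLs of links and images in all the text |
| // blocks of doc. |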
| func (g *linkRewriter) rewriteLinks(doc *markdown.Document) { |
| walkBlocks(doc.Blocks, func(b markdown.Block) error { |
| if text, ok := b.(*markdown.Text); ok { |
| g.rewriteLinksInline(text.Inline) |
| } |
| return nil |
| }) |
| } |
| |
| func (g *linkRewriter) rewriteLinksInline(inlines []markdown.Inline) { |
| for _, inl := range inlines { |
| switch x := inl.(type) { |
| case *markdown.Link: |
| g.rewriteLinksInline(x.Inner) |
| if d := translateLink(x.URL, g.info, false, g.readme); d != "" { |
| x.URL = d |
| } |
| case *markdown.Image: |
| g.rewriteLinksInline(x.Inner) |
| if d := translateLink(x.URL, g.info, true, g.readme); d != "" { |
| x.URL = d |
| } |
| case *markdown.Emph: |
| g.rewriteLinksInline(x.Inner) |
| case *markdown.Strong: |
| g.rewriteLinksInline(x.Inner) |
| } |
| } |
| } |
| |
| // transformHeadingsToHTML replaces heading blocks with rendered HTML |
| // blocks for the headings. Headings whose adjusted level exceeds 6 are |
| // rendered as divs with role="heading" and an h[level] class set on them. |
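| // For example, if the first heading in the document is an h1, it is |
| // rendered as <h3 class="h1">, keeping its original look while fitting |
| // into the page's heading hierarchy. |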
| func transformHeadingsToHTML(doc *markdown.Document) { |
| firstHeading := true |
| offset := 0 |
| var rewriteHeadingsBlocks func([]markdown.Block) |
| rewriteHeadingsBlocks = func(blocks []markdown.Block) { |
| for i, b := range blocks { |
| switch x := b.(type) { |
| case *markdown.Text: |
| case *markdown.HTMLBlock: |
| case *markdown.Table: |
| case *markdown.Empty: |
| case *markdown.CodeBlock: |
| case *markdown.ThematicBreak: |
| case *markdown.Paragraph: |
| rewriteHeadingsBlocks([]markdown.Block{x.Text}) |
| case *markdown.List: |
| rewriteHeadingsBlocks(x.Items) |
| case *markdown.Item: |
| rewriteHeadingsBlocks(x.Blocks) |
| case *markdown.Quote: |
| rewriteHeadingsBlocks(x.Blocks) |
| case *markdown.Heading: |
| heading := x |
| if firstHeading { |
| // The offset ensures the first heading is always an <h3>. |
| offset = 3 - heading.Level |
| firstHeading = false |
| } |
| newLevel := heading.Level + offset |
| |
| htmltag := &markdown.HTMLBlock{} |
| var buf bytes.Buffer |
| // TODO(matloob): Do we want the div and h elements to have analogous classes? |
| // Currently we're using newLevel for the div's class but n.Level for the h element's |
| // class. |
| if newLevel > 6 { |
| fmt.Fprintf(&buf, `<div class="h%d" role="heading" aria-level="%d"`, newLevel, heading.Level) |
| } else { |
| fmt.Fprintf(&buf, `<h%d class="h%d"`, newLevel, heading.Level) |
| } |
| if heading.ID != "" { |
| fmt.Fprintf(&buf, ` id="%s"`, htmlQuoteEscaper.Replace(heading.ID)) |
| } |
| buf.WriteByte('>') |
| heading.Text.PrintHTML(&buf) |
| if newLevel > 6 { |
| _, _ = buf.WriteString("</div>") |
| } else { |
| fmt.Fprintf(&buf, "</h%d>", newLevel) |
| } |
| htmltag.Text = append(htmltag.Text, buf.String()) |
| blocks[i] = htmltag |
| } |
| } |
| } |
| rewriteHeadingsBlocks(doc.Blocks) |
| } |
| |
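| // htmlQuoteEscaper escapes characters that are special inside an HTML |
| // attribute value, such as the heading id attributes written above. |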
| var htmlQuoteEscaper = strings.NewReplacer( |
| "\"", """, |
| "&", "&", |
| "<", "<", |
| ">", ">", |
| ) |
| |
| // rewriteHeadingIDs generates an ID for each heading based on its text: |
| // the runs of ASCII letters and numbers in the text are lowercased and |
| // joined with hyphens. All heading IDs are prefixed with "readme-" to |
| // avoid name collisions with other IDs on the unit page, and duplicate |
| // heading IDs are given an incremental suffix (see readme_test.go for |
| // examples). |
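| // For example, a heading "Hello, World!" gets the ID "readme-hello-world", |
| // and a second heading with the same text gets "readme-hello-world-1". |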
| func rewriteHeadingIDs(doc *markdown.Document) { |
| ids := map[string]bool{} |
| |
| generateID := func(heading *markdown.Heading) string { |
| var buf bytes.Buffer |
| for _, inl := range heading.Text.Inline { |
| inl.PrintText(&buf) |
| } |
| f := func(c rune) bool { |
| return !('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z') && !('0' <= c && c <= '9') |
| } |
| str := strings.Join(strings.FieldsFunc(buf.String(), f), "-") |
| str = strings.ToLower(str) |
| if len(str) == 0 { |
| str = "heading" |
| } |
| key := str |
| for i := 1; ; i++ { |
| if _, ok := ids[key]; !ok { |
| ids[key] = true |
| break |
| } |
| key = fmt.Sprintf("%s-%d", str, i) |
| } |
| return "readme-" + key |
| } |
| |
| walkBlocks(doc.Blocks, func(b markdown.Block) error { |
| if heading, ok := b.(*markdown.Heading); ok { |
| id := generateID(heading) |
| heading.ID = string(id) |
| } |
| return nil |
| }) |
| } |