blob: c27a6a1b596231c50a40b15a1b14a0c833142355 [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package crawldocs splits crawled HTML pages into sections
// and inserts them into a document corpus.
//
// [Split] provides access to the HTML splitter;
// [Sync] and [Restart] implement the incremental
// splitting of crawled HTML into a document corpus.
package crawldocs
import (
"bytes"
"iter"
"strings"
htmlpkg "golang.org/x/net/html"
)
// A Section is an HTML document section,
// which is the text following an HTML heading
// with an anchor ID.
type Section struct {
Title string // title of heading
ID string // anchor ID of heading
Text string // text following heading
}
// Split returns an iterator over sections in html.
func Split(html []byte) iter.Seq[*Section] {
return func(yield func(*Section) bool) {
doc, err := htmlpkg.Parse(bytes.NewReader(html))
if err != nil {
// Unreachable: htmlpkg.Parse can only fail if there is a read error,
// which there won't be from bytes.NewReader,
// or if it hits one of the configured limits,
// but we haven't configured any,
// so we can assume there won't be an error.
// (There is no such thing as "bad" HTML 5.)
panic("crawldocs: internal error: HTML 5 parse failed: " + err.Error())
}
walkDoc(doc, yield)
}
}
// walkDoc walks the HTML document rooted at n looking for headings.
// When it finds one, it calls walkHeading to handle that section
// of the document.
func walkDoc(n *htmlpkg.Node, yield func(*Section) bool) bool {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if heading(c) >= 1 {
// Found headings.
return walkHeadings(c, yield)
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
if !walkDoc(c, yield) {
return false
}
}
return true
}
// walkHeading walks the headings starting at n
// and following through n's siblings, treating each
// as the potential start of a section.
// It yields each section that it encounters.
func walkHeadings(n *htmlpkg.Node, yield func(*Section) bool) bool {
// Accumulated text for section, which ends at next heading.
var titles [6]string
var text strings.Builder
var lastID string
// flush flushes the accumulated text.
flush := func(level int, id string) bool {
if level > 1 {
// Construct a title that gives the sequence of heading titles (h1 title > h2 title > ...).
title := titles[0]
for _, s := range titles[1:] {
if s != "" {
title += " > " + s
}
}
// Emit the section.
txt := strings.TrimSpace(text.String())
if txt != "" && lastID != "" {
if !yield(&Section{Title: title, ID: lastID, Text: txt}) {
return false
}
}
}
// Clear headings below the one we are adding now
// and reset the accumulated text.
clear(titles[level-1:])
text.Reset()
lastID = id
return true
}
// Walk siblings looking for headings, and emit text between them.
for c := n; c != nil; c = c.NextSibling {
if i := heading(c); i >= 1 {
if !flush(i, findAttr(c, "id")) {
return false
}
var buf strings.Builder
addText(&buf, c)
titles[i-1] = strings.ReplaceAll(buf.String(), "\n", " ")
continue
}
addText(&text, c)
}
// Pretend there's a final very deep heading to flush the last section.
return flush(len(titles)+1, "zzz")
}
// heading reports the heading level of the node n.
// If n is not a heading, it returns 0.
func heading(n *htmlpkg.Node) int {
if n.Type == htmlpkg.ElementNode {
if len(n.Data) == 2 && n.Data[0] == 'h' && '1' <= n.Data[1] && n.Data[1] <= '6' {
return int(n.Data[1] - '0')
}
}
return 0
}
// addText adds the text from n to buf.
func addText(buf *strings.Builder, n *htmlpkg.Node) {
if n.Type == htmlpkg.TextNode {
buf.WriteString(n.Data)
return
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
addText(buf, c)
}
}
// findAttr returns the value for n's attribute with the given name.
func findAttr(n *htmlpkg.Node, name string) string {
for _, a := range n.Attr {
if a.Key == name {
return a.Val
}
}
return ""
}