| // Copyright 2024 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package crawldocs splits crawled HTML pages into sections |
| // and inserts them into a document corpus. |
| // |
| // [Split] provides access to the HTML splitter; |
| // [Sync] and [Restart] implement the incremental |
| // splitting of crawled HTML into a document corpus. |
| package crawldocs |
| |
| import ( |
| "bytes" |
| "iter" |
| "strings" |
| |
| htmlpkg "golang.org/x/net/html" |
| ) |
| |
// Section describes one section of an HTML document:
// the text that follows a heading carrying an anchor ID.
type Section struct {
	Title string // heading title; nested headings are joined as "h1 title > h2 title > ..."
	ID    string // anchor ID (id attribute) of the heading that opens the section
	Text  string // whitespace-trimmed text between this heading and the next one
}
| |
| // Split returns an iterator over sections in html. |
| func Split(html []byte) iter.Seq[*Section] { |
| return func(yield func(*Section) bool) { |
| doc, err := htmlpkg.Parse(bytes.NewReader(html)) |
| if err != nil { |
| // Unreachable: htmlpkg.Parse can only fail if there is a read error, |
| // which there won't be from bytes.NewReader, |
| // or if it hits one of the configured limits, |
| // but we haven't configured any, |
| // so we can assume there won't be an error. |
| // (There is no such thing as "bad" HTML 5.) |
| panic("crawldocs: internal error: HTML 5 parse failed: " + err.Error()) |
| } |
| walkDoc(doc, yield) |
| } |
| } |
| |
| // walkDoc walks the HTML document rooted at n looking for headings. |
| // When it finds one, it calls walkHeading to handle that section |
| // of the document. |
| func walkDoc(n *htmlpkg.Node, yield func(*Section) bool) bool { |
| for c := n.FirstChild; c != nil; c = c.NextSibling { |
| if heading(c) >= 1 { |
| // Found headings. |
| return walkHeadings(c, yield) |
| } |
| } |
| for c := n.FirstChild; c != nil; c = c.NextSibling { |
| if !walkDoc(c, yield) { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // walkHeading walks the headings starting at n |
| // and following through n's siblings, treating each |
| // as the potential start of a section. |
| // It yields each section that it encounters. |
| func walkHeadings(n *htmlpkg.Node, yield func(*Section) bool) bool { |
| // Accumulated text for section, which ends at next heading. |
| var titles [6]string |
| var text strings.Builder |
| var lastID string |
| |
| // flush flushes the accumulated text. |
| flush := func(level int, id string) bool { |
| if level > 1 { |
| // Construct a title that gives the sequence of heading titles (h1 title > h2 title > ...). |
| title := titles[0] |
| for _, s := range titles[1:] { |
| if s != "" { |
| title += " > " + s |
| } |
| } |
| |
| // Emit the section. |
| txt := strings.TrimSpace(text.String()) |
| if txt != "" && lastID != "" { |
| if !yield(&Section{Title: title, ID: lastID, Text: txt}) { |
| return false |
| } |
| } |
| } |
| |
| // Clear headings below the one we are adding now |
| // and reset the accumulated text. |
| clear(titles[level-1:]) |
| text.Reset() |
| lastID = id |
| return true |
| } |
| |
| // Walk siblings looking for headings, and emit text between them. |
| for c := n; c != nil; c = c.NextSibling { |
| if i := heading(c); i >= 1 { |
| if !flush(i, findAttr(c, "id")) { |
| return false |
| } |
| var buf strings.Builder |
| addText(&buf, c) |
| titles[i-1] = strings.ReplaceAll(buf.String(), "\n", " ") |
| continue |
| } |
| addText(&text, c) |
| } |
| |
| // Pretend there's a final very deep heading to flush the last section. |
| return flush(len(titles)+1, "zzz") |
| } |
| |
| // heading reports the heading level of the node n. |
| // If n is not a heading, it returns 0. |
| func heading(n *htmlpkg.Node) int { |
| if n.Type == htmlpkg.ElementNode { |
| if len(n.Data) == 2 && n.Data[0] == 'h' && '1' <= n.Data[1] && n.Data[1] <= '6' { |
| return int(n.Data[1] - '0') |
| } |
| } |
| return 0 |
| } |
| |
| // addText adds the text from n to buf. |
| func addText(buf *strings.Builder, n *htmlpkg.Node) { |
| if n.Type == htmlpkg.TextNode { |
| buf.WriteString(n.Data) |
| return |
| } |
| for c := n.FirstChild; c != nil; c = c.NextSibling { |
| addText(buf, c) |
| } |
| } |
| |
| // findAttr returns the value for n's attribute with the given name. |
| func findAttr(n *htmlpkg.Node, name string) string { |
| for _, a := range n.Attr { |
| if a.Key == name { |
| return a.Val |
| } |
| } |
| return "" |
| } |