// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package crawl implements a basic web crawler for crawling a portion of a web site.
// Construct a [Crawler], configure it, and then call its [Crawler.Run] method.
// The crawler stores the crawled data in a [storage.DB], and then
// [Crawler.PageWatcher] can be used to watch for new pages.
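//
// A minimal configuration sketch, assuming lg, db, and ctx are a
// *slog.Logger, a storage.DB, and a context.Context obtained elsewhere
// (the go.dev URLs are illustrative only):
//
//	cr := crawl.New(lg, db, http.DefaultClient)
//	cr.Add("https://go.dev/")
//	cr.Allow("https://go.dev/")
//	if err := cr.Run(ctx); err != nil {
//		// handle the error
//	}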
package crawl
import (
"bytes"
"context"
"encoding/json"
"io"
"iter"
"log/slog"
"net/http"
"net/url"
"strings"
"time"
"golang.org/x/net/html"
"golang.org/x/oscar/internal/docs"
"golang.org/x/oscar/internal/htmlutil"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/storage/timed"
"rsc.io/ordered"
)
// This package stores timed entries in the database of the form:
//
// ["crawl.Page", URL] => [Raw(JSON(Page)), Raw(HTML)]
//
// The HTML is the raw HTML served at URL.
// Storing the raw HTML avoids having to re-download the site each time
// we change the way the HTML is processed.
// JSON and HTML are empty if the page has been found but not yet crawled.
const crawlKind = "crawl.Page"
const defaultRecrawl = 24 * time.Hour
// A Crawler is a basic web crawler.
//
// Note that this package does not load or process robots.txt.
// Instead, the assumption is that the site owner is crawling a portion of their own site
// and will configure the crawler appropriately.
// (In the case of Go's Oscar instance, we only crawl go.dev.)
type Crawler struct {
slog *slog.Logger
db storage.DB
http *http.Client
recrawl time.Duration
cleans []func(*url.URL) error
rules []rule
}
var _ docs.Source[*Page] = (*Crawler)(nil)
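// DocWatcherID is the name of the page watcher returned by [Crawler.DocWatcher].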
const DocWatcherID = "crawldocs"
// DocWatcher returns the page watcher with name "crawldocs".
// Implements [docs.Source.DocWatcher].
func (cr *Crawler) DocWatcher() *timed.Watcher[*Page] {
return cr.PageWatcher(DocWatcherID)
}
// ToDocs converts a crawled page to a list of embeddable documents,
// split into sections using [htmlutil.Split].
//
// Implements [docs.Source.ToDocs].
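//
// A sketch of consuming the result, where cr is a [Crawler] and p is a
// previously crawled *Page (fmt is used only for illustration):
//
//	if ds, ok := cr.ToDocs(p); ok {
//		for d := range ds {
//			fmt.Println(d.ID, d.Title)
//		}
//	}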
func (*Crawler) ToDocs(p *Page) (iter.Seq[*docs.Doc], bool) {
return func(yield func(*docs.Doc) bool) {
// TODO(rsc): We should probably delete the existing docs
// starting with p.URL# before embedding them.
for s := range htmlutil.Split(p.HTML) {
d := &docs.Doc{
ID: p.URL + "#" + s.ID,
Title: s.Title,
Text: s.Text,
}
if !yield(d) {
return
}
}
}, true
}
// A rule records whether URLs matching a given prefix may be crawled.
// See [Crawler.Allow] for more details.
type rule struct {
prefix string // URLs matching this prefix should be ...
allow bool // allowed or disallowed
}
// TODO(rsc): Store ETag and use to avoid redownloading?
// A Page records the result of crawling a single page.
type Page struct {
DBTime timed.DBTime
URL string // URL of page
From string // a page where we found the link to this one
LastCrawl time.Time // time of last crawl
Redirect string // HTTP redirect during fetch
HTML []byte // HTML content, if any
Error string // error fetching page, if any
}
var _ docs.Entry = (*Page)(nil)
// LastWritten implements [docs.Entry.LastWritten].
func (p *Page) LastWritten() timed.DBTime {
return p.DBTime
}
// A crawlPage is the JSON form of Page.
// The fields and field order of crawlPage and Page must match exactly; only the struct tags differ.
// We omit the DBTime, URL, and HTML fields from JSON, because they are encoded separately.
// Using this separate copy of the struct avoids forcing the internal JSON needs of this package
// onto clients using Page.
type crawlPage struct {
DBTime timed.DBTime `json:"-"`
URL string `json:"-"`
From string
LastCrawl time.Time
Redirect string
HTML []byte `json:"-"`
Error string
}
// New returns a new [Crawler] that uses the given logger, database, and HTTP client.
// The caller should configure the Crawler further by calling [Crawler.Add],
// [Crawler.Allow], [Crawler.Deny], [Crawler.Clean], and [Crawler.SetRecrawl].
// Once configured, the crawler can be run by calling [Crawler.Run].
func New(lg *slog.Logger, db storage.DB, hc *http.Client) *Crawler {
if hc != nil {
// We want a client that does not follow redirects,
// but we cannot modify the caller's http.Client directly.
// Instead, make our own copy and override CheckRedirect.
hc1 := *hc
hc = &hc1
hc.CheckRedirect = func(*http.Request, []*http.Request) error { return http.ErrUseLastResponse }
}
c := &Crawler{
slog: lg,
db: db,
http: hc,
recrawl: defaultRecrawl,
}
return c
}
// Add adds the URL to the list of roots for the crawl.
// The added URL must not include a URL fragment (#name).
// Adding a URL that is already in the database has no effect.
func (c *Crawler) Add(url string) {
if strings.Contains(url, "#") {
panic("crawl misuse: Add of URL with fragment")
}
if _, ok := c.Get(url); ok {
return
}
b := c.db.Batch()
c.set(b, &Page{URL: url})
b.Apply()
}
// SetRecrawl sets the time to wait before recrawling a page.
// The default is 24 hours.
func (c *Crawler) SetRecrawl(d time.Duration) {
c.recrawl = d
}
// decodePage decodes the timed.Entry into a Page.
func (c *Crawler) decodePage(e *timed.Entry) *Page {
var p Page
if err := ordered.Decode(e.Key, &p.URL); err != nil {
// unreachable unless database corruption
c.db.Panic("decode crawl.Page key", "key", storage.Fmt(e.Key), "err", err)
}
// The HTML is stored separately from the JSON describing the rest of the Page
// to avoid the bother and overhead of JSON-encoding the HTML.
var js, html ordered.Raw
if err := ordered.Decode(e.Val, &js, &html); err != nil {
// unreachable unless database corruption
c.db.Panic("decode crawl.Page val", "val", storage.Fmt(e.Val), "err", err)
}
if len(js) > 0 {
if err := json.Unmarshal(js, (*crawlPage)(&p)); err != nil {
// unreachable unless database corruption
c.db.Panic("decode crawl.Page js", "js", storage.Fmt(js), "err", err)
}
}
p.HTML = html
p.DBTime = e.ModTime
return &p
}
// Get returns the result of the most recent crawl for the given URL.
// If the URL is known to the crawler (queued or already crawled), Get returns a non-nil *Page, true.
// Otherwise, Get returns nil, false.
func (c *Crawler) Get(url string) (*Page, bool) {
e, ok := timed.Get(c.db, crawlKind, ordered.Encode(url))
if !ok {
return nil, false
}
return c.decodePage(e), true
}
// Set adds p to the crawled page database.
// It is typically only used for setting up tests.
func (c *Crawler) Set(p *Page) {
b := c.db.Batch()
c.set(b, p)
b.Apply()
}
// set records p in the batch b.
func (c *Crawler) set(b storage.Batch, p *Page) {
if strings.Contains(p.URL, "#") {
// Unreachable without logic bug in this package.
panic("crawl misuse: Set of URL with fragment")
}
timed.Set(c.db, b, crawlKind,
ordered.Encode(p.URL),
ordered.Encode(
ordered.Raw(storage.JSON((*crawlPage)(p))),
ordered.Raw(p.HTML)))
}
// Run crawls all the pages it can, returning when the entire site has been
// crawled, either during this run or within the recrawl interval set by
// [Crawler.SetRecrawl].
func (c *Crawler) Run(ctx context.Context) error {
// Crawl every page in the database.
// The pages-by-time list serves as a work queue,
// but if there are link loops we may end up writing a Page
// we've already processed, making it appear again in our scan.
// We use the crawled map to make sure we only crawl each page at most once.
// We use the queued map to make sure we only queue each found link at most once.
crawled := make(map[string]bool)
queued := make(map[string]bool)
for e := range timed.ScanAfter(c.slog, c.db, crawlKind, 0, nil) {
p := c.decodePage(e)
if time.Since(p.LastCrawl) < c.recrawl || crawled[p.URL] {
continue
}
crawled[p.URL] = true
c.crawlPage(ctx, queued, p)
}
return nil
}
// crawlPage downloads the content for a page,
// saves it, and then queues all links it can find in that page's HTML.
func (c *Crawler) crawlPage(ctx context.Context, queued map[string]bool, p *Page) {
var slogBody []byte
slog := c.slog.With("page", p.URL, "lastcrawl", p.LastCrawl)
if strings.Contains(p.URL, "#") {
// Unreachable without logic bug in this package.
panic("crawl misuse: crawlPage of URL with fragment")
}
b := c.db.Batch()
defer func() {
if p.Error != "" {
if slogBody != nil {
slog = slog.With("body", string(slogBody[:min(len(slogBody), 1<<10)]))
}
slog.Warn("crawl error", "err", p.Error, "last", p.LastCrawl)
}
c.set(b, p)
b.Apply()
c.db.Flush()
}()
p.LastCrawl = time.Now()
p.Redirect = ""
p.Error = ""
p.HTML = nil
base, err := url.Parse(p.URL)
if err != nil {
// Unreachable unless Page was corrupted.
p.Error = err.Error()
return
}
u := base.String()
slog = slog.With("url", u)
req, err := http.NewRequestWithContext(ctx, "GET", u, nil)
if err != nil {
// Unreachable unless url.String doesn't round-trip back to url.Parse.
p.Error = err.Error()
return
}
resp, err := c.http.Do(req)
if err != nil {
p.Error = err.Error()
return
}
// TODO(rsc): Make max body length adjustable by policy.
// Also set HTML tokenizer max? For now the max body length
// takes care of it for us.
const maxBody = 4 << 20
body, err := io.ReadAll(io.LimitReader(resp.Body, maxBody+1))
resp.Body.Close()
slogBody = body
if err != nil {
p.Error = err.Error()
return
}
if len(body) > maxBody {
p.Error = "body too big"
return
}
slog = slog.With("status", resp.Status)
if resp.StatusCode/10 == 30 { // Redirect
loc := resp.Header.Get("Location")
if loc == "" {
p.Error = "redirect without location"
return
}
slog = slog.With("location", loc)
locURL, err := url.Parse(loc)
if err != nil {
// Unreachable: http.Client.Do processes the Location header
// to decide about following redirects (we disable that, but that
// check only happens after the Location header is processed),
// and will return an error if the Location has a bad URL.
p.Error = err.Error()
return
}
link := base.ResolveReference(locURL)
p.Redirect = link.String()
slog.Info("crawl redirect", "link", p.Redirect)
c.queue(queued, b, link, u)
return
}
if resp.StatusCode != 200 {
p.Error = "http status " + resp.Status
return
}
slogBody = nil
ctype := resp.Header.Get("Content-Type")
if ctype != "text/html" && !strings.HasPrefix(ctype, "text/html;") {
slog = slog.With("content-type", ctype)
p.Error = "Content-Type: " + ctype
return
}
p.HTML = body
slog = slog.With("htmlsize", len(body))
doc, err := html.Parse(bytes.NewReader(body))
if err != nil {
// Unreachable because it's either a read error
// (but bytes.NewReader has no read errors)
// or hitting the max HTML token limit (but we didn't set that limit).
p.Error = "html parse error: " + err.Error()
return
}
for link := range links(slog, base, doc) {
if queued[link.String()] {
// Quiet skip to avoid tons of repetitive logging about
// all the links in the page footers.
// (Calling c.queue will skip too but also log.)
continue
}
slog.Info("crawl html link", "link", link)
c.queue(queued, b, link, u)
}
slog.Info("crawl ok")
}
// queue queues the link for crawling, unless it has already been queued.
// It records that the link came from a page with URL fromURL.
func (c *Crawler) queue(queued map[string]bool, b storage.Batch, link *url.URL, fromURL string) {
old := link.String()
if queued[old] {
return
}
queued[old] = true
if err := c.clean(link); err != nil {
c.slog.Info("crawl queue clean error", "url", old, "from", fromURL, "err", err)
return
}
targ := link.String()
if targ != old && queued[targ] {
c.slog.Info("crawl queue seen", "url", targ, "old", old, "from", fromURL)
return
}
queued[targ] = true
if !c.allowed(targ) {
c.slog.Info("crawl queue disallow after clean", "url", targ, "old", old, "from", fromURL)
return
}
if strings.Contains(targ, "#") {
// Unreachable without logic bug in this package.
panic("crawl misuse: queue of URL with fragment")
}
p := &Page{
URL: targ,
From: fromURL,
}
if old, ok := c.Get(targ); ok {
if time.Since(old.LastCrawl) < c.recrawl {
c.slog.Debug("crawl queue already visited", "url", targ, "last", old.LastCrawl)
return
}
old.From = p.From
p = old
}
c.slog.Info("crawl queue", "url", p.URL, "old", old)
c.set(b, p)
}
// links returns an iterator over all HTML links in the doc,
// interpreted relative to base.
// It logs unexpected bad URLs to slog.
func links(slog *slog.Logger, base *url.URL, doc *html.Node) iter.Seq[*url.URL] {
return func(yield func(*url.URL) bool) {
// Walk HTML looking for <a href=...>.
var yieldLinks func(*html.Node) bool
yieldLinks = func(n *html.Node) bool {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if !yieldLinks(c) {
return false
}
}
var targ string
if n.Type == html.ElementNode {
switch n.Data {
case "a":
targ = findAttr(n, "href")
}
}
// Ignore no target or #fragment.
if targ == "" || strings.HasPrefix(targ, "#") {
return true
}
// Parse target as URL.
u, err := url.Parse(targ)
if err != nil {
slog.Info("links bad url", "base", base.String(), "targ", targ, "err", err)
return true
}
return yield(base.ResolveReference(u))
}
yieldLinks(doc)
}
}
// findAttr returns the value for n's attribute with the given name.
func findAttr(n *html.Node, name string) string {
for _, a := range n.Attr {
if a.Key == name {
return a.Val
}
}
return ""
}
// Clean adds a cleaning function to the crawler's list of cleaners.
// Each time the crawler considers queuing a URL to be crawled,
// it calls the cleaning functions to canonicalize or otherwise clean the URL first.
// A cleaning function might remove unnecessary URL parameters or
// canonicalize host names or paths.
// The Crawler automatically removes any URL fragment before applying registered cleaners.
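//
// A sketch of a cleaner that strips a hypothetical tracking parameter
// (the parameter name "utm_source" is illustrative only):
//
//	cr.Clean(func(u *url.URL) error {
//		q := u.Query()
//		q.Del("utm_source")
//		u.RawQuery = q.Encode()
//		return nil
//	})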
func (c *Crawler) Clean(clean func(*url.URL) error) {
c.cleans = append(c.cleans, clean)
}
// allowed reports whether c's configuration allows the target URL.
func (c *Crawler) allowed(targ string) bool {
allow := false
n := 0
for _, r := range c.rules {
if n <= len(r.prefix) && hasPrefix(targ, r.prefix) {
allow = r.allow
n = len(r.prefix)
}
}
return allow
}
// Allow records that the crawler is allowed to crawl URLs with the given list of prefixes.
// A URL is considered to match a prefix if one of the following is true:
//
// - The URL is exactly the prefix.
// - The URL begins with the prefix, and the prefix ends in /.
// - The URL begins with the prefix, and the next character in the URL is / or ?.
//
// The companion function [Crawler.Deny] records that the crawler is not allowed to
// crawl URLs with a list of prefixes. When deciding whether a URL can be crawled,
// longer prefixes take priority over shorter prefixes.
// If the same prefix is added to both [Crawler.Allow] and [Crawler.Deny],
// the last call wins. The default outcome is that a URL is not
// allowed to be crawled.
//
// For example, consider this call sequence:
//
// c.Allow("https://go.dev/a/")
// c.Allow("https://go.dev/a/b/c")
// c.Deny("https://go.dev/a/b")
//
// Given these rules, the crawler makes the following decisions about these URLs:
//
// - https://go.dev/a: not allowed
// - https://go.dev/a/: allowed
// - https://go.dev/a/?x=1: allowed
// - https://go.dev/a/x: allowed
// - https://go.dev/a/b: not allowed
// - https://go.dev/a/b/x: not allowed
// - https://go.dev/a/b/c: allowed
// - https://go.dev/a/b/c/x: allowed
// - https://go.dev/x: not allowed
func (c *Crawler) Allow(prefix ...string) {
for _, p := range prefix {
c.rules = append(c.rules, rule{p, true})
}
}
// Deny records that the crawler is not allowed to crawl URLs with the given list of prefixes.
// See the [Crawler.Allow] documentation for details about prefixes and interactions with Allow.
func (c *Crawler) Deny(prefix ...string) {
for _, p := range prefix {
c.rules = append(c.rules, rule{p, false})
}
}
// hasPrefix reports whether targ is considered to have the given prefix,
// following the rules documented in [Crawler.Allow]'s doc comment.
func hasPrefix(targ, prefix string) bool {
if !strings.HasPrefix(targ, prefix) {
return false
}
if len(targ) == len(prefix) || prefix != "" && prefix[len(prefix)-1] == '/' {
return true
}
switch targ[len(prefix)] {
case '/', '?':
return true
}
return false
}
// clean removes the URL Fragment and then calls the registered cleaners on u.
// If any cleaner returns an error, clean returns that error and does not run any more cleaners.
func (c *Crawler) clean(u *url.URL) error {
u.Fragment = ""
for _, fn := range c.cleans {
if err := fn(u); err != nil {
return err
}
}
return nil
}
// PageWatcher returns a timed.Watcher over Pages that the Crawler
// has stored in its database.
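//
// A sketch of watching for new pages, assuming the [timed.Watcher] Recent and
// MarkOld methods behave as they do elsewhere in this project (the watcher
// name "pages" and the process function are hypothetical):
//
//	w := cr.PageWatcher("pages")
//	for p := range w.Recent() {
//		process(p)
//		w.MarkOld(p.DBTime)
//	}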
func (c *Crawler) PageWatcher(name string) *timed.Watcher[*Page] {
return timed.NewWatcher(c.slog, c.db, name, crawlKind, c.decodePage)
}