// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package crawldocs adds crawled HTML pages to a [docs.Corpus],
// splitting each page into sections.
package crawldocs

import (
	"context"
	"log/slog"

	"golang.org/x/oscar/internal/crawl"
	"golang.org/x/oscar/internal/docs"
	"golang.org/x/oscar/internal/storage/timed"
)

// Sync reads new HTML pages from cr, splits them into sections using [Split],
// and adds each page section's text to the corpus dc.
//
// Sync uses [crawl.Crawler.PageWatcher] with the name "crawldocs"
// to save its position across multiple calls.
//
// Sync logs status and unexpected problems to lg.
//
// Sync makes no use of its context.
func Sync(ctx context.Context, lg *slog.Logger, dc *docs.Corpus, cr *crawl.Crawler) error {
	w := cr.PageWatcher("crawldocs")
	for p := range w.Recent() {
		lg.Debug("crawldocs sync", "page", p.URL, "dbtime", p.DBTime)
		// TODO(rsc): We should probably delete the existing docs
		// starting with p.URL#.
		for s := range Split(p.HTML) {
			dc.Add(p.URL+"#"+s.ID, s.Title, s.Text)
		}
		w.MarkOld(p.DBTime)
	}
	return nil
}
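
// A minimal usage sketch (an assumption, not part of this package): given a
// [crawl.Crawler] cr and a [docs.Corpus] dc constructed elsewhere, a caller
// might run Sync after each crawl and log any failure:
//
//	if err := crawldocs.Sync(ctx, lg, dc, cr); err != nil {
//		lg.Error("crawldocs sync failed", "err", err)
//	}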

// Restart restarts the "crawldocs" page watcher,
// so that a future call to [Sync] will reprocess all documents.
// Calling Restart may be necessary after changing [Split],
// so that existing pages are re-split and their sections re-added.
//
// Restart makes no use of its context.
func Restart(ctx context.Context, lg *slog.Logger, cr *crawl.Crawler) {
	cr.PageWatcher("crawldocs").Restart()
}
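
// A hypothetical follow-up (a sketch; dc and the other arguments are assumed
// to exist as in the Sync example above): after deploying a changed Split,
// restart the watcher and then run Sync so every page is re-split:
//
//	crawldocs.Restart(ctx, lg, cr)
//	if err := crawldocs.Sync(ctx, lg, dc, cr); err != nil {
//		lg.Error("crawldocs resync failed", "err", err)
//	}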

// Latest returns the latest known DBTime marked old
// by the "crawldocs" page watcher.
func Latest(cr *crawl.Crawler) timed.DBTime {
	return cr.PageWatcher("crawldocs").Latest()
}
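
// A small illustrative use (an assumption, not from this package): log the
// sync position so an operator can see how far Sync has progressed:
//
//	lg.Info("crawldocs position", "latest", crawldocs.Latest(cr))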