| // Copyright 2019 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package postgres |
| |
| import ( |
| "context" |
| "database/sql" |
| "fmt" |
| "io" |
| "sort" |
| "strings" |
| "time" |
| |
| "github.com/lib/pq" |
| "go.opencensus.io/plugin/ochttp" |
| "go.opencensus.io/stats" |
| "go.opencensus.io/stats/view" |
| "go.opencensus.io/tag" |
| "go.opencensus.io/trace" |
| "golang.org/x/pkgsite/internal" |
| "golang.org/x/pkgsite/internal/database" |
| "golang.org/x/pkgsite/internal/dcensus" |
| "golang.org/x/pkgsite/internal/derrors" |
| "golang.org/x/pkgsite/internal/experiment" |
| "golang.org/x/pkgsite/internal/log" |
| "golang.org/x/pkgsite/internal/postgres/symbolsearch" |
| "golang.org/x/pkgsite/internal/stdlib" |
| ) |
| |
| var ( |
	// searchLatency holds the observed latency of individual search queries.
| searchLatency = stats.Float64( |
| "go-discovery/search/latency", |
| "Latency of a search query.", |
| stats.UnitMilliseconds, |
| ) |
| // keySearchSource is a census tag for search query types. |
| keySearchSource = tag.MustNewKey("search.source") |
| // SearchLatencyDistribution aggregates search request latency by search |
| // query type. |
| SearchLatencyDistribution = &view.View{ |
| Name: "go-discovery/search/latency", |
| Measure: searchLatency, |
| Aggregation: ochttp.DefaultLatencyDistribution, |
| Description: "Search latency, by result source query type.", |
| TagKeys: []tag.Key{keySearchSource}, |
| } |
| // SearchResponseCount counts search responses by search query type. |
| SearchResponseCount = &view.View{ |
| Name: "go-discovery/search/count", |
| Measure: searchLatency, |
| Aggregation: view.Count(), |
| Description: "Search count, by result source query type.", |
| TagKeys: []tag.Key{keySearchSource}, |
| } |
| ) |
| |
// searchResponse is used for internal bookkeeping when fanning out a search
// request to multiple search queries.
| type searchResponse struct { |
| // source is a unique identifier for the search query type (e.g. 'deep', |
| // 'popular'), to be used in logging and reporting. |
| source string |
| // results are partially filled out from only the search_documents table. |
| results []*SearchResult |
| // err indicates a technical failure of the search query, or that results are |
| // not provably complete. |
| err error |
| } |
| |
| // searchEvent is used to log structured information about search events for |
| // later analysis. A 'search event' occurs when a searcher or count estimate |
| // returns. |
| type searchEvent struct { |
| // Type is either the searcher name or 'estimate' (the count estimate). |
| Type string |
	// Latency is the duration that the operation took.
| Latency time.Duration |
| // Err is the error returned by the operation, if any. |
| Err error |
| } |
| |
| // A searcher is used to execute a single search request. |
| type searcher func(db *DB, ctx context.Context, q string, limit, offset, maxResultCount int) searchResponse |
| |
| // The pkgSearchers used by Search. |
| var pkgSearchers = map[string]searcher{ |
| "popular": (*DB).popularSearch, |
| "deep": (*DB).deepSearch, |
| } |
| |
// The symbolSearchers used by Search when searching for symbols.
var symbolSearchers = map[string]searcher{
| "symbol": (*DB).symbolSearch, |
| } |
| |
// SearchOptions specifies options for a call to DB.Search.
type SearchOptions struct {
| // Maximum number of results to return (page size). |
| MaxResults int |
| // Offset for DB query. |
| Offset int |
| // Maximum number to use for total result count. |
| MaxResultCount int |
| // If true, perform a symbol search. |
| SearchSymbols bool |
| } |
| |
| // SearchResult represents a single search result from SearchDocuments. |
| type SearchResult struct { |
| Name string |
| PackagePath string |
| ModulePath string |
| Version string |
| Synopsis string |
| Licenses []string |
| |
| CommitTime time.Time |
| |
	// Score is used to sort items in a slice of SearchResults.
| Score float64 |
| |
| // NumImportedBy is the number of packages that import PackagePath. |
| NumImportedBy uint64 |
| |
| // SameModule is a list of SearchResults from the same module as this one, |
| // with lower scores. |
| SameModule []*SearchResult |
| |
| // OtherMajor is a set of module paths with the same series path but at |
| // different major versions of this module. |
| OtherMajor map[string]bool |
| |
| // NumResults is the total number of packages that were returned for this |
| // search. |
| NumResults uint64 |
| |
	// Approximate reports whether NumResults is an approximate count. NumResults
	// can be approximate if the search scanned only a subset of documents, in
	// which case the result count is estimated using the hyperloglog algorithm.
	Approximate bool
| |
| // Symbol information returned by a search request. |
| // Only populated for symbol search mode. |
| SymbolName string |
| SymbolKind internal.SymbolKind |
| SymbolSynopsis string |
| SymbolGOOS string |
| SymbolGOARCH string |
| |
| // Offset is the 0-based number of this row in the DB query results, which |
| // is the value to use in a SQL OFFSET clause to have this row be the first |
| // one returned. |
| Offset int |
| } |
| |
| // Search executes two search requests concurrently: |
| // - a sequential scan of packages in descending order of popularity. |
| // - all packages ("deep" search) using an inverted index to filter to search |
| // terms. |
| // |
| // The sequential scan takes significantly less time when searching for very |
| // common terms (e.g. "errors", "cloud", or "kubernetes"), due to its ability |
| // to exit early once the requested page of search results is provably |
| // complete. |
| // |
// Because 0 <= ts_rank() <= 1, the score of any unscanned package is at most
// ln(e+N), where N is the imported_by_count of the package we are currently
// considering. Therefore, if the lowest-scoring result of popular search is
// greater than ln(e+N), we know that we haven't missed any results and can
// return the search results immediately, cancelling other searches.
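//
// In code form, the early-exit check is roughly (an illustrative sketch only;
// the real check is implemented in the popular_search SQL function):
//
//	// bound is the best score any unscanned package could achieve.
//	bound := math.Log(math.E + float64(nextImportedByCount))
//	if len(page) == limit && page[len(page)-1].Score > bound {
//		// The page is provably complete; cancel the deep search.
//	}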
| // |
| // On the other hand, if the popular search is slow, it is likely that the |
| // search term is infrequent, and deep search will be fast due to our inverted |
| // gin index on search tokens. |
| // |
| // The gap in this optimization is search terms that are very frequent, but |
| // rarely relevant: "int" or "package", for example. In these cases we'll pay |
| // the penalty of a deep search that scans nearly every package. |
| func (db *DB) Search(ctx context.Context, q string, opts SearchOptions) (_ []*SearchResult, err error) { |
| defer derrors.WrapStack(&err, "DB.Search(ctx, %q, %+v)", q, opts) |
| if experiment.IsActive(ctx, internal.ExperimentSearchGrouping) && !opts.SearchSymbols { |
| const ( |
| limitMultiplier1 = 3 |
| limitMultiplier2 = 5 |
| ) |
| // Limit search to more rows than the requested number of results, so |
| // that it can find other packages in the modules it selects. |
| srs, err := db.search(ctx, q, opts, limitMultiplier1*opts.MaxResults) |
| if err != nil { |
| return nil, err |
| } |
| if len(srs) >= opts.MaxResults || numRows(srs) <= limitMultiplier1*opts.MaxResults { |
| return srs, nil |
| } |
| // Grouped search didn't find enough results, but there are more |
| // rows that could potentially match. Try one more time, with a |
| // larger limit. |
| return db.search(ctx, q, opts, limitMultiplier2*opts.MaxResults) |
| } |
| return db.search(ctx, q, opts, opts.MaxResults) |
| } |
| |
| func (db *DB) search(ctx context.Context, q string, opts SearchOptions, limit int) (_ []*SearchResult, err error) { |
| defer derrors.WrapStack(&err, "search(limit=%d)", limit) |
| |
| var searchers map[string]searcher |
| if opts.SearchSymbols && |
| experiment.IsActive(ctx, internal.ExperimentSearchGrouping) && |
| experiment.IsActive(ctx, internal.ExperimentSymbolSearch) { |
| searchers = symbolSearchers |
| } else { |
| searchers = pkgSearchers |
| } |
| resp, err := db.hedgedSearch(ctx, q, limit, opts.Offset, opts.MaxResultCount, searchers, nil) |
| if err != nil { |
| return nil, err |
| } |
| // Filter out excluded paths. |
| var results []*SearchResult |
| for _, r := range resp.results { |
| ex, err := db.IsExcluded(ctx, r.PackagePath) |
| if err != nil { |
| return nil, err |
| } |
| if !ex { |
| results = append(results, r) |
| } |
| } |
| if experiment.IsActive(ctx, internal.ExperimentSearchGrouping) && !opts.SearchSymbols { |
| results = groupSearchResults(results) |
| } |
| if len(results) > opts.MaxResults { |
| results = results[:opts.MaxResults] |
| } |
| return results, nil |
| } |
| |
| // Penalties to search scores, applied as multipliers to the score. |
| const ( |
| // Module license is non-redistributable. |
| nonRedistributablePenalty = 0.5 |
| // Module does not have a go.mod file. |
| // Start this off gently (close to 1), but consider lowering |
| // it as time goes by and more of the ecosystem converts to modules. |
| noGoModPenalty = 0.8 |
| ) |
| |
| // scoreExpr is the expression that computes the search score. |
| // It is the product of: |
// - The Postgres ts_rank score, based on the relevance of the document to the query.
| // - The log of the module's popularity, estimated by the number of importing packages. |
| // The log factor contains exp(1) so that it is always >= 1. Taking the log |
| // of imported_by_count instead of using it directly makes the effect less |
| // dramatic: being 2x as popular only has an additive effect. |
| // - A penalty factor for non-redistributable modules, since a lot of |
| // details cannot be displayed. |
| // The first argument to ts_rank is an array of weights for the four tsvector sections, |
| // in the order D, C, B, A. |
| // The weights below match the defaults except for B. |
| var scoreExpr = fmt.Sprintf(` |
| ts_rank('{0.1, 0.2, 1.0, 1.0}', tsv_search_tokens, websearch_to_tsquery($1)) * |
| ln(exp(1)+imported_by_count) * |
| CASE WHEN redistributable THEN 1 ELSE %f END * |
| CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE %f END |
| `, nonRedistributablePenalty, noGoModPenalty) |
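
// As a worked example with illustrative numbers: a document with ts_rank 0.5
// and 1000 importers in a redistributable module with a go.mod file scores
// 0.5 * ln(e+1000) ≈ 3.46, while the same document without a go.mod file
// scores about 0.8 * 3.46 ≈ 2.76.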
| |
| // hedgedSearch executes multiple search methods and returns the first |
| // available result. |
| // The optional guardTestResult func may be used to allow tests to control the |
| // order in which search results are returned. |
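//
// For example, a test could hold back the "deep" response until the "popular"
// one has been delivered (a sketch; popularDone is a hypothetical channel
// owned by the test):
//
//	guard := func(source string) func() {
//		if source == "deep" {
//			<-popularDone // block deep until popular has been sent
//		}
//		return func() {
//			if source == "popular" {
//				close(popularDone) // runs after the popular response is sent
//			}
//		}
//	}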
| func (db *DB) hedgedSearch(ctx context.Context, q string, limit, offset, maxResultCount int, searchers map[string]searcher, guardTestResult func(string) func()) (_ *searchResponse, err error) { |
| defer derrors.WrapStack(&err, "hedgedSearch(ctx, %q, %d, %d, %d)", q, limit, offset, maxResultCount) |
| |
| searchStart := time.Now() |
| responses := make(chan searchResponse, len(searchers)) |
| // cancel all unfinished searches when a result (or error) is returned. The |
| // effectiveness of this depends on the database driver. |
| searchCtx, cancel := context.WithCancel(ctx) |
| defer cancel() |
| |
| // Fan out our search requests. |
| for _, s := range searchers { |
| s := s |
| go func() { |
| start := time.Now() |
| resp := s(db, searchCtx, q, limit, offset, maxResultCount) |
| log.Debug(ctx, searchEvent{ |
| Type: resp.source, |
| Latency: time.Since(start), |
| Err: resp.err, |
| }) |
| if guardTestResult != nil { |
| defer guardTestResult(resp.source)() |
| } |
| responses <- resp |
| }() |
| } |
	// Note for future readers: in previous iterations of this code we kept
	// reading responses if the first one had an error, with the goal of
	// minimizing the error ratio. That didn't behave well when Postgres was
	// overloaded.
| resp := <-responses |
| if resp.err != nil { |
| return nil, fmt.Errorf("%q search failed: %v", resp.source, resp.err) |
| } |
| // cancel proactively here: we've got the search result we need. |
| cancel() |
	// latency is only recorded for valid search results, as fast failures could
	// skew the latency distribution.
	// Note that this latency measurement might differ meaningfully from the
	// per-searcher latency logged above, if time was spent waiting for the
	// result count estimate.
| stats.RecordWithTags(ctx, |
| []tag.Mutator{tag.Upsert(keySearchSource, resp.source)}, |
| dcensus.MDur(searchLatency, time.Since(searchStart))) |
| // To avoid fighting with the query planner, our searches only hit the |
| // search_documents table and we enrich after getting the results. In the |
| // future, we may want to fully denormalize and put all search data in the |
| // search_documents table. |
| if _, ok := searchers["symbol"]; !ok { |
| if err := db.addPackageDataToSearchResults(ctx, resp.results); err != nil { |
| return nil, err |
| } |
| } |
| return &resp, nil |
| } |
| |
// hllRegisterCount is the number of registers used by the hyperloglog
// algorithm when estimating the total number of search results. It must be a
// power of two, because upsertSearchStatement computes a row's register as
// hll_hash(package_path) & (hllRegisterCount - 1).
const hllRegisterCount = 128
| |
| // deepSearch searches all packages for the query. It is slower, but results |
| // are always valid. |
| func (db *DB) deepSearch(ctx context.Context, q string, limit, offset, maxResultCount int) searchResponse { |
| query := fmt.Sprintf(` |
| SELECT *, COUNT(*) OVER() AS total |
| FROM ( |
| SELECT |
| package_path, |
| version, |
| module_path, |
| commit_time, |
| imported_by_count, |
| (%s) AS score |
| FROM |
| search_documents |
| WHERE tsv_search_tokens @@ websearch_to_tsquery($1) |
| ORDER BY |
| score DESC, |
| commit_time DESC, |
| package_path |
| ) r |
| WHERE r.score > 0.1 |
| LIMIT $2 |
| OFFSET $3`, scoreExpr) |
| |
| var ( |
| results []*SearchResult |
| err error |
| ) |
| if experiment.IsActive(ctx, internal.ExperimentSearchIncrementally) { |
| modulePaths := map[string]bool{} |
| const pageSize = 10 // TODO(jba): get from elsewhere |
| additionalRows := 10 // after reaching pageSize module paths |
| collect := func(rows *sql.Rows) error { |
| var r SearchResult |
| if err := rows.Scan(&r.PackagePath, &r.Version, &r.ModulePath, &r.CommitTime, |
| &r.NumImportedBy, &r.Score, &r.NumResults); err != nil { |
| return fmt.Errorf("rows.Scan(): %v", err) |
| } |
| results = append(results, &r) |
| // Stop a few rows after we've seen pageSize module paths. |
| modulePaths[r.ModulePath] = true |
| if len(modulePaths) >= pageSize { |
| additionalRows-- |
| if additionalRows <= 0 { |
| return io.EOF |
| } |
| } |
| return nil |
| } |
| const fetchSize = 20 // number of rows to fetch at a time |
| err = db.db.RunQueryIncrementally(ctx, query, fetchSize, collect, q, limit, offset) |
| } else { |
| collect := func(rows *sql.Rows) error { |
| var r SearchResult |
| if err := rows.Scan(&r.PackagePath, &r.Version, &r.ModulePath, &r.CommitTime, |
| &r.NumImportedBy, &r.Score, &r.NumResults); err != nil { |
| return fmt.Errorf("rows.Scan(): %v", err) |
| } |
| results = append(results, &r) |
| return nil |
| } |
| err = db.db.RunQuery(ctx, query, collect, q, limit, offset) |
| } |
| if err != nil { |
| results = nil |
| } |
| for i, r := range results { |
| r.Offset = offset + i |
| } |
| if len(results) > 0 && results[0].NumResults > uint64(maxResultCount) { |
| for _, r := range results { |
| r.NumResults = uint64(maxResultCount) |
| } |
| } |
| return searchResponse{ |
| source: "deep", |
| results: results, |
| err: err, |
| } |
| } |
| |
| func (db *DB) popularSearch(ctx context.Context, searchQuery string, limit, offset, maxResultCount int) searchResponse { |
| query := ` |
| SELECT |
| package_path, |
| version, |
| module_path, |
| commit_time, |
| imported_by_count, |
| score |
| FROM popular_search($1, $2, $3, $4, $5)` |
| var results []*SearchResult |
| collect := func(rows *sql.Rows) error { |
| var r SearchResult |
| if err := rows.Scan(&r.PackagePath, &r.Version, &r.ModulePath, &r.CommitTime, |
| &r.NumImportedBy, &r.Score); err != nil { |
| return fmt.Errorf("rows.Scan(): %v", err) |
| } |
| results = append(results, &r) |
| return nil |
| } |
| err := db.db.RunQuery(ctx, query, collect, searchQuery, limit, offset, nonRedistributablePenalty, noGoModPenalty) |
| if err != nil { |
| results = nil |
| } |
| numResults := maxResultCount |
| if offset+limit > maxResultCount || len(results) < limit { |
		// It is practically impossible that len(results) < limit, because popular
		// search will never linearly scan everything before deep search completes.
		// But to be theoretically correct: if our results are partial, we know
		// that we have exhausted all matches, so the count is exact.
| numResults = offset + len(results) |
| } |
| for i, r := range results { |
| r.Offset = offset + i |
| r.NumResults = uint64(numResults) |
| } |
| return searchResponse{ |
| source: "popular", |
| results: results, |
| err: err, |
| } |
| } |
| |
// addPackageDataToSearchResults adds information that is not stored in the
// search_documents table to the given SearchResults.
| func (db *DB) addPackageDataToSearchResults(ctx context.Context, results []*SearchResult) (err error) { |
| defer derrors.WrapStack(&err, "DB.addPackageDataToSearchResults(results)") |
| if len(results) == 0 { |
| return nil |
| } |
| var ( |
| keys []string |
| // resultMap tracks PackagePath->SearchResult, to allow joining with the |
| // returned package data. |
| resultMap = make(map[string]*SearchResult) |
| ) |
| for _, r := range results { |
| resultMap[r.PackagePath] = r |
| key := fmt.Sprintf("(%s, %s, %s)", pq.QuoteLiteral(r.PackagePath), |
| pq.QuoteLiteral(r.Version), pq.QuoteLiteral(r.ModulePath)) |
| keys = append(keys, key) |
| } |
| query := fmt.Sprintf(` |
| SELECT |
| p.path, |
| u.name, |
| d.synopsis, |
| u.license_types, |
| u.redistributable |
| FROM |
| units u |
| INNER JOIN |
| paths p |
| ON u.path_id = p.id |
| INNER JOIN |
| modules m |
| ON u.module_id = m.id |
| LEFT JOIN |
| documentation d |
| ON u.id = d.unit_id |
| WHERE |
| (p.path, m.version, m.module_path) IN (%s)`, strings.Join(keys, ",")) |
| collect := func(rows *sql.Rows) error { |
| var ( |
| path, name, synopsis string |
| licenseTypes []string |
| redist bool |
| ) |
| if err := rows.Scan(&path, &name, database.NullIsEmpty(&synopsis), pq.Array(&licenseTypes), &redist); err != nil { |
| return fmt.Errorf("rows.Scan(): %v", err) |
| } |
| r, ok := resultMap[path] |
| if !ok { |
| return fmt.Errorf("BUG: unexpected package path: %q", path) |
| } |
| r.Name = name |
| if redist || db.bypassLicenseCheck { |
| r.Synopsis = synopsis |
| } |
| for _, l := range licenseTypes { |
| if l != "" { |
| r.Licenses = append(r.Licenses, l) |
| } |
| } |
| r.Licenses = sortAndDedup(r.Licenses) |
| return nil |
| } |
| return db.db.RunQuery(ctx, query, collect) |
| } |
| |
| func sortAndDedup(s []string) []string { |
| var r []string |
| m := map[string]bool{} |
| for _, x := range s { |
| m[x] = true |
| } |
| for x := range m { |
| r = append(r, x) |
| } |
| sort.Strings(r) |
| return r |
| } |
| |
| // groupSearchResults groups and re-orders the list of SearchResults by module |
| // and series path and returns a new list of SearchResults. |
| // |
| // The second and later packages from a module are grouped under the first package, |
| // and removed from the top-level list. |
| // |
| // Higher major versions of a module are put before lower ones. |
| // |
| // Packages from lower major versions of the module are grouped under the first |
| // package of the highest major version. But they are not removed from the |
| // top-level list. |
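//
// For example, given results ordered a.com/m/v2/p1, a.com/m/v2/p2, a.com/m/p3
// (hypothetical paths), the returned list is [p1, p3]: p2 is grouped under
// p1.SameModule, and module a.com/m appears in p1.OtherMajor while p3 remains
// in the top-level list.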
| func groupSearchResults(rs []*SearchResult) []*SearchResult { |
| modules := map[string]*SearchResult{} // module path to first result |
| series := map[string]*SearchResult{} // series path to result with max major version |
| var results []*SearchResult |
| for _, r := range rs { |
| f := modules[r.ModulePath] |
| if f == nil { |
| // First result (package) with this module path; remember it and |
| // keep it. |
| modules[r.ModulePath] = r |
| results = append(results, r) |
| } else { |
| // Record this result under the first result. |
| f.SameModule = append(f.SameModule, r) |
| } |
| |
| seriesPath, vr := internal.SeriesPathAndMajorVersion(r.ModulePath) |
| f = series[seriesPath] |
| if f == nil { |
| // First time we've seen anything from this series: remember it. |
| r.OtherMajor = map[string]bool{} |
| series[seriesPath] = r |
| } else if r.ModulePath != f.ModulePath { |
| // Result is from a different major version. |
| // Record the larger one, and give it a higher score. |
| _, vf := internal.SeriesPathAndMajorVersion(f.ModulePath) |
| if vr > vf { |
| series[seriesPath] = r |
| r.OtherMajor = f.OtherMajor |
| f.OtherMajor = nil |
| r.OtherMajor[f.ModulePath] = true |
| if f.Score > r.Score { |
| r.Score = f.Score + 1e-5 |
| } |
| } else { |
| f.OtherMajor[r.ModulePath] = true |
| } |
| } |
| } |
| // Re-sort by score, since we may have changed some. |
| sort.Slice(results, func(i, j int) bool { |
| return results[i].Score > results[j].Score |
| }) |
| return results |
| } |
| |
// numRows counts the number of DB rows represented by a slice of
// SearchResults: grouping nests some rows inside a SearchResult's SameModule
// list.
| func numRows(rs []*SearchResult) int { |
| n := 0 |
| for _, r := range rs { |
| n += 1 + len(r.SameModule) |
| } |
| return n |
| } |
| |
| var upsertSearchStatement = fmt.Sprintf(` |
| INSERT INTO search_documents ( |
| package_path, |
| package_path_id, |
| version, |
| module_path, |
| module_path_id, |
| unit_id, |
| name, |
| synopsis, |
| license_types, |
| redistributable, |
| version_updated_at, |
| commit_time, |
| has_go_mod, |
| -- TODO(https://golang.org/issue/44142): The path_tokens column is used |
| -- to easily iterate on tsv_path_tokens, and can be removed once |
| -- symbol search implementation is done. |
| path_tokens, |
| tsv_path_tokens, |
| tsv_search_tokens, |
| hll_register, |
| hll_leading_zeros |
| ) |
| SELECT |
| p1.path, |
| p1.id, |
| m.version, |
| m.module_path, |
| p2.id, |
| u.id AS unit_id, |
| u.name, |
| d.synopsis, |
| u.license_types, |
| u.redistributable, |
| CURRENT_TIMESTAMP, |
| m.commit_time, |
| m.has_go_mod, |
| $4, |
| SETWEIGHT(TO_TSVECTOR('%s', replace($4, '_', '-')), 'A'), |
| ( |
| SETWEIGHT(TO_TSVECTOR('path_tokens', $4), 'A') || |
| SETWEIGHT(TO_TSVECTOR($5), 'B') || |
| SETWEIGHT(TO_TSVECTOR($6), 'C') || |
| SETWEIGHT(TO_TSVECTOR($7), 'D') |
| ), |
| hll_hash(p1.path) & (%d - 1), |
| hll_zeros(hll_hash(p1.path)) |
| FROM units u |
| INNER JOIN modules m ON u.module_id = m.id |
| INNER JOIN paths p1 ON p1.id = u.path_id |
| LEFT JOIN paths p2 ON p2.path = m.module_path |
| LEFT JOIN documentation d ON u.id = d.unit_id |
| WHERE |
| p1.path = $1 |
| AND m.module_path = $2 |
| AND m.version = $3 |
| LIMIT 1 -- could be multiple build contexts |
| ON CONFLICT (package_path_id) |
| DO UPDATE SET |
| package_path=excluded.package_path, |
| version=excluded.version, |
| module_path=excluded.module_path, |
| unit_id=excluded.unit_id, |
| name=excluded.name, |
| synopsis=excluded.synopsis, |
| license_types=excluded.license_types, |
| redistributable=excluded.redistributable, |
| commit_time=excluded.commit_time, |
| has_go_mod=excluded.has_go_mod, |
| path_tokens=excluded.path_tokens, |
| tsv_path_tokens=excluded.tsv_path_tokens, |
| tsv_search_tokens=excluded.tsv_search_tokens, |
| -- the hll fields are functions of path, so they don't change |
| version_updated_at=( |
| CASE WHEN excluded.version = search_documents.version |
| THEN search_documents.version_updated_at |
| ELSE CURRENT_TIMESTAMP |
| END) |
| ;`, |
| symbolsearch.SymbolTextSearchConfiguration, |
| hllRegisterCount) |
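
// The statement's positional parameters are supplied by UpsertSearchDocument
// below: $1 is the package path, $2 the module path, $3 the version, $4 the
// space-separated path tokens, and $5-$7 the B, C and D sections of the
// search document.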
| |
| // upsertSearchDocuments adds search information for mod to the search_documents table. |
| // It assumes that all non-redistributable data has been removed from mod. |
| func upsertSearchDocuments(ctx context.Context, ddb *database.DB, mod *internal.Module) (err error) { |
| defer derrors.WrapStack(&err, "upsertSearchDocuments(ctx, %q, %q)", mod.ModulePath, mod.Version) |
| ctx, span := trace.StartSpan(ctx, "UpsertSearchDocuments") |
| defer span.End() |
| |
| // Don't upsert a package if it is already present under a longer module |
| // path. We need this because search_documents can have only one row per |
| // import path, and we, like the go tool, prefer the package with the longer |
| // module path. For example, if two packages have import path "a/b/c", one |
| // in module "a" and the other in "a/b", we keep only the latter. |
| lps, err := ddb.CollectStrings(ctx, ` |
| SELECT package_path |
| FROM search_documents |
| WHERE module_path LIKE $1 || '/%' |
| `, mod.ModulePath) |
| if err != nil { |
| return err |
| } |
| longerPackages := map[string]bool{} |
| for _, lp := range lps { |
| longerPackages[lp] = true |
| } |
| |
| for _, pkg := range mod.Packages() { |
| if isInternalPackage(pkg.Path) { |
| continue |
| } |
| if longerPackages[pkg.Path] { |
| continue |
| } |
| args := UpsertSearchDocumentArgs{ |
| PackagePath: pkg.Path, |
| ModulePath: mod.ModulePath, |
| Version: mod.Version, |
| } |
| if len(pkg.Documentation) > 0 { |
| // Use the synopsis of the first GOOS/GOARCH pair. |
| args.Synopsis = pkg.Documentation[0].Synopsis |
| } |
| if pkg.Readme != nil { |
| args.ReadmeFilePath = pkg.Readme.Filepath |
| args.ReadmeContents = pkg.Readme.Contents |
| } |
| if err := UpsertSearchDocument(ctx, ddb, args); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
// UpsertSearchDocumentArgs holds the arguments to UpsertSearchDocument.
type UpsertSearchDocumentArgs struct {
| PackagePath string |
| ModulePath string |
| Version string |
| Synopsis string |
| ReadmeFilePath string |
| ReadmeContents string |
| } |
| |
| // UpsertSearchDocument inserts a row in search_documents for the given package. |
| // The given module should have already been validated via a call to |
| // validateModule. |
| func UpsertSearchDocument(ctx context.Context, ddb *database.DB, args UpsertSearchDocumentArgs) (err error) { |
	defer derrors.WrapStack(&err, "UpsertSearchDocument(ctx, ddb, %q, %q)", args.PackagePath, args.ModulePath)
| |
| // Only summarize the README if the package and module have the same path. |
| // If this changes, fix DB.ReconcileSearch. |
| if args.PackagePath != args.ModulePath { |
| args.ReadmeFilePath = "" |
| args.ReadmeContents = "" |
| } |
| pathTokens := strings.Join(GeneratePathTokens(args.PackagePath), " ") |
| sectionB, sectionC, sectionD := SearchDocumentSections(args.Synopsis, args.ReadmeFilePath, args.ReadmeContents) |
| _, err = ddb.Exec(ctx, upsertSearchStatement, args.PackagePath, args.ModulePath, args.Version, pathTokens, sectionB, sectionC, sectionD) |
| return err |
| } |
| |
| // GetPackagesForSearchDocumentUpsert fetches search information for packages in search_documents |
| // whose update time is before the given time. |
| func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, before time.Time, limit int) (argsList []UpsertSearchDocumentArgs, err error) { |
| defer derrors.WrapStack(&err, "GetPackagesForSearchDocumentUpsert(ctx, %s, %d)", before, limit) |
| |
| query := ` |
| SELECT |
| sd.package_path, |
| sd.module_path, |
| sd.version, |
| sd.synopsis, |
| sd.redistributable, |
| r.file_path, |
| r.contents |
| FROM modules m |
| INNER JOIN units u |
| ON m.id = u.module_id |
| INNER JOIN paths p |
| ON p.id = u.path_id |
| LEFT JOIN readmes r |
| ON u.id = r.unit_id |
| INNER JOIN search_documents sd |
| ON sd.package_path = p.path |
| AND sd.module_path = m.module_path |
| AND sd.version = m.version |
| WHERE sd.updated_at < $1 |
| LIMIT $2` |
| |
| collect := func(rows *sql.Rows) error { |
| var ( |
| a UpsertSearchDocumentArgs |
| redist bool |
| ) |
| if err := rows.Scan(&a.PackagePath, &a.ModulePath, &a.Version, &a.Synopsis, &redist, |
| database.NullIsEmpty(&a.ReadmeFilePath), database.NullIsEmpty(&a.ReadmeContents)); err != nil { |
| return err |
| } |
| if !redist && !db.bypassLicenseCheck { |
| a.Synopsis = "" |
| a.ReadmeFilePath = "" |
| a.ReadmeContents = "" |
| } |
| argsList = append(argsList, a) |
| return nil |
| } |
| if err := db.db.RunQuery(ctx, query, collect, before, limit); err != nil { |
| return nil, err |
| } |
| return argsList, nil |
| } |
| |
| // UpdateSearchDocumentsImportedByCount updates imported_by_count and |
| // imported_by_count_updated_at. |
| // |
| // It does so by completely recalculating the imported-by counts |
| // from the imports_unique table. |
| // |
| // UpdateSearchDocumentsImportedByCount returns the number of rows updated. |
| func (db *DB) UpdateSearchDocumentsImportedByCount(ctx context.Context) (nUpdated int64, err error) { |
| defer derrors.WrapStack(&err, "UpdateSearchDocumentsImportedByCount(ctx)") |
| |
| curCounts, err := db.getSearchPackages(ctx) |
| if err != nil { |
| return 0, err |
| } |
| newCounts, err := db.computeImportedByCounts(ctx, curCounts) |
| if err != nil { |
| return 0, err |
| } |
| // Include only changed counts for packages that are in search_documents. |
| changedCounts := map[string]int{} |
| for p, nc := range newCounts { |
| cc, present := curCounts[p] |
| if present && cc != nc { |
| changedCounts[p] = nc |
| } |
| } |
| pct := 0 |
| if len(curCounts) > 0 { |
| pct = len(changedCounts) * 100 / len(curCounts) |
| } |
| log.Debugf(ctx, "update-imported-by-counts: %d changed (%d%%)", len(changedCounts), pct) |
| return db.UpdateSearchDocumentsImportedByCountWithCounts(ctx, changedCounts) |
| } |
| |
// countBatchSize is the number of imported-by counts to update at a time.
// It is a variable so that tests can change it.
var countBatchSize = 20_000
| |
// UpdateSearchDocumentsImportedByCountWithCounts updates the imported_by_count
// column of search_documents for the given package paths, in batches of
// countBatchSize. It returns the number of rows updated.
func (db *DB) UpdateSearchDocumentsImportedByCountWithCounts(ctx context.Context, counts map[string]int) (nUpdated int64, err error) {
| defer derrors.WrapStack(&err, "UpdateSearchDocumentsImportedByCountWithCounts") |
| for len(counts) > 0 { |
| var nu int64 |
| err := db.db.Transact(ctx, sql.LevelDefault, func(tx *database.DB) error { |
| if err := insertImportedByCounts(ctx, tx, counts, countBatchSize); err != nil { |
| return err |
| } |
| nu, err = updateImportedByCounts(ctx, tx) |
| return err |
| }) |
| if err != nil { |
| return nUpdated, err |
| } |
| nUpdated += nu |
| } |
| return nUpdated, nil |
| } |
| |
| // getSearchPackages returns the set of package paths that are in the search_documents table, |
| // along with their current imported-by count. |
| func (db *DB) getSearchPackages(ctx context.Context) (counts map[string]int, err error) { |
| defer derrors.WrapStack(&err, "DB.getSearchPackages(ctx)") |
| |
| counts = map[string]int{} |
| err = db.db.RunQuery(ctx, ` |
| SELECT package_path, imported_by_count |
| FROM search_documents |
| `, func(rows *sql.Rows) error { |
| var ( |
| p string |
| c int |
| ) |
| if err := rows.Scan(&p, &c); err != nil { |
| return err |
| } |
| counts[p] = c |
| return nil |
| }) |
| if err != nil { |
| return nil, err |
| } |
| return counts, nil |
| } |
| |
| func (db *DB) computeImportedByCounts(ctx context.Context, curCounts map[string]int) (newCounts map[string]int, err error) { |
| defer derrors.WrapStack(&err, "db.computeImportedByCounts(ctx)") |
| |
| newCounts = map[string]int{} |
| // Get all (from_path, to_path) pairs, deduped. |
| // Also get the from_path's module path. |
| err = db.db.RunQuery(ctx, ` |
| SELECT DISTINCT from_path, from_module_path, to_path |
| FROM imports_unique |
| `, func(rows *sql.Rows) error { |
| var from, fromMod, to string |
| if err := rows.Scan(&from, &fromMod, &to); err != nil { |
| return err |
| } |
| // Don't count an importer if it's not in search_documents. |
| if _, ok := curCounts[from]; !ok { |
| return nil |
| } |
| // Don't count an importer if it's in the same module as what it's importing. |
| // Approximate that check by seeing if from_module_path is a prefix of to_path. |
| // (In some cases, e.g. when to_path is in a nested module, that is not correct.) |
| if (fromMod == stdlib.ModulePath && stdlib.Contains(to)) || strings.HasPrefix(to+"/", fromMod+"/") { |
| return nil |
| } |
| newCounts[to]++ |
| return nil |
| }) |
| if err != nil { |
| return nil, err |
| } |
| return newCounts, nil |
| } |
| |
| // insertImportedByCounts creates a temporary table and inserts at most limit |
| // rows into it, where each row is a key and value from the counts map. The |
| // inserted keys are deleted from counts. |
| func insertImportedByCounts(ctx context.Context, db *database.DB, counts map[string]int, limit int) (err error) { |
| defer derrors.WrapStack(&err, "insertImportedByCounts(ctx, db, counts)") |
| |
| const createTableQuery = ` |
| CREATE TEMPORARY TABLE computed_imported_by_counts ( |
| package_path TEXT NOT NULL, |
| imported_by_count INTEGER NOT NULL |
| ) ON COMMIT DROP; |
| ` |
| if _, err := db.Exec(ctx, createTableQuery); err != nil { |
| return fmt.Errorf("CREATE TABLE: %v", err) |
| } |
| var values []interface{} |
| i := 0 |
| for p, c := range counts { |
| if i >= limit { |
| break |
| } |
| values = append(values, p, c) |
| delete(counts, p) |
| i++ |
| } |
| columns := []string{"package_path", "imported_by_count"} |
| return db.BulkInsert(ctx, "computed_imported_by_counts", columns, values, "") |
| } |
| |
| // updateImportedByCounts updates the imported_by_count column in search_documents |
| // for every package in computed_imported_by_counts. |
| // |
| // Rows that don't change aren't updated. |
| // |
| // Note that if a package is never imported, its imported_by_count column will |
| // be the default (0) and its imported_by_count_updated_at column will never be set. |
| func updateImportedByCounts(ctx context.Context, db *database.DB) (int64, error) { |
| // Lock the entire table to avoid deadlock. Without the lock, the update can |
| // fail because module inserts are concurrently modifying rows of |
| // search_documents. |
| // See https://www.postgresql.org/docs/11/explicit-locking.html for what locks mean. |
| // See https://www.postgresql.org/docs/11/sql-lock.html for the LOCK |
| // statement, notably the paragraph beginning "If a transaction of this sort |
| // is going to change the data...". |
| const updateStmt = ` |
| LOCK TABLE search_documents IN SHARE ROW EXCLUSIVE MODE; |
| UPDATE search_documents s |
| SET |
| imported_by_count = c.imported_by_count, |
| imported_by_count_updated_at = CURRENT_TIMESTAMP |
| FROM computed_imported_by_counts c |
| WHERE s.package_path = c.package_path;` |
| |
| n, err := db.Exec(ctx, updateStmt) |
| if err != nil { |
| return 0, fmt.Errorf("error updating imported_by_count and imported_by_count_updated_at for search documents: %v", err) |
| } |
| return n, nil |
| } |
| |
| var ( |
| commonHostnames = map[string]bool{ |
| "bitbucket.org": true, |
| "code.cloudfoundry.org": true, |
| "gitea.com": true, |
| "gitee.com": true, |
| "github.com": true, |
| "gitlab.com": true, |
| "go.etcd.io": true, |
| "go.googlesource.com": true, |
| "golang.org": true, |
| "google.golang.org": true, |
| "gopkg.in": true, |
| } |
| commonHostParts = map[string]bool{ |
| "code": true, |
| "git": true, |
| "gitlab": true, |
| "go": true, |
| "google": true, |
| "www": true, |
| } |
| ) |
| |
// GeneratePathTokens returns the subPaths and path token parts that will be
// indexed for search, which include (1) the packagePath, (2) all sub-paths of
// the packagePath, (3) all parts of a path element that is delimited by a
// dash, and (4) all parts of a path element that is delimited by a dot, except
// for the last element.
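//
// For example, GeneratePathTokens("github.com/foo/go-cloud") yields
// "cloud", "foo", "foo/go-cloud", "github.com/foo",
// "github.com/foo/go-cloud", "go" and "go-cloud", but not "github.com"
// alone, since it is a common hostname.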
| func GeneratePathTokens(packagePath string) []string { |
| packagePath = strings.Trim(packagePath, "/") |
| |
| subPathSet := make(map[string]bool) |
| parts := strings.Split(packagePath, "/") |
| for i, part := range parts { |
| dashParts := strings.Split(part, "-") |
| if len(dashParts) > 1 { |
| for _, p := range dashParts { |
| subPathSet[p] = true |
| } |
| } |
| for j := i + 2; j <= len(parts); j++ { |
| p := strings.Join(parts[i:j], "/") |
| p = strings.Trim(p, "/") |
| subPathSet[p] = true |
| } |
| |
| if i == 0 && commonHostnames[part] { |
| continue |
| } |
| // Only index host names if they are not part of commonHostnames. |
| subPathSet[part] = true |
| dotParts := strings.Split(part, ".") |
| if len(dotParts) > 1 { |
| for _, p := range dotParts[:len(dotParts)-1] { |
| if !commonHostParts[p] { |
| // If the host is not in commonHostnames, we want to |
| // index each element up to the extension. For example, |
| // if the host is sigs.k8s.io, we want to index sigs |
| // and k8s. Skip common host parts. |
| subPathSet[p] = true |
| } |
| } |
| } |
| } |
| |
| var subPaths []string |
| for sp := range subPathSet { |
| if len(sp) > 0 { |
| subPaths = append(subPaths, sp) |
| } |
| } |
| sort.Strings(subPaths) |
| return subPaths |
| } |
| |
| // isInternalPackage reports whether the path represents an internal directory. |
| func isInternalPackage(path string) bool { |
| for _, p := range strings.Split(path, "/") { |
| if p == "internal" { |
| return true |
| } |
| } |
| return false |
| } |
| |
// UpsertSearchDocumentWithImportedByCount is the same as UpsertSearchDocument,
// except it also updates the imported-by count. It is only used for testing.
func (db *DB) UpsertSearchDocumentWithImportedByCount(ctx context.Context, args UpsertSearchDocumentArgs, importedByCount int) (err error) {
	defer derrors.WrapStack(&err, "DB.UpsertSearchDocumentWithImportedByCount(ctx, %q, %q)", args.PackagePath, args.ModulePath)
| |
| if err := UpsertSearchDocument(ctx, db.db, args); err != nil { |
| return err |
| } |
| _, err = db.db.Exec(ctx, |
| `UPDATE search_documents SET imported_by_count=$1 WHERE package_path=$2;`, |
| importedByCount, args.PackagePath) |
| return err |
| } |