blob: 55dcfb5f9e2f4dee86396206fda82fb2de0cfb35 [file] [log] [blame]
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package worker
import (
"context"
"errors"
"fmt"
"net/http"
"sort"
"strconv"
"strings"
"sync"
"time"
"unicode/utf8"
"go.opencensus.io/plugin/ochttp"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
"golang.org/x/mod/semver"
"golang.org/x/pkgsite/internal"
"golang.org/x/pkgsite/internal/cache"
"golang.org/x/pkgsite/internal/dcensus"
"golang.org/x/pkgsite/internal/derrors"
"golang.org/x/pkgsite/internal/experiment"
"golang.org/x/pkgsite/internal/fetch"
"golang.org/x/pkgsite/internal/log"
"golang.org/x/pkgsite/internal/log/stackdriverlogger"
"golang.org/x/pkgsite/internal/postgres"
"golang.org/x/pkgsite/internal/proxy"
"golang.org/x/pkgsite/internal/source"
"golang.org/x/pkgsite/internal/stdlib"
)
var (
fetchesShedded = stats.Int64(
"go-discovery/worker/fetch-shedded",
"Count of shedded fetches.",
stats.UnitDimensionless,
)
fetchLatency = stats.Float64(
"go-discovery/worker/fetch-latency",
"Latency of a fetch request.",
stats.UnitSeconds,
)
fetchedPackages = stats.Int64(
"go-discovery/worker/fetch-package-count",
"Count of successfully fetched packages.",
stats.UnitDimensionless,
)
// FetchLatencyDistribution aggregates frontend fetch request
// latency by status code. It does not count shedded requests.
FetchLatencyDistribution = &view.View{
Name: "go-discovery/worker/fetch-latency",
Measure: fetchLatency,
Aggregation: ochttp.DefaultLatencyDistribution,
Description: "Fetch latency by result status.",
TagKeys: []tag.Key{dcensus.KeyStatus},
}
// FetchResponseCount counts fetch responses by status.
FetchResponseCount = &view.View{
Name: "go-discovery/worker/fetch-count",
Measure: fetchLatency,
Aggregation: view.Count(),
Description: "Fetch request count by result status",
TagKeys: []tag.Key{dcensus.KeyStatus},
}
// FetchPackageCount counts how many packages were successfully fetched.
FetchPackageCount = &view.View{
Name: "go-discovery/worker/fetch-package-count",
Measure: fetchedPackages,
Aggregation: view.Count(),
Description: "Count of packages successfully fetched",
}
// SheddedFetchCount counts the number of fetches that were shedded.
SheddedFetchCount = &view.View{
Name: "go-discovery/worker/fetch-shedded",
Measure: fetchesShedded,
Aggregation: view.Count(),
Description: "Count of shedded fetches",
}
)
// fetchTask represents the result of a fetch task that was processed.
type fetchTask struct {
fetch.FetchResult
MainVersion string
MasterVersion string
timings map[string]time.Duration
}
// A Fetcher holds state for fetching modules.
type Fetcher struct {
ProxyClient *proxy.Client
SourceClient *source.Client
DB *postgres.DB
Cache *cache.Cache
loadShedder *loadShedder
Source string
}
// FetchAndUpdateState fetches and processes a module version, and then updates
// the module_version_states table according to the result. It returns an HTTP
// status code representing the result of the fetch operation, and a non-nil
// error if this status code is not 200.
func (f *Fetcher) FetchAndUpdateState(ctx context.Context, modulePath, requestedVersion, appVersionLabel string) (status int, resolvedVersion string, err error) {
defer derrors.Wrap(&err, "FetchAndUpdateState(%q, %q, %q)", modulePath, requestedVersion, appVersionLabel)
tctx, span := trace.StartSpan(ctx, "FetchAndUpdateState")
ctx = experiment.NewContext(tctx, experiment.FromContext(ctx).Active()...)
ctx = stackdriverlogger.NewContextWithLabel(ctx, "fetch", modulePath+"@"+requestedVersion)
start := time.Now()
var nPackages int64
defer func() {
latency := float64(time.Since(start).Seconds())
dcensus.RecordWithTag(ctx, dcensus.KeyStatus, strconv.Itoa(status), fetchLatency.M(latency))
if status < 300 {
stats.Record(ctx, fetchedPackages.M(nPackages))
}
}()
if !utf8.ValidString(modulePath) {
log.Errorf(ctx, "module path %q is not valid UTF-8", modulePath)
}
if modulePath == internal.UnknownModulePath {
return http.StatusInternalServerError, "", errors.New("called with internal.UnknownModulePath")
}
if !utf8.ValidString(requestedVersion) {
log.Errorf(ctx, "requested version %q is not valid UTF-8", requestedVersion)
}
span.AddAttributes(
trace.StringAttribute("modulePath", modulePath),
trace.StringAttribute("version", requestedVersion))
defer span.End()
// Begin by hitting the proxy's info endpoint. We need the resolved version
// to do load-shedding, but it's also important to make the proxy aware
// of the version if it isn't already, as can happen when we arrive here via
// frontend fetch.
//
// Don't fail on a non-nil error. If we return here, we won't record
// the error state in the DB.
info, err := getInfo(ctx, modulePath, requestedVersion, f.ProxyClient)
if err == nil {
// If we're overloaded, shed load by not processing this module.
// The zip endpoint requires a resolved version.
deferFunc, zipSize, err := f.maybeShed(ctx, modulePath, info.Version)
defer deferFunc()
if err != nil {
return derrors.ToStatus(err), "", err
}
fi := &FetchInfo{
ModulePath: modulePath,
Version: requestedVersion,
ZipSize: uint64(zipSize),
Start: time.Now(),
}
startFetchInfo(fi)
defer func() { finishFetchInfo(fi, status, err) }()
// If this is a valid module, insert it into module_version_states.
//
// In case something happens later on, this will make sure we retry. Also,
// modules that are introduced to pkgsite for the first time via frontend
// fetch and not index.golang.org won't have a row in
// module_version_states, so that ensures the logic below works properly as
// well.
//
// Leave the index_timestamp as empty. This will be populated when the
// module appears in the index.
if err := f.DB.InsertNewModuleVersionFromFrontendFetch(ctx, modulePath, info.Version); err != nil {
return derrors.ToStatus(err), "", err
}
}
// Get the latest-version information first, and update the DB. We'll need
// it to determine if the current module version is the latest good one for
// its path.
lmv, err := f.FetchAndUpdateLatest(ctx, modulePath)
// The only errors possible here should be DB failures.
if err != nil {
return derrors.ToStatus(err), "", err
}
ft := f.fetchAndInsertModule(ctx, modulePath, requestedVersion, lmv)
nPackages = int64(len(ft.PackageVersionStates))
span.AddAttributes(trace.Int64Attribute("numPackages", nPackages))
// If there were any errors processing the module then we didn't insert it.
// Delete it in case we are reprocessing an existing module.
// However, don't delete if the error was internal, or we are shedding load.
if ft.Status >= 400 && ft.Status < 500 {
if err := deleteModule(ctx, f.DB, ft); err != nil {
log.Error(ctx, err)
ft.Error = err
ft.Status = http.StatusInternalServerError
}
}
// Regardless of what the status code is, insert the result into
// version_map, so that a response can be returned for frontend_fetch.
if err := updateVersionMap(ctx, f.DB, ft); err != nil {
log.Error(ctx, err)
if ft.Status != http.StatusInternalServerError {
ft.Error = err
ft.Status = http.StatusInternalServerError
}
// Do not return an error here, because we want to insert into
// module_version_states below.
}
if !semver.IsValid(ft.ResolvedVersion) {
// If the requestedVersion was not successfully resolved to a semantic
// version, then at this point it will be the same as the
// resolvedVersion. This fetch request does not need to be recorded in
// module_version_states, since that table is only used to track
// modules that have been published to index.golang.org.
return ft.Status, ft.ResolvedVersion, ft.Error
}
// Return an error here if a row does not exist in module_version_states.
// This can happen if the source is frontend fetch, since we don't insert
// rows to avoid cluttering module_version_states.
if _, err := f.DB.GetModuleVersionState(ctx, modulePath, ft.ResolvedVersion); err != nil {
if errors.Is(err, derrors.NotFound) {
return ft.Status, "", ft.Error
}
return http.StatusInternalServerError, "", err
}
// Make sure the latest version of the module is the one in search_documents
// and imports_unique.
if err := f.DB.ReconcileSearch(ctx, modulePath, ft.ResolvedVersion, ft.Status); err != nil {
log.Error(ctx, err)
if ft.Status != http.StatusInternalServerError {
ft.Error = err
ft.Status = http.StatusInternalServerError
}
// Do not return an error here, because we want to insert into
// module_version_states below.
}
// Update the module_version_states table with the new status of
// module@version. This must happen last, because if it succeeds with a
// code < 500 but a later action fails, we will never retry the later
// action.
startUpdate := time.Now()
mvs := &postgres.ModuleVersionStateForUpdate{
ModulePath: ft.ModulePath,
Version: ft.ResolvedVersion,
AppVersion: appVersionLabel,
Status: ft.Status,
HasGoMod: ft.HasGoMod,
GoModPath: ft.GoModPath,
FetchErr: ft.Error,
PackageVersionStates: ft.PackageVersionStates,
}
err = f.DB.UpdateModuleVersionState(ctx, mvs)
ft.timings["db.UpdateModuleVersionState"] = time.Since(startUpdate)
if err != nil {
log.Error(ctx, err)
ft.Status = http.StatusInternalServerError
if ft.Error != nil {
ft.Error = fmt.Errorf("db.UpdateModuleVersionState: %v, original error: %v", err, ft.Error)
} else {
ft.Error = err
}
logTaskResult(ctx, ft, "Failed to update module version state")
return http.StatusInternalServerError, ft.ResolvedVersion, ft.Error
}
logTaskResult(ctx, ft, "Updated module version state")
return ft.Status, ft.ResolvedVersion, ft.Error
}
func getInfo(ctx context.Context, modulePath, requestedVersion string, prox *proxy.Client) (_ *proxy.VersionInfo, err error) {
if modulePath == stdlib.ModulePath {
var resolvedVersion string
resolvedVersion, err = stdlib.ZipInfo(requestedVersion)
if err != nil {
return nil, err
}
return &proxy.VersionInfo{Version: resolvedVersion}, nil
}
return prox.Info(ctx, modulePath, requestedVersion)
}
// fetchAndInsertModule fetches the given module version from the module proxy
// or (in the case of the standard library) from the Go repo and writes the
// resulting data to the database.
func (f *Fetcher) fetchAndInsertModule(ctx context.Context, modulePath, requestedVersion string, lmv *internal.LatestModuleVersions) *fetchTask {
ft := &fetchTask{
FetchResult: fetch.FetchResult{
ModulePath: modulePath,
RequestedVersion: requestedVersion,
},
timings: map[string]time.Duration{},
}
defer func() {
derrors.Wrap(&ft.Error, "fetchAndInsertModule(%q, %q)", modulePath, requestedVersion)
if ft.Error != nil {
ft.Status = derrors.ToStatus(ft.Error)
ft.ResolvedVersion = requestedVersion
}
}()
if f.DB.IsExcluded(ctx, modulePath, requestedVersion) {
ft.Error = derrors.Excluded
return ft
}
moduleGetter := fetch.NewProxyModuleGetter(f.ProxyClient, f.SourceClient)
if modulePath == "std" {
moduleGetter = fetch.NewStdlibZipModuleGetter()
}
// Fetch the module, and the current @main and @master version of this module.
// The @main and @master version will be used to update the version_map
// target if applicable.
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
start := time.Now()
fr := fetch.FetchModule(ctx, modulePath, requestedVersion, moduleGetter)
if fr == nil {
panic("fetch.FetchModule should never return a nil FetchResult")
}
ft.FetchResult = *fr
ft.timings["fetch.FetchModule"] = time.Since(start)
}()
// Do not resolve the @main and @master version if proxy fetch is disabled.
var main string
wg.Add(1)
go func() {
defer wg.Done()
if !f.ProxyClient.FetchDisabled() {
main = resolvedVersion(ctx, modulePath, internal.MainVersion, moduleGetter)
}
}()
var master string
wg.Add(1)
go func() {
defer wg.Done()
if !f.ProxyClient.FetchDisabled() {
master = resolvedVersion(ctx, modulePath, internal.MasterVersion, moduleGetter)
}
}()
wg.Wait()
ft.MainVersion = main
ft.MasterVersion = master
// There was an error fetching this module.
if ft.Error != nil {
logf := log.Infof
if ft.Status >= 500 && ft.Status != derrors.ToStatus(derrors.ProxyTimedOut) {
logf = log.Warningf
}
logf(ctx, "Error executing fetch: %v (code %d)", ft.Error, ft.Status)
return ft
}
// The module was successfully fetched.
log.Debugf(ctx, "fetch.FetchModule succeeded for %s@%s", ft.ModulePath, ft.RequestedVersion)
// Determine the current latest-version information for this module.
start := time.Now()
isLatest, err := f.DB.InsertModule(ctx, ft.Module, lmv)
ft.timings["db.InsertModule"] = time.Since(start)
if err != nil {
ft.Status = derrors.ToStatus(err)
ft.Error = err
if ft.Status >= 500 {
log.Error(ctx, err)
} else {
log.Info(ctx, err)
}
return ft
}
log.Debugf(ctx, "db.InsertModule succeeded for %s@%s", ft.ModulePath, ft.RequestedVersion)
// Invalidate the cache if we just processed the latest version of a module.
if isLatest {
if err := f.invalidateCache(ctx, ft.ModulePath); err != nil {
// Failure to invalidate the cache is not that serious; at worst it means some pages will be stale.
// (Cache TTLs for details pages configured in internal/frontend/server.go must not be too long,
// to account for this possibility.)
log.Errorf(ctx, "failed to invalidate cache for %s: %v", ft.ModulePath, err)
} else {
log.Debugf(ctx, "invalidated cache for %s", ft.ModulePath)
}
}
return ft
}
// invalidateCache deletes the series path for modulePath, as well as any
// possible URL path of which it is a componentwise prefix. That is, it deletes
// example.com/mod, example.com/mod@v1.2.3 and example.com/mod/pkg, but not the
// unrelated example.com/module.
//
// We delete the series path, not the module path, because adding a v2 module
// can affect v1 pages. For example, the first v2 module will add a "higher
// major version" banner to all v1 pages. While adding a v1 version won't
// currently affect v2 pages, that could change some day (for instance, if we
// decide to provide history). So it's better to be safe and delete all paths in
// the series.
func (f *Fetcher) invalidateCache(ctx context.Context, modulePath string) error {
if f.Cache == nil {
return nil
}
var errs []error
seriesPath := internal.SeriesPathForModule(modulePath)
// All cache keys are request URLs, so they begin with "/".
if err := f.Cache.Delete(ctx, "/"+seriesPath); err != nil {
errs = append(errs, err)
}
// Delete all suffixes of the series path followed by a character that marks its end.
for _, end := range "/@?#" {
if err := f.Cache.DeletePrefix(ctx, fmt.Sprintf("/%s%c", seriesPath, end)); err != nil {
errs = append(errs, err)
}
}
if len(errs) > 0 {
return fmt.Errorf("%d errors, first is %w", len(errs), errs[0])
}
return nil
}
func resolvedVersion(ctx context.Context, modulePath, requestedVersion string, getter fetch.ModuleGetter) string {
if modulePath == stdlib.ModulePath && requestedVersion == internal.MainVersion {
return ""
}
info, err := fetch.GetInfo(ctx, modulePath, requestedVersion, getter)
if err != nil {
if !errors.Is(err, derrors.NotFound) {
// If an error occurs, log it as a warning and insert the module as normal.
log.Warningf(ctx, "fetch.GetInfo(ctx, %v, %q, getter, false): %v", modulePath, requestedVersion, err)
}
return ""
}
return info.Version
}
func updateVersionMap(ctx context.Context, db *postgres.DB, ft *fetchTask) (err error) {
start := time.Now()
defer func() {
ft.timings["worker.updatedVersionMap"] = time.Since(start)
derrors.Wrap(&err, "updateVersionMap(%q, %q, %q, %d, %v)",
ft.ModulePath, ft.RequestedVersion, ft.ResolvedVersion, ft.Status, ft.Error)
}()
ctx, span := trace.StartSpan(ctx, "worker.updateVersionMap")
defer span.End()
var errMsg string
if ft.Error != nil {
errMsg = ft.Error.Error()
}
// If the resolved version for the this module version is also the resolved
// version for @main or @master, update version_map to match.
requestedVersions := []string{ft.RequestedVersion}
if ft.MainVersion == ft.ResolvedVersion {
requestedVersions = append(requestedVersions, internal.MainVersion)
}
if ft.MasterVersion == ft.ResolvedVersion {
requestedVersions = append(requestedVersions, internal.MasterVersion)
}
for _, v := range requestedVersions {
v := v
vm := &internal.VersionMap{
ModulePath: ft.ModulePath,
RequestedVersion: v,
ResolvedVersion: ft.ResolvedVersion,
Status: ft.Status,
GoModPath: ft.GoModPath,
Error: errMsg,
}
if err := db.UpsertVersionMap(ctx, vm); err != nil {
return err
}
}
return nil
}
func deleteModule(ctx context.Context, db *postgres.DB, ft *fetchTask) (err error) {
start := time.Now()
defer func() {
ft.timings["worker.deleteModule"] = time.Since(start)
derrors.Wrap(&err, "deleteModule(%q, %q, %q, %d, %v)",
ft.ModulePath, ft.RequestedVersion, ft.ResolvedVersion, ft.Status, ft.Error)
}()
ctx, span := trace.StartSpan(ctx, "worker.deleteModule")
defer span.End()
log.Infof(ctx, "%s@%s: code=%d, deleting", ft.ModulePath, ft.ResolvedVersion, ft.Status)
if err := db.DeleteModule(ctx, ft.ModulePath, ft.ResolvedVersion); err != nil {
return err
}
// Update the latest good version for this module, because deleting this
// version may have changed it.
return db.UpdateLatestGoodVersion(ctx, ft.ModulePath)
}
func logTaskResult(ctx context.Context, ft *fetchTask, prefix string) {
var times []string
for k, v := range ft.timings {
times = append(times, fmt.Sprintf("%s=%.3fs", k, v.Seconds()))
}
sort.Strings(times)
msg := strings.Join(times, ", ")
logf := log.Infof
if ft.Status == http.StatusInternalServerError {
logf = log.Errorf
}
logf(ctx, "%s for %s@%s: code=%d, num_packages=%d, err=%v; timings: %s",
prefix, ft.ModulePath, ft.ResolvedVersion, ft.Status, len(ft.PackageVersionStates), ft.Error, msg)
}
// FetchAndUpdateLatest fetches information about the latest versions from the proxy,
// and updates the database if the version has changed.
// It returns the most recent good information, which may be what it just fetched or
// may be what is already in the DB.
// It does not update the latest good version; that happens inside InsertModule, because
// it must be protected by the module-path advisory lock.
func (f *Fetcher) FetchAndUpdateLatest(ctx context.Context, modulePath string) (_ *internal.LatestModuleVersions, err error) {
defer derrors.Wrap(&err, "FetchAndUpdateLatest(%q)", modulePath)
lmv, err := fetch.LatestModuleVersions(ctx, modulePath, f.ProxyClient, func(v string) (bool, error) {
return f.DB.HasGoMod(ctx, modulePath, v)
})
var status int
switch {
case lmv != nil:
status = 200
case err == nil:
// There may be no version information for the module, even if it exists.
// In that case, we insert a 404 into the DB.
status = 404
default:
status = derrors.ToStatus(err)
}
if status != 200 {
return nil, f.DB.UpdateLatestModuleVersionsStatus(ctx, modulePath, status)
}
return f.DB.UpdateLatestModuleVersions(ctx, lmv)
}
func (f *Fetcher) maybeShed(ctx context.Context, modulePath, version string) (func(), int64, error) {
if f.loadShedder == nil {
return func() {}, 0, nil
}
zipSize, err := getZipSize(ctx, modulePath, version, f.ProxyClient)
if err != nil {
return func() {}, 0, err
}
// Load shed or mark module as too large.
// We treat zip size as a proxy for the total memory consumed by
// processing a module, and use it to decide whether we can currently
// afford to process a module.
shouldShed, deferFunc := f.loadShedder.shouldShed(uint64(zipSize))
if shouldShed {
stats.Record(ctx, fetchesShedded.M(1))
return deferFunc, 0, fmt.Errorf("%w: size=%dMi", derrors.SheddingLoad, zipSize/mib)
}
if zipSize > maxModuleZipSize {
log.Warningf(ctx, "FetchModule: %s@%s zip size %dMi exceeds max %dMi",
modulePath, version, zipSize/mib, maxModuleZipSize/mib)
return deferFunc, 0, derrors.ModuleTooLarge
}
return deferFunc, zipSize, nil
}
func getZipSize(ctx context.Context, modulePath, resolvedVersion string, prox *proxy.Client) (_ int64, err error) {
if modulePath == stdlib.ModulePath {
return stdlib.EstimatedZipSize, nil
}
return prox.ZipSize(ctx, modulePath, resolvedVersion)
}