blob: 9c1c616638bf8d0f8b719dc62a520f84d6bac89f [file] [log] [blame]
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package fetch provides a way to fetch modules from a proxy.
package fetch
import (
"archive/zip"
"context"
"errors"
"fmt"
"net/http"
"path"
"sort"
"strconv"
"strings"
"sync"
"time"
"go.opencensus.io/plugin/ochttp"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
"golang.org/x/mod/modfile"
"golang.org/x/pkgsite/internal"
"golang.org/x/pkgsite/internal/dcensus"
"golang.org/x/pkgsite/internal/derrors"
"golang.org/x/pkgsite/internal/licenses"
"golang.org/x/pkgsite/internal/log"
"golang.org/x/pkgsite/internal/proxy"
"golang.org/x/pkgsite/internal/source"
"golang.org/x/pkgsite/internal/stdlib"
)
var (
ErrModuleContainsNoPackages = errors.New("module contains 0 packages")
errMalformedZip = errors.New("module zip is malformed")
)
var (
fetchLatency = stats.Float64(
"go-discovery/worker/fetch-latency",
"Latency of a fetch request.",
stats.UnitSeconds,
)
fetchesShedded = stats.Int64(
"go-discovery/worker/fetch-shedded",
"Count of shedded fetches.",
stats.UnitDimensionless,
)
fetchedPackages = stats.Int64(
"go-discovery/worker/fetch-package-count",
"Count of successfully fetched packages.",
stats.UnitDimensionless,
)
// FetchLatencyDistribution aggregates frontend fetch request
// latency by status code. It does not count shedded requests.
FetchLatencyDistribution = &view.View{
Name: "go-discovery/worker/fetch-latency",
Measure: fetchLatency,
Aggregation: ochttp.DefaultLatencyDistribution,
Description: "Fetch latency by result status.",
TagKeys: []tag.Key{dcensus.KeyStatus},
}
// FetchResponseCount counts fetch responses by status.
FetchResponseCount = &view.View{
Name: "go-discovery/worker/fetch-count",
Measure: fetchLatency,
Aggregation: view.Count(),
Description: "Fetch request count by result status",
TagKeys: []tag.Key{dcensus.KeyStatus},
}
// FetchPackageCount counts how many packages were successfully fetched.
FetchPackageCount = &view.View{
Name: "go-discovery/worker/fetch-package-count",
Measure: fetchedPackages,
Aggregation: view.Count(),
Description: "Count of packages successfully fetched",
}
// SheddedFetchCount counts the number of fetches that were shedded.
SheddedFetchCount = &view.View{
Name: "go-discovery/worker/fetch-shedded",
Measure: fetchesShedded,
Aggregation: view.Count(),
Description: "Count of shedded fetches",
}
)
type FetchResult struct {
ModulePath string
RequestedVersion string
ResolvedVersion string
MainVersion string
MasterVersion string
// HasGoMod says whether the zip contain a go.mod file. If Module (below) is non-nil, then
// Module.HasGoMod will be the same value. But HasGoMod will be populated even if Module is nil
// because there were problems with it, as long as we can download and read the zip.
HasGoMod bool
GoModPath string
Status int
Error error
Defer func() // caller must defer this on all code paths
Module *internal.Module
PackageVersionStates []*internal.PackageVersionState
}
// FetchModule queries the proxy or the Go repo for the requested module
// version, downloads the module zip, and processes the contents to return an
// *internal.Module and related information.
//
// Even if err is non-nil, the result may contain useful information, like the go.mod path.
//
// Callers of FetchModule must
// defer fr.Defer()
// immediately after the call.
func FetchModule(ctx context.Context, modulePath, requestedVersion string, proxyClient *proxy.Client, sourceClient *source.Client) (fr *FetchResult) {
start := time.Now()
defer func() {
latency := float64(time.Since(start).Seconds())
dcensus.RecordWithTag(ctx, dcensus.KeyStatus, strconv.Itoa(fr.Status), fetchLatency.M(latency))
if fr.Status < 300 {
stats.Record(ctx, fetchedPackages.M(int64(len(fr.PackageVersionStates))))
}
}()
fr = &FetchResult{
ModulePath: modulePath,
RequestedVersion: requestedVersion,
Defer: func() {},
}
defer derrors.Wrap(&fr.Error, "FetchModule(%q, %q)", modulePath, requestedVersion)
fi, err := fetchModule(ctx, fr, proxyClient, sourceClient)
fr.Error = err
if err != nil {
fr.Status = derrors.ToStatus(fr.Error)
}
if fr.Status == 0 {
fr.Status = http.StatusOK
}
if fi != nil {
finishFetchInfo(fi, fr.Status, fr.Error)
}
return fr
}
func fetchModule(ctx context.Context, fr *FetchResult, proxyClient *proxy.Client, sourceClient *source.Client) (*FetchInfo, error) {
info, err := GetInfo(ctx, fr.ModulePath, fr.RequestedVersion, proxyClient)
if err != nil {
return nil, err
}
fr.ResolvedVersion = info.Version
commitTime := info.Time
var zipSize int64
if zipLoadShedder != nil {
var err error
zipSize, err = getZipSize(ctx, fr.ModulePath, fr.ResolvedVersion, proxyClient)
if err != nil {
return nil, err
}
// Load shed or mark module as too large.
// We treat zip size as a proxy for the total memory consumed by
// processing a module, and use it to decide whether we can currently
// afford to process a module.
shouldShed, deferFunc := zipLoadShedder.shouldShed(uint64(zipSize))
fr.Defer = deferFunc
if shouldShed {
stats.Record(ctx, fetchesShedded.M(1))
return nil, fmt.Errorf("%w: size=%dMi", derrors.SheddingLoad, zipSize/mib)
}
if zipSize > maxModuleZipSize {
log.Warningf(ctx, "FetchModule: %s@%s zip size %dMi exceeds max %dMi",
fr.ModulePath, fr.ResolvedVersion, zipSize/mib, maxModuleZipSize/mib)
return nil, derrors.ModuleTooLarge
}
}
// Proceed with the fetch.
fi := &FetchInfo{
ModulePath: fr.ModulePath,
Version: fr.ResolvedVersion,
ZipSize: uint64(zipSize),
Start: time.Now(),
}
startFetchInfo(fi)
var zipReader *zip.Reader
if fr.ModulePath == stdlib.ModulePath {
var resolvedVersion string
zipReader, resolvedVersion, commitTime, err = stdlib.Zip(fr.RequestedVersion)
if err != nil {
return fi, err
}
// If the requested version is a branch name like "master" or "main", we cannot
// determine the right resolved version until we start working with the repo.
fr.ResolvedVersion = resolvedVersion
fi.Version = resolvedVersion
} else {
zipReader, err = proxyClient.Zip(ctx, fr.ModulePath, fr.ResolvedVersion)
if err != nil {
return fi, err
}
}
// Set fr.HasGoMod as early as possible, because the go command uses it to
// decide the latest version in some cases (see fetchRawLatestVersion in
// this package) and all it requires is a valid zip.
if fr.ModulePath == stdlib.ModulePath {
fr.HasGoMod = true
} else {
fr.HasGoMod = hasGoModFile(zipReader, fr.ModulePath, fr.ResolvedVersion)
}
// getGoModPath may return a non-empty goModPath even if the error is
// non-nil, if the module version is an alternative module.
var goModBytes []byte
fr.GoModPath, goModBytes, err = getGoModPath(ctx, fr.ModulePath, fr.ResolvedVersion, proxyClient)
if err != nil {
return fi, err
}
// If there is no go.mod file in the zip, try another way to detect
// alternative modules: compare the zip signature to a list of known ones to
// see if this is a fork. The intent is to avoid processing certain known
// large modules, not to find every fork.
if !fr.HasGoMod {
forkedModule, err := forkedFrom(zipReader, fr.ModulePath, fr.ResolvedVersion)
if err != nil {
return fi, err
}
if forkedModule != "" {
return fi, fmt.Errorf("forked from %s: %w", forkedModule, derrors.AlternativeModule)
}
}
mod, pvs, err := processZipFile(ctx, fr.ModulePath, fr.ResolvedVersion, fr.RequestedVersion,
commitTime, zipReader, sourceClient)
if err != nil {
return fi, err
}
mod.HasGoMod = fr.HasGoMod
if goModBytes != nil {
if err := processGoModFile(goModBytes, mod); err != nil {
return fi, fmt.Errorf("%v: %w", err.Error(), derrors.BadModule)
}
}
fr.Module = mod
fr.PackageVersionStates = pvs
for _, state := range fr.PackageVersionStates {
if state.Status != http.StatusOK {
fr.Status = derrors.ToStatus(derrors.HasIncompletePackages)
}
}
return fi, nil
}
// GetInfo returns the result of a request to the proxy .info endpoint. If
// the modulePath is "std", a request to @master will return an empty
// commit time.
func GetInfo(ctx context.Context, modulePath, requestedVersion string, proxyClient *proxy.Client) (_ *proxy.VersionInfo, err error) {
if modulePath == stdlib.ModulePath {
var resolvedVersion string
resolvedVersion, err = stdlib.ZipInfo(requestedVersion)
if err != nil {
return nil, err
}
return &proxy.VersionInfo{Version: resolvedVersion}, nil
}
return proxyClient.Info(ctx, modulePath, requestedVersion)
}
func getZipSize(ctx context.Context, modulePath, resolvedVersion string, proxyClient *proxy.Client) (_ int64, err error) {
if modulePath == stdlib.ModulePath {
return stdlib.EstimatedZipSize, nil
}
return proxyClient.ZipSize(ctx, modulePath, resolvedVersion)
}
// getGoModPath returns the module path from the go.mod file, as well as the contents of the file obtained from the proxy.
// If modulePath is the standard library, then the contents will be nil.
func getGoModPath(ctx context.Context, modulePath, resolvedVersion string, proxyClient *proxy.Client) (string, []byte, error) {
if modulePath == stdlib.ModulePath {
return stdlib.ModulePath, nil, nil
}
goModBytes, err := proxyClient.Mod(ctx, modulePath, resolvedVersion)
if err != nil {
return "", nil, err
}
goModPath := modfile.ModulePath(goModBytes)
if goModPath == "" {
return "", nil, fmt.Errorf("go.mod has no module path: %w", derrors.BadModule)
}
if goModPath != modulePath {
// The module path in the go.mod file doesn't match the path of the
// zip file. Don't insert the module. Store an AlternativeModule
// status in module_version_states.
return goModPath, goModBytes, fmt.Errorf("module path=%s, go.mod path=%s: %w", modulePath, goModPath, derrors.AlternativeModule)
}
return goModPath, goModBytes, nil
}
// processZipFile extracts information from the module version zip.
func processZipFile(ctx context.Context, modulePath, resolvedVersion, requestedVersion string,
commitTime time.Time, zipReader *zip.Reader, sourceClient *source.Client) (_ *internal.Module, _ []*internal.PackageVersionState, err error) {
defer derrors.Wrap(&err, "processZipFile(%q, %q)", modulePath, resolvedVersion)
ctx, span := trace.StartSpan(ctx, "fetch.processZipFile")
defer span.End()
v := resolvedVersion
if modulePath == stdlib.ModulePath && stdlib.SupportedBranches[requestedVersion] {
v = requestedVersion
}
sourceInfo, err := source.ModuleInfo(ctx, sourceClient, modulePath, v)
if err != nil {
log.Infof(ctx, "error getting source info: %v", err)
}
readmes, err := extractReadmesFromZip(modulePath, resolvedVersion, zipReader)
if err != nil {
return nil, nil, fmt.Errorf("extractReadmesFromZip(%q, %q, zipReader): %v", modulePath, resolvedVersion, err)
}
logf := func(format string, args ...interface{}) {
log.Infof(ctx, format, args...)
}
d := licenses.NewDetector(modulePath, v, zipReader, logf)
allLicenses := d.AllLicenses()
packages, packageVersionStates, err := extractPackagesFromZip(ctx, modulePath, resolvedVersion, requestedVersion, zipReader, d, sourceInfo)
if errors.Is(err, ErrModuleContainsNoPackages) || errors.Is(err, errMalformedZip) {
return nil, nil, fmt.Errorf("%v: %w", err.Error(), derrors.BadModule)
}
if err != nil {
return nil, nil, fmt.Errorf("extractPackagesFromZip(%q, %q, zipReader, %v): %v", modulePath, resolvedVersion, allLicenses, err)
}
return &internal.Module{
ModuleInfo: internal.ModuleInfo{
ModulePath: modulePath,
Version: resolvedVersion,
CommitTime: commitTime,
IsRedistributable: d.ModuleIsRedistributable(),
SourceInfo: sourceInfo,
// HasGoMod is populated by the caller.
},
Licenses: allLicenses,
Units: moduleUnits(modulePath, resolvedVersion, packages, readmes, d),
}, packageVersionStates, nil
}
func hasGoModFile(zr *zip.Reader, m, v string) bool {
return zipFile(zr, path.Join(moduleVersionDir(m, v), "go.mod")) != nil
}
// processGoModFile populates mod with information extracted from the contents of the go.mod file.
func processGoModFile(goModBytes []byte, mod *internal.Module) (err error) {
defer derrors.Wrap(&err, "processGoModFile")
mf, err := modfile.Parse("go.mod", goModBytes, nil)
if err != nil {
return err
}
mod.Deprecated, mod.DeprecationComment = extractDeprecatedComment(mf)
return nil
}
// extractDeprecatedComment looks for "Deprecated" comments in the line comments
// before the module declaration. If it finds one, it returns true along with
// the text after "Deprecated:". Otherwise it returns false, "".
func extractDeprecatedComment(mf *modfile.File) (bool, string) {
const prefix = "Deprecated:"
if mf.Module == nil {
return false, ""
}
for _, comment := range append(mf.Module.Syntax.Before, mf.Module.Syntax.Suffix...) {
text := strings.TrimSpace(strings.TrimPrefix(comment.Token, "//"))
if strings.HasPrefix(text, prefix) {
return true, strings.TrimSpace(text[len(prefix):])
}
}
return false, ""
}
// moduleVersionDir formats the content subdirectory for the given
// modulePath and version.
func moduleVersionDir(modulePath, version string) string {
return fmt.Sprintf("%s@%s", modulePath, version)
}
// zipFile returns the file in r whose name matches the given name, or nil
// if there isn't one.
func zipFile(r *zip.Reader, name string) *zip.File {
for _, f := range r.File {
if f.Name == name {
return f
}
}
return nil
}
type FetchInfo struct {
ModulePath string
Version string
ZipSize uint64
Start time.Time
Finish time.Time
Status int
Error error
}
var (
fetchInfoMu sync.Mutex
fetchInfoMap = map[*FetchInfo]struct{}{}
)
func init() {
const linger = time.Minute
go func() {
for {
now := time.Now()
fetchInfoMu.Lock()
for fi := range fetchInfoMap {
if !fi.Finish.IsZero() && now.Sub(fi.Finish) > linger {
delete(fetchInfoMap, fi)
}
}
fetchInfoMu.Unlock()
time.Sleep(linger)
}
}()
}
func startFetchInfo(fi *FetchInfo) {
fetchInfoMu.Lock()
defer fetchInfoMu.Unlock()
fetchInfoMap[fi] = struct{}{}
}
func finishFetchInfo(fi *FetchInfo, status int, err error) {
fetchInfoMu.Lock()
defer fetchInfoMu.Unlock()
fi.Finish = time.Now()
fi.Status = status
fi.Error = err
}
// FetchInfos returns information about all fetches in progress,
// sorted by start time.
func FetchInfos() []*FetchInfo {
var fis []*FetchInfo
fetchInfoMu.Lock()
for fi := range fetchInfoMap {
// Copy to avoid races on Status and Error when read by
// worker home page.
cfi := *fi
fis = append(fis, &cfi)
}
fetchInfoMu.Unlock()
// Order first by done-ness, then by age.
sort.Slice(fis, func(i, j int) bool {
if (fis[i].Status == 0) == (fis[j].Status == 0) {
return fis[i].Start.Before(fis[j].Start)
}
return fis[i].Status == 0
})
return fis
}