internal/fetch: split fetch.go into multiple files
fetch.go is split into readme.go, package.go and load.go.
This CL is movement only - there are no code changes.
Change-Id: If13826ed1d267b58a712fdbf8e65d1e225bab28b
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/256138
Trust: Julie Qiu <julie@golang.org>
Run-TryBot: Julie Qiu <julie@golang.org>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
TryBot-Result: kokoro <noreply+kokoro@google.com>
diff --git a/internal/fetch/fetch.go b/internal/fetch/fetch.go
index 4b4b554..25a1437 100644
--- a/internal/fetch/fetch.go
+++ b/internal/fetch/fetch.go
@@ -7,35 +7,17 @@
import (
"archive/zip"
- "bytes"
"context"
"errors"
"fmt"
- "go/ast"
- "go/build"
- "go/parser"
- "go/token"
- "io"
- "io/ioutil"
- "math"
"net/http"
- "os"
"path"
- "runtime"
- "runtime/debug"
- "sort"
- "strconv"
- "strings"
"time"
- "github.com/google/safehtml/template"
"go.opencensus.io/trace"
"golang.org/x/mod/modfile"
- "golang.org/x/mod/module"
"golang.org/x/pkgsite/internal"
"golang.org/x/pkgsite/internal/derrors"
- "golang.org/x/pkgsite/internal/fetch/dochtml"
- "golang.org/x/pkgsite/internal/fetch/internal/doc"
"golang.org/x/pkgsite/internal/licenses"
"golang.org/x/pkgsite/internal/log"
"golang.org/x/pkgsite/internal/proxy"
@@ -248,265 +230,6 @@
return fmt.Sprintf("%s@%s", modulePath, version)
}
-// extractReadmesFromZip returns the file path and contents of all files from r
-// that are README files.
-func extractReadmesFromZip(modulePath, resolvedVersion string, r *zip.Reader) ([]*internal.Readme, error) {
- var readmes []*internal.Readme
- for _, zipFile := range r.File {
- if isReadme(zipFile.Name) {
- if zipFile.UncompressedSize64 > MaxFileSize {
- return nil, fmt.Errorf("file size %d exceeds max limit %d", zipFile.UncompressedSize64, MaxFileSize)
- }
- c, err := readZipFile(zipFile, MaxFileSize)
- if err != nil {
- return nil, err
- }
- readmes = append(readmes, &internal.Readme{
- Filepath: strings.TrimPrefix(zipFile.Name, moduleVersionDir(modulePath, resolvedVersion)+"/"),
- Contents: string(c),
- })
-
- }
- }
- return readmes, nil
-}
-
-var excludedReadmeExts = map[string]bool{".go": true, ".vendor": true}
-
-// isReadme reports whether file is README or if the base name of file, with or
-// without the extension, is equal to expectedFile. README.go files will return
-// false. It is case insensitive. It operates on '/'-separated paths.
-func isReadme(file string) bool {
- const expectedFile = "README"
- base := path.Base(file)
- ext := path.Ext(base)
- return !excludedReadmeExts[ext] && strings.EqualFold(strings.TrimSuffix(base, ext), expectedFile)
-}
-
-// extractPackagesFromZip returns a slice of packages from the module zip r.
-// It matches against the given licenses to determine the subset of licenses
-// that applies to each package.
-// The second return value says whether any packages are "incomplete," meaning
-// that they contained .go files but couldn't be processed due to current
-// limitations of this site. The limitations are:
-// * a maximum file size (MaxFileSize)
-// * the particular set of build contexts we consider (goEnvs)
-// * whether the import path is valid.
-func extractPackagesFromZip(ctx context.Context, modulePath, resolvedVersion string, r *zip.Reader, d *licenses.Detector, sourceInfo *source.Info) (_ []*internal.LegacyPackage, _ []*internal.PackageVersionState, err error) {
- ctx, span := trace.StartSpan(ctx, "fetch.extractPackagesFromZip")
- defer span.End()
- defer func() {
- if e := recover(); e != nil {
- // The package processing code performs some sanity checks along the way.
- // None of the panics should occur, but if they do, we want to log them and
- // be able to find them. So, convert internal panics to internal errors here.
- err = fmt.Errorf("internal panic: %v\n\n%s", e, debug.Stack())
- }
- }()
-
- // The high-level approach is to split the processing of the zip file
- // into two phases:
- //
- // 1. loop over all files, looking at file metadata only
- // 2. process all files by reading their contents
- //
- // During phase 1, we populate the dirs map for each directory
- // that contains at least one .go file.
-
- var (
- // modulePrefix is the "<module>@<resolvedVersion>/" prefix that all files
- // are expected to have according to the zip archive layout specification
- // at the bottom of https://golang.org/cmd/go/#hdr-Module_proxy_protocol.
- modulePrefix = moduleVersionDir(modulePath, resolvedVersion) + "/"
-
- // dirs is the set of directories with at least one .go file,
- // to be populated during phase 1 and used during phase 2.
- //
- // The map key is the directory path, with the modulePrefix trimmed.
- // The map value is a slice of all .go files, and no other files.
- dirs = make(map[string][]*zip.File)
-
- // modInfo contains all the module information a package in the module
- // needs to render its documentation, to be populated during phase 1
- // and used during phase 2.
- modInfo = &dochtml.ModuleInfo{
- ModulePath: modulePath,
- ResolvedVersion: resolvedVersion,
- ModulePackages: make(map[string]bool),
- }
-
- // incompleteDirs tracks directories for which we have incomplete
- // information, due to a problem processing one of the go files contained
- // therein. We use this so that a single unprocessable package does not
- // prevent processing of other packages in the module.
- incompleteDirs = make(map[string]bool)
- packageVersionStates = []*internal.PackageVersionState{}
- )
-
- // Phase 1.
- // Loop over zip files preemptively and check for problems
- // that can be detected by looking at metadata alone.
- // We'll be looking at file contents starting with phase 2 only,
- // only after we're sure this phase passed without errors.
- for _, f := range r.File {
- if f.Mode().IsDir() {
- // While "go mod download" will never put a directory in a zip, any can serve their
- // own zips. Example: go.felesatra.moe/binpack@v0.1.0.
- // Directory entries are harmless, so we just ignore them.
- continue
- }
- if !strings.HasPrefix(f.Name, modulePrefix) {
- // Well-formed module zips have all files under modulePrefix.
- return nil, nil, fmt.Errorf("expected file to have prefix %q; got = %q: %w",
- modulePrefix, f.Name, errMalformedZip)
- }
- innerPath := path.Dir(f.Name[len(modulePrefix):])
- if incompleteDirs[innerPath] {
- // We already know this directory cannot be processed, so skip.
- continue
- }
- importPath := path.Join(modulePath, innerPath)
- if ignoredByGoTool(importPath) || isVendored(importPath) {
- // File is in a directory we're not looking to process at this time, so skip it.
- continue
- }
- if !strings.HasSuffix(f.Name, ".go") {
- // We care about .go files only.
- continue
- }
- // It's possible to have a Go package in a directory that does not result in a valid import path.
- // That package cannot be imported, but that may be fine if it's a main package, intended to built
- // and run from that directory.
- // Example: https://github.com/postmannen/go-learning/blob/master/concurrency/01-sending%20numbers%20and%20receving%20numbers%20from%20a%20channel/main.go
- // We're not set up to handle invalid import paths, so skip these packages.
- if err := module.CheckImportPath(importPath); err != nil {
- incompleteDirs[innerPath] = true
- packageVersionStates = append(packageVersionStates, &internal.PackageVersionState{
- ModulePath: modulePath,
- PackagePath: importPath,
- Version: resolvedVersion,
- Status: derrors.ToStatus(derrors.PackageBadImportPath),
- Error: err.Error(),
- })
- continue
- }
- if f.UncompressedSize64 > MaxFileSize {
- incompleteDirs[innerPath] = true
- status := derrors.ToStatus(derrors.PackageMaxFileSizeLimitExceeded)
- err := fmt.Sprintf("Unable to process %s: file size %d exceeds max limit %d",
- f.Name, f.UncompressedSize64, MaxFileSize)
- packageVersionStates = append(packageVersionStates, &internal.PackageVersionState{
- ModulePath: modulePath,
- PackagePath: importPath,
- Version: resolvedVersion,
- Status: status,
- Error: err,
- })
- continue
- }
- dirs[innerPath] = append(dirs[innerPath], f)
- if len(dirs) > maxPackagesPerModule {
- return nil, nil, fmt.Errorf("%d packages found in %q; exceeds limit %d for maxPackagePerModule", len(dirs), modulePath, maxPackagesPerModule)
- }
- }
- for pkgName := range dirs {
- modInfo.ModulePackages[path.Join(modulePath, pkgName)] = true
- }
-
- // Phase 2.
- // If we got this far, the file metadata was okay.
- // Start reading the file contents now to extract information
- // about Go packages.
- var pkgs []*internal.LegacyPackage
- for innerPath, goFiles := range dirs {
- if incompleteDirs[innerPath] {
- // Something went wrong when processing this directory, so we skip.
- log.Infof(ctx, "Skipping %q because it is incomplete", innerPath)
- continue
- }
-
- var (
- status error
- errMsg string
- )
- pkg, err := loadPackage(ctx, goFiles, innerPath, sourceInfo, modInfo)
- if bpe := (*BadPackageError)(nil); errors.As(err, &bpe) {
- incompleteDirs[innerPath] = true
- status = derrors.PackageInvalidContents
- errMsg = err.Error()
- } else if errors.Is(err, dochtml.ErrTooLarge) {
- status = derrors.PackageDocumentationHTMLTooLarge
- errMsg = err.Error()
- } else if err != nil {
- return nil, nil, fmt.Errorf("unexpected error loading package: %v", err)
- }
-
- var pkgPath string
- if pkg == nil {
- // No package.
- if len(goFiles) > 0 {
- // There were go files, but no build contexts matched them.
- incompleteDirs[innerPath] = true
- status = derrors.PackageBuildContextNotSupported
- }
- pkgPath = path.Join(modulePath, innerPath)
- } else {
- if d != nil { // should only be nil for tests
- isRedist, lics := d.PackageInfo(innerPath)
- pkg.IsRedistributable = isRedist
- for _, l := range lics {
- pkg.Licenses = append(pkg.Licenses, l.Metadata)
- }
- }
- pkgs = append(pkgs, pkg)
- pkgPath = pkg.Path
- }
- code := http.StatusOK
- if status != nil {
- code = derrors.ToStatus(status)
- }
- packageVersionStates = append(packageVersionStates, &internal.PackageVersionState{
- ModulePath: modulePath,
- PackagePath: pkgPath,
- Version: resolvedVersion,
- Status: code,
- Error: errMsg,
- })
- }
- if len(pkgs) == 0 {
- return nil, packageVersionStates, errModuleContainsNoPackages
- }
- return pkgs, packageVersionStates, nil
-}
-
-// ignoredByGoTool reports whether the given import path corresponds
-// to a directory that would be ignored by the go tool.
-//
-// The logic of the go tool for ignoring directories is documented at
-// https://golang.org/cmd/go/#hdr-Package_lists_and_patterns:
-//
-// LegacyDirectory and file names that begin with "." or "_" are ignored
-// by the go tool, as are directories named "testdata".
-//
-func ignoredByGoTool(importPath string) bool {
- for _, el := range strings.Split(importPath, "/") {
- if strings.HasPrefix(el, ".") || strings.HasPrefix(el, "_") || el == "testdata" {
- return true
- }
- }
- return false
-}
-
-// isVendored reports whether the given import path corresponds
-// to a Go package that is inside a vendor directory.
-//
-// The logic for what is considered a vendor directory is documented at
-// https://golang.org/cmd/go/#hdr-Vendor_Directories.
-func isVendored(importPath string) bool {
- return strings.HasPrefix(importPath, "vendor/") ||
- strings.Contains(importPath, "/vendor/")
-}
-
// zipContainsFilename reports whether there is a file with the given name in the zip.
func zipContainsFilename(r *zip.Reader, name string) bool {
for _, f := range r.File {
@@ -516,319 +239,3 @@
}
return false
}
-
-// BadPackageError represents an error loading a package
-// because its contents do not make up a valid package.
-//
-// This can happen, for example, if the .go files fail
-// to parse or declare different package names.
-type BadPackageError struct {
- Err error // Not nil.
-}
-
-func (bpe *BadPackageError) Error() string { return bpe.Err.Error() }
-
-// Go environments used to construct build contexts in loadPackage.
-var goEnvs = []struct{ GOOS, GOARCH string }{
- {"linux", "amd64"},
- {"windows", "amd64"},
- {"darwin", "amd64"},
- {"js", "wasm"},
- {"linux", "js"},
-}
-
-// loadPackage loads a Go package by calling loadPackageWithBuildContext, trying
-// several build contexts in turn. The first build context in the list to produce
-// a non-empty package is used. If none of them result in a package, then
-// loadPackage returns nil, nil.
-//
-// If the package is fine except that its documentation is too large, loadPackage
-// returns both a package and a non-nil error with dochtml.ErrTooLarge in its chain.
-func loadPackage(ctx context.Context, zipGoFiles []*zip.File, innerPath string, sourceInfo *source.Info, modInfo *dochtml.ModuleInfo) (*internal.LegacyPackage, error) {
- ctx, span := trace.StartSpan(ctx, "fetch.loadPackage")
- defer span.End()
- for _, env := range goEnvs {
- pkg, err := loadPackageWithBuildContext(ctx, env.GOOS, env.GOARCH, zipGoFiles, innerPath, sourceInfo, modInfo)
- if err != nil && !errors.Is(err, dochtml.ErrTooLarge) {
- return nil, err
- }
- if pkg != nil {
- return pkg, err
- }
- }
- return nil, nil
-}
-
-// httpPost allows package fetch tests to stub out playground URL fetches.
-var httpPost = http.Post
-
-const docTooLargeReplacement = `<p>Documentation is too large to display.</p>`
-
-// loadPackageWithBuildContext loads a Go package made of .go files in zipGoFiles
-// using a build context constructed from the given GOOS and GOARCH values.
-// modulePath is stdlib.ModulePath for the Go standard library and the module
-// path for all other modules. innerPath is the path of the Go package directory
-// relative to the module root.
-//
-// zipGoFiles must contain only .go files that have been verified
-// to be of reasonable size.
-//
-// The returned LegacyPackage.Licenses field is not populated.
-//
-// It returns a nil LegacyPackage if the directory doesn't contain a Go package
-// or all .go files have been excluded by constraints.
-// A *BadPackageError error is returned if the directory
-// contains .go files but do not make up a valid package.
-func loadPackageWithBuildContext(ctx context.Context, goos, goarch string, zipGoFiles []*zip.File, innerPath string, sourceInfo *source.Info, modInfo *dochtml.ModuleInfo) (_ *internal.LegacyPackage, err error) {
- modulePath := modInfo.ModulePath
- defer derrors.Wrap(&err, "loadPackageWithBuildContext(%q, %q, zipGoFiles, %q, %q, %+v)",
- goos, goarch, innerPath, modulePath, sourceInfo)
- // Apply build constraints to get a map from matching file names to their contents.
- files, err := matchingFiles(goos, goarch, zipGoFiles)
- if err != nil {
- return nil, err
- }
-
- // Parse .go files and add them to the goFiles slice.
- var (
- fset = token.NewFileSet()
- goFiles = make(map[string]*ast.File)
- allGoFiles []*ast.File
- packageName string
- packageNameFile string // Name of file where packageName came from.
- )
- for name, b := range files {
- pf, err := parser.ParseFile(fset, name, b, parser.ParseComments)
- if err != nil {
- if pf == nil {
- return nil, fmt.Errorf("internal error: the source couldn't be read: %v", err)
- }
- return nil, &BadPackageError{Err: err}
- }
- allGoFiles = append(allGoFiles, pf)
- if strings.HasSuffix(name, "_test.go") {
- continue
- }
- goFiles[name] = pf
- if len(goFiles) == 1 {
- packageName = pf.Name.Name
- packageNameFile = name
- } else if pf.Name.Name != packageName {
- return nil, &BadPackageError{Err: &build.MultiplePackageError{
- Dir: innerPath,
- Packages: []string{packageName, pf.Name.Name},
- Files: []string{packageNameFile, name},
- }}
- }
- }
- if len(goFiles) == 0 {
- // This directory doesn't contain a package, or at least not one
- // that matches this build context.
- return nil, nil
- }
-
- // The "builtin" package in the standard library is a special case.
- // We want to show documentation for all globals (not just exported ones),
- // and avoid association of consts, vars, and factory functions with types
- // since it's not helpful (see golang.org/issue/6645).
- var noFiltering, noTypeAssociation bool
- if modulePath == stdlib.ModulePath && innerPath == "builtin" {
- noFiltering = true
- noTypeAssociation = true
- }
-
- // Compute package documentation.
- importPath := path.Join(modulePath, innerPath)
- var m doc.Mode
- if noFiltering {
- m |= doc.AllDecls
- }
- d, err := doc.NewFromFiles(fset, allGoFiles, importPath, m)
- if err != nil {
- return nil, fmt.Errorf("doc.NewFromFiles: %v", err)
- }
- if d.ImportPath != importPath || d.Name != packageName {
- panic(fmt.Errorf("internal error: *doc.Package has an unexpected import path (%q != %q) or package name (%q != %q)", d.ImportPath, importPath, d.Name, packageName))
- }
- if noTypeAssociation {
- for _, t := range d.Types {
- d.Consts, t.Consts = append(d.Consts, t.Consts...), nil
- d.Vars, t.Vars = append(d.Vars, t.Vars...), nil
- d.Funcs, t.Funcs = append(d.Funcs, t.Funcs...), nil
- }
- sort.Slice(d.Funcs, func(i, j int) bool { return d.Funcs[i].Name < d.Funcs[j].Name })
- }
-
- // Process package imports.
- if len(d.Imports) > maxImportsPerPackage {
- return nil, fmt.Errorf("%d imports found package %q; exceeds limit %d for maxImportsPerPackage", len(d.Imports), importPath, maxImportsPerPackage)
- }
-
- // Render documentation HTML.
- sourceLinkFunc := func(n ast.Node) string {
- if sourceInfo == nil {
- return ""
- }
- p := fset.Position(n.Pos())
- if p.Line == 0 { // invalid Position
- return ""
- }
- return sourceInfo.LineURL(path.Join(innerPath, p.Filename), p.Line)
- }
- fileLinkFunc := func(filename string) string {
- if sourceInfo == nil {
- return ""
- }
- return sourceInfo.FileURL(path.Join(innerPath, filename))
- }
-
- docHTML, err := dochtml.Render(ctx, fset, d, dochtml.RenderOptions{
- FileLinkFunc: fileLinkFunc,
- SourceLinkFunc: sourceLinkFunc,
- ModInfo: modInfo,
- Limit: int64(MaxDocumentationHTML),
- })
- if errors.Is(err, dochtml.ErrTooLarge) {
- docHTML = template.MustParseAndExecuteToHTML(docTooLargeReplacement)
- } else if err != nil {
- return nil, fmt.Errorf("dochtml.Render: %v", err)
- }
- if modulePath == stdlib.ModulePath {
- importPath = innerPath
- }
- v1path := internal.V1Path(importPath, modulePath)
- return &internal.LegacyPackage{
- Path: importPath,
- Name: packageName,
- Synopsis: doc.Synopsis(d.Doc),
- V1Path: v1path,
- Imports: d.Imports,
- DocumentationHTML: docHTML,
- GOOS: goos,
- GOARCH: goarch,
- }, err
-}
-
-// matchingFiles returns a map from file names to their contents, read from zipGoFiles.
-// It includes only those files that match the build context determined by goos and goarch.
-func matchingFiles(goos, goarch string, zipGoFiles []*zip.File) (files map[string][]byte, err error) {
- defer derrors.Wrap(&err, "matchingFiles(%q, %q, zipGoFiles)", goos, goarch)
- // Populate the map with all the zip files.
- files = make(map[string][]byte)
- for _, f := range zipGoFiles {
- _, name := path.Split(f.Name)
- b, err := readZipFile(f, MaxFileSize)
- if err != nil {
- return nil, err
- }
- files[name] = b
- }
-
- // bctx is used to make decisions about which of the .go files are included
- // by build constraints.
- bctx := &build.Context{
- GOOS: goos,
- GOARCH: goarch,
- CgoEnabled: true,
- Compiler: build.Default.Compiler,
- ReleaseTags: build.Default.ReleaseTags,
-
- JoinPath: path.Join,
- OpenFile: func(name string) (io.ReadCloser, error) {
- return ioutil.NopCloser(bytes.NewReader(files[name])), nil
- },
-
- // If left nil, the default implementations of these read from disk,
- // which we do not want. None of these functions should be used
- // inside this function; it would be an internal error if they are.
- // Set them to non-nil values to catch if that happens.
- SplitPathList: func(string) []string { panic("internal error: unexpected call to SplitPathList") },
- IsAbsPath: func(string) bool { panic("internal error: unexpected call to IsAbsPath") },
- IsDir: func(string) bool { panic("internal error: unexpected call to IsDir") },
- HasSubdir: func(string, string) (string, bool) { panic("internal error: unexpected call to HasSubdir") },
- ReadDir: func(string) ([]os.FileInfo, error) { panic("internal error: unexpected call to ReadDir") },
- }
-
- for name := range files {
- match, err := bctx.MatchFile(".", name) // This will access the file we just added to files map above.
- if err != nil {
- return nil, &BadPackageError{Err: fmt.Errorf(`bctx.MatchFile(".", %q): %w`, name, err)}
- }
- if !match {
- // Excluded by build context.
- delete(files, name)
- }
- }
- return files, nil
-}
-
-// readZipFile decompresses zip file f and returns its uncompressed contents.
-// The caller can check f.UncompressedSize64 before calling readZipFile to
-// get the expected uncompressed size of f.
-//
-// limit is the maximum number of bytes to read.
-func readZipFile(f *zip.File, limit int64) (_ []byte, err error) {
- defer derrors.Add(&err, "readZipFile(%q)", f.Name)
-
- r, err := f.Open()
- if err != nil {
- return nil, fmt.Errorf("f.Open(): %v", err)
- }
- b, err := ioutil.ReadAll(io.LimitReader(r, limit))
- if err != nil {
- r.Close()
- return nil, fmt.Errorf("ioutil.ReadAll(r): %v", err)
- }
- if err := r.Close(); err != nil {
- return nil, fmt.Errorf("closing: %v", err)
- }
- return b, nil
-}
-
-func allocMeg() int {
- var ms runtime.MemStats
- runtime.ReadMemStats(&ms)
- return int(ms.Alloc / (1024 * 1024))
-}
-
-// mib is the number of bytes in a mebibyte (Mi).
-const mib = 1024 * 1024
-
-// The largest module zip size we can comfortably process.
-// We probably will OOM if we process a module whose zip is larger.
-var maxModuleZipSize int64 = math.MaxInt64
-
-func init() {
- m := os.Getenv("GO_DISCOVERY_MAX_MODULE_ZIP_MI")
- if m != "" {
- v, err := strconv.ParseInt(m, 10, 64)
- if err != nil {
- log.Errorf(context.Background(), "could not parse GO_DISCOVERY_MAX_MODULE_ZIP_MI value %q", v)
- } else {
- maxModuleZipSize = v * mib
- }
- }
-}
-
-var zipLoadShedder = loadShedder{maxSizeInFlight: math.MaxUint64}
-
-func init() {
- ctx := context.Background()
- m := os.Getenv("GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI")
- if m != "" {
- mebis, err := strconv.ParseUint(m, 10, 64)
- if err != nil {
- log.Fatalf(ctx, "could not parse GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI value %q", m)
- } else if mebis == 0 {
- log.Fatalf(ctx, "bad value for GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI: %d. Must be >= 1.", mebis)
- } else {
- log.Infof(ctx, "shedding load over %dMi", mebis)
- zipLoadShedder.maxSizeInFlight = mebis * mib
- }
- }
-}
-
-// ZipLoadShedStats returns a snapshot of the current LoadShedStats for zip files.
-func ZipLoadShedStats() LoadShedStats {
- return zipLoadShedder.stats()
-}
diff --git a/internal/fetch/fetch_test.go b/internal/fetch/fetch_test.go
index 66efa57..afe50be 100644
--- a/internal/fetch/fetch_test.go
+++ b/internal/fetch/fetch_test.go
@@ -5,14 +5,11 @@
package fetch
import (
- "archive/zip"
- "bytes"
"context"
"errors"
"io"
"net/http"
"net/http/httptest"
- "sort"
"testing"
"time"
@@ -24,7 +21,6 @@
"golang.org/x/pkgsite/internal/source"
"golang.org/x/pkgsite/internal/stdlib"
"golang.org/x/pkgsite/internal/testing/sample"
- "golang.org/x/pkgsite/internal/testing/testhelper"
)
var (
@@ -154,236 +150,3 @@
})
}
}
-
-func TestExtractReadmesFromZip(t *testing.T) {
- stdlib.UseTestData = true
-
- ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
- defer cancel()
-
- sortReadmes := func(readmes []*internal.Readme) {
- sort.Slice(readmes, func(i, j int) bool {
- return readmes[i].Filepath < readmes[j].Filepath
- })
- }
-
- for _, test := range []struct {
- modulePath, version string
- files map[string]string
- want []*internal.Readme
- }{
- {
- modulePath: stdlib.ModulePath,
- version: "v1.12.5",
- want: []*internal.Readme{
- {
- Filepath: "README.md",
- Contents: "# The Go Programming Language\n",
- },
- {
- Filepath: "cmd/pprof/README",
- Contents: "This directory is the copy of Google's pprof shipped as part of the Go distribution.\n",
- },
- },
- },
- {
- modulePath: "github.com/my/module",
- version: "v1.0.0",
- files: map[string]string{
- "README.md": "README FILE FOR TESTING.",
- "foo/README": "Another README",
- },
- want: []*internal.Readme{
- {
- Filepath: "README.md",
- Contents: "README FILE FOR TESTING.",
- },
- {
- Filepath: "foo/README",
- Contents: "Another README",
- },
- },
- },
- {
- modulePath: "emp.ty/module",
- version: "v1.0.0",
- files: map[string]string{},
- },
- } {
- t.Run(test.modulePath, func(t *testing.T) {
- var (
- reader *zip.Reader
- err error
- )
- if test.modulePath == stdlib.ModulePath {
- reader, _, err = stdlib.Zip(test.version)
- if err != nil {
- t.Fatal(err)
- }
- } else {
- proxyClient, teardownProxy := proxy.SetupTestClient(t, []*proxy.Module{
- {ModulePath: test.modulePath, Files: test.files}})
- defer teardownProxy()
- reader, err = proxyClient.GetZip(ctx, test.modulePath, "v1.0.0")
- if err != nil {
- t.Fatal(err)
- }
- }
-
- got, err := extractReadmesFromZip(test.modulePath, test.version, reader)
- if err != nil {
- t.Fatal(err)
- }
-
- sortReadmes(test.want)
- sortReadmes(got)
- if diff := cmp.Diff(test.want, got); diff != "" {
- t.Errorf("mismatch (-want +got):\n%s", diff)
- }
- })
- }
-}
-
-func TestIsReadme(t *testing.T) {
- for _, test := range []struct {
- name, file string
- want bool
- }{
- {
- name: "README in nested dir returns true",
- file: "github.com/my/module@v1.0.0/README.md",
- want: true,
- },
- {
- name: "case insensitive",
- file: "rEaDme",
- want: true,
- },
- {
- name: "random extension returns true",
- file: "README.FOO",
- want: true,
- },
- {
- name: "{prefix}readme will return false",
- file: "FOO_README",
- want: false,
- },
- {
- file: "README_FOO",
- name: "readme{suffix} will return false",
- want: false,
- },
- {
- file: "README.FOO.FOO",
- name: "README file with multiple extensions will return false",
- want: false,
- },
- {
- file: "readme.go",
- name: ".go README file will return false",
- want: false,
- },
- {
- file: "readme.vendor",
- name: ".vendor README file will return false",
- want: false,
- },
- {
- file: "",
- name: "empty filename returns false",
- want: false,
- },
- } {
- {
- t.Run(test.file, func(t *testing.T) {
- if got := isReadme(test.file); got != test.want {
- t.Errorf("isReadme(%q) = %t: %t", test.file, got, test.want)
- }
- })
- }
- }
-}
-
-func TestMatchingFiles(t *testing.T) {
- plainGoBody := `
- package plain
- type Value int`
- jsGoBody := `
- // +build js,wasm
-
- // Package js only works with wasm.
- package js
- type Value int`
-
- plainContents := map[string]string{
- "README.md": "THIS IS A README",
- "LICENSE.md": testhelper.MITLicense,
- "plain/plain.go": plainGoBody,
- }
-
- jsContents := map[string]string{
- "README.md": "THIS IS A README",
- "LICENSE.md": testhelper.MITLicense,
- "js/js.go": jsGoBody,
- }
- for _, test := range []struct {
- name string
- goos, goarch string
- contents map[string]string
- want map[string][]byte
- }{
- {
- name: "plain-linux",
- goos: "linux",
- goarch: "amd64",
- contents: plainContents,
- want: map[string][]byte{
- "plain.go": []byte(plainGoBody),
- },
- },
- {
- name: "plain-js",
- goos: "js",
- goarch: "wasm",
- contents: plainContents,
- want: map[string][]byte{
- "plain.go": []byte(plainGoBody),
- },
- },
- {
- name: "wasm-linux",
- goos: "linux",
- goarch: "amd64",
- contents: jsContents,
- want: map[string][]byte{},
- },
- {
- name: "wasm-js",
- goos: "js",
- goarch: "wasm",
- contents: jsContents,
- want: map[string][]byte{
- "js.go": []byte(jsGoBody),
- },
- },
- } {
- t.Run(test.name, func(t *testing.T) {
- data, err := testhelper.ZipContents(test.contents)
- if err != nil {
- t.Fatal(err)
- }
- r, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
- if err != nil {
- t.Fatal(err)
- }
- got, err := matchingFiles(test.goos, test.goarch, r.File)
- if err != nil {
- t.Fatal(err)
- }
- if diff := cmp.Diff(test.want, got); diff != "" {
- t.Errorf("mismatch (-want +got):\n%s", diff)
- }
- })
- }
-}
diff --git a/internal/fetch/load.go b/internal/fetch/load.go
new file mode 100644
index 0000000..a9329b4
--- /dev/null
+++ b/internal/fetch/load.go
@@ -0,0 +1,354 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package fetch provides a way to fetch modules from a proxy.
+package fetch
+
+import (
+ "archive/zip"
+ "bytes"
+ "context"
+ "errors"
+ "fmt"
+ "go/ast"
+ "go/build"
+ "go/parser"
+ "go/token"
+ "io"
+ "io/ioutil"
+ "math"
+ "net/http"
+ "os"
+ "path"
+ "runtime"
+ "sort"
+ "strconv"
+ "strings"
+
+ "github.com/google/safehtml/template"
+ "go.opencensus.io/trace"
+ "golang.org/x/pkgsite/internal"
+ "golang.org/x/pkgsite/internal/derrors"
+ "golang.org/x/pkgsite/internal/fetch/dochtml"
+ "golang.org/x/pkgsite/internal/fetch/internal/doc"
+ "golang.org/x/pkgsite/internal/log"
+ "golang.org/x/pkgsite/internal/source"
+ "golang.org/x/pkgsite/internal/stdlib"
+)
+
+// BadPackageError represents an error loading a package
+// because its contents do not make up a valid package.
+//
+// This can happen, for example, if the .go files fail
+// to parse or declare different package names.
+type BadPackageError struct {
+	Err error // The underlying cause; must not be nil because Error dereferences it.
+}
+
+func (bpe *BadPackageError) Error() string { return bpe.Err.Error() } // Error reports Err's message unchanged.
+
+// goEnvs lists the GOOS/GOARCH pairs that loadPackage tries, in order, as build contexts.
+var goEnvs = []struct{ GOOS, GOARCH string }{
+	{"linux", "amd64"},
+	{"windows", "amd64"},
+	{"darwin", "amd64"},
+	{"js", "wasm"},
+	{"linux", "js"}, // NOTE(review): "js" is normally a GOOS rather than a GOARCH; confirm this pair is intended.
+}
+
+// loadPackage loads a Go package by calling loadPackageWithBuildContext, trying
+// the build contexts in goEnvs in turn. The first build context in the list to
+// produce a non-nil package is used. If none of them result in a package, then
+// loadPackage returns nil, nil.
+//
+// If the package is fine except that its documentation is too large, loadPackage
+// returns both a package and a non-nil error with dochtml.ErrTooLarge in its chain.
+func loadPackage(ctx context.Context, zipGoFiles []*zip.File, innerPath string, sourceInfo *source.Info, modInfo *dochtml.ModuleInfo) (*internal.LegacyPackage, error) {
+	ctx, span := trace.StartSpan(ctx, "fetch.loadPackage")
+	defer span.End()
+	for _, env := range goEnvs {
+		pkg, err := loadPackageWithBuildContext(ctx, env.GOOS, env.GOARCH, zipGoFiles, innerPath, sourceInfo, modInfo)
+		if err != nil && !errors.Is(err, dochtml.ErrTooLarge) {
+			return nil, err // Any failure other than ErrTooLarge stops the search immediately.
+		}
+		if pkg != nil {
+			return pkg, err // err is nil or has ErrTooLarge in its chain here.
+		}
+	}
+	return nil, nil
+}
+
+// httpPost allows package fetch tests to stub out playground URL fetches.
+var httpPost = http.Post
+
+const docTooLargeReplacement = `<p>Documentation is too large to display.</p>` // Substituted for the rendered docs when dochtml.Render reports ErrTooLarge.
+
+// loadPackageWithBuildContext loads a Go package made of .go files in zipGoFiles
+// using a build context constructed from the given GOOS and GOARCH values.
+// modInfo.ModulePath is stdlib.ModulePath for the Go standard library and the module
+// path for all other modules. innerPath is the path of the Go package directory
+// relative to the module root.
+//
+// zipGoFiles must contain only .go files that have been verified
+// to be of reasonable size.
+//
+// The returned LegacyPackage.Licenses field is not populated.
+//
+// It returns a nil LegacyPackage if the directory doesn't contain a Go package
+// or all .go files have been excluded by constraints.
+// A *BadPackageError error is returned if the directory
+// contains .go files that do not make up a valid package.
+func loadPackageWithBuildContext(ctx context.Context, goos, goarch string, zipGoFiles []*zip.File, innerPath string, sourceInfo *source.Info, modInfo *dochtml.ModuleInfo) (_ *internal.LegacyPackage, err error) {
+	modulePath := modInfo.ModulePath
+	defer derrors.Wrap(&err, "loadPackageWithBuildContext(%q, %q, zipGoFiles, %q, %q, %+v)",
+		goos, goarch, innerPath, modulePath, sourceInfo)
+	// Apply build constraints to get a map from matching file names to their contents.
+	files, err := matchingFiles(goos, goarch, zipGoFiles)
+	if err != nil {
+		return nil, err
+	}
+
+	// Parse the matching .go files; non-test files are also recorded in the goFiles map.
+	var (
+		fset            = token.NewFileSet()
+		goFiles         = make(map[string]*ast.File)
+		allGoFiles      []*ast.File
+		packageName     string
+		packageNameFile string // Name of file where packageName came from.
+	)
+	for name, b := range files { // Map iteration order is unspecified, so which file seeds packageName can vary.
+		pf, err := parser.ParseFile(fset, name, b, parser.ParseComments)
+		if err != nil {
+			if pf == nil {
+				return nil, fmt.Errorf("internal error: the source couldn't be read: %v", err)
+			}
+			return nil, &BadPackageError{Err: err}
+		}
+		allGoFiles = append(allGoFiles, pf)
+		if strings.HasSuffix(name, "_test.go") {
+			continue // Test files are passed to doc.NewFromFiles but skip the package-name check.
+		}
+		goFiles[name] = pf
+		if len(goFiles) == 1 {
+			packageName = pf.Name.Name
+			packageNameFile = name
+		} else if pf.Name.Name != packageName {
+			return nil, &BadPackageError{Err: &build.MultiplePackageError{
+				Dir:      innerPath,
+				Packages: []string{packageName, pf.Name.Name},
+				Files:    []string{packageNameFile, name},
+			}}
+		}
+	}
+	if len(goFiles) == 0 {
+		// This directory doesn't contain a package, or at least not one
+		// that matches this build context.
+		return nil, nil
+	}
+
+	// The "builtin" package in the standard library is a special case.
+	// We want to show documentation for all globals (not just exported ones),
+	// and avoid association of consts, vars, and factory functions with types
+	// since it's not helpful (see golang.org/issue/6645).
+	var noFiltering, noTypeAssociation bool
+	if modulePath == stdlib.ModulePath && innerPath == "builtin" {
+		noFiltering = true
+		noTypeAssociation = true
+	}
+
+	// Compute package documentation.
+	importPath := path.Join(modulePath, innerPath)
+	var m doc.Mode
+	if noFiltering {
+		m |= doc.AllDecls
+	}
+	d, err := doc.NewFromFiles(fset, allGoFiles, importPath, m)
+	if err != nil {
+		return nil, fmt.Errorf("doc.NewFromFiles: %v", err)
+	}
+	if d.ImportPath != importPath || d.Name != packageName {
+		panic(fmt.Errorf("internal error: *doc.Package has an unexpected import path (%q != %q) or package name (%q != %q)", d.ImportPath, importPath, d.Name, packageName))
+	}
+	if noTypeAssociation {
+		for _, t := range d.Types {
+			d.Consts, t.Consts = append(d.Consts, t.Consts...), nil
+			d.Vars, t.Vars = append(d.Vars, t.Vars...), nil
+			d.Funcs, t.Funcs = append(d.Funcs, t.Funcs...), nil
+		}
+		sort.Slice(d.Funcs, func(i, j int) bool { return d.Funcs[i].Name < d.Funcs[j].Name })
+	}
+
+	// Reject packages whose import count exceeds maxImportsPerPackage.
+	if len(d.Imports) > maxImportsPerPackage {
+		return nil, fmt.Errorf("%d imports found package %q; exceeds limit %d for maxImportsPerPackage", len(d.Imports), importPath, maxImportsPerPackage)
+	}
+
+	// Render documentation HTML. Both link helpers yield "" when sourceInfo is nil.
+	sourceLinkFunc := func(n ast.Node) string {
+		if sourceInfo == nil {
+			return ""
+		}
+		p := fset.Position(n.Pos())
+		if p.Line == 0 { // invalid Position
+			return ""
+		}
+		return sourceInfo.LineURL(path.Join(innerPath, p.Filename), p.Line)
+	}
+	fileLinkFunc := func(filename string) string {
+		if sourceInfo == nil {
+			return ""
+		}
+		return sourceInfo.FileURL(path.Join(innerPath, filename))
+	}
+
+	docHTML, err := dochtml.Render(ctx, fset, d, dochtml.RenderOptions{
+		FileLinkFunc:   fileLinkFunc,
+		SourceLinkFunc: sourceLinkFunc,
+		ModInfo:        modInfo,
+		Limit:          int64(MaxDocumentationHTML),
+	})
+	if errors.Is(err, dochtml.ErrTooLarge) {
+		docHTML = template.MustParseAndExecuteToHTML(docTooLargeReplacement)
+	} else if err != nil {
+		return nil, fmt.Errorf("dochtml.Render: %v", err)
+	}
+	if modulePath == stdlib.ModulePath {
+		importPath = innerPath // Standard-library packages are reported without the module path prefix.
+	}
+	v1path := internal.V1Path(importPath, modulePath)
+	return &internal.LegacyPackage{
+		Path:              importPath,
+		Name:              packageName,
+		Synopsis:          doc.Synopsis(d.Doc),
+		V1Path:            v1path,
+		Imports:           d.Imports,
+		DocumentationHTML: docHTML,
+		GOOS:              goos,
+		GOARCH:            goarch,
+	}, err // err is nil or has dochtml.ErrTooLarge in its chain.
+}
+
+// matchingFiles returns a map from base file names to their contents, read from zipGoFiles.
+// It includes only those files that match the build context determined by goos and goarch.
+func matchingFiles(goos, goarch string, zipGoFiles []*zip.File) (files map[string][]byte, err error) {
+	defer derrors.Wrap(&err, "matchingFiles(%q, %q, zipGoFiles)", goos, goarch)
+	// Populate the map with all the zip files.
+	files = make(map[string][]byte)
+	for _, f := range zipGoFiles {
+		_, name := path.Split(f.Name) // Key by base name; the directory part is dropped.
+		b, err := readZipFile(f, MaxFileSize)
+		if err != nil {
+			return nil, err
+		}
+		files[name] = b
+	}
+
+	// bctx is used to make decisions about which of the .go files are included
+	// by build constraints.
+	bctx := &build.Context{
+		GOOS:        goos,
+		GOARCH:      goarch,
+		CgoEnabled:  true,
+		Compiler:    build.Default.Compiler,
+		ReleaseTags: build.Default.ReleaseTags,
+
+		JoinPath: path.Join,
+		OpenFile: func(name string) (io.ReadCloser, error) {
+			return ioutil.NopCloser(bytes.NewReader(files[name])), nil // Serve contents from the in-memory map, never from disk.
+		},
+
+		// If left nil, the default implementations of these read from disk,
+		// which we do not want. None of these functions should be used
+		// inside this function; it would be an internal error if they are.
+		// Set them to non-nil values to catch if that happens.
+		SplitPathList: func(string) []string { panic("internal error: unexpected call to SplitPathList") },
+		IsAbsPath:     func(string) bool { panic("internal error: unexpected call to IsAbsPath") },
+		IsDir:         func(string) bool { panic("internal error: unexpected call to IsDir") },
+		HasSubdir:     func(string, string) (string, bool) { panic("internal error: unexpected call to HasSubdir") },
+		ReadDir:       func(string) ([]os.FileInfo, error) { panic("internal error: unexpected call to ReadDir") },
+	}
+
+	for name := range files {
+		match, err := bctx.MatchFile(".", name) // This will access the file we just added to files map above.
+		if err != nil {
+			return nil, &BadPackageError{Err: fmt.Errorf(`bctx.MatchFile(".", %q): %w`, name, err)}
+		}
+		if !match {
+			// Excluded by build context.
+			delete(files, name) // Deleting the current key while ranging over a map is safe in Go.
+		}
+	}
+	return files, nil
+}
+
+// readZipFile decompresses zip file f and returns its uncompressed contents.
+// The caller can check f.UncompressedSize64 before calling readZipFile to
+// get the expected uncompressed size of f.
+//
+// limit is the maximum number of bytes to read; anything past it is left unread without an error.
+func readZipFile(f *zip.File, limit int64) (_ []byte, err error) {
+	defer derrors.Add(&err, "readZipFile(%q)", f.Name)
+
+	r, err := f.Open()
+	if err != nil {
+		return nil, fmt.Errorf("f.Open(): %v", err)
+	}
+	b, err := ioutil.ReadAll(io.LimitReader(r, limit))
+	if err != nil {
+		r.Close() // Best-effort close; the read error is the one reported.
+		return nil, fmt.Errorf("ioutil.ReadAll(r): %v", err)
+	}
+	if err := r.Close(); err != nil {
+		return nil, fmt.Errorf("closing: %v", err)
+	}
+	return b, nil
+}
+
+func allocMeg() int { // allocMeg reports runtime.MemStats.Alloc converted to whole mebibytes.
+	var ms runtime.MemStats
+	runtime.ReadMemStats(&ms)
+	return int(ms.Alloc / (1024 * 1024)) // Integer division truncates any fractional mebibyte.
+}
+
+// mib is the number of bytes in a mebibyte (Mi).
+const mib = 1024 * 1024
+
+// The largest module zip size, in bytes, that we can comfortably process.
+// We probably will OOM if we process a module whose zip is larger.
+var maxModuleZipSize int64 = math.MaxInt64 // Default is the largest int64; init may replace it from the environment.
+
+// init overrides maxModuleZipSize from GO_DISCOVERY_MAX_MODULE_ZIP_MI (a count of Mi), when set.
+func init() {
+	if m := os.Getenv("GO_DISCOVERY_MAX_MODULE_ZIP_MI"); m != "" {
+		v, err := strconv.ParseInt(m, 10, 64)
+		if err != nil {
+			log.Errorf(context.Background(), "could not parse GO_DISCOVERY_MAX_MODULE_ZIP_MI value %q", m) // Log the raw text m; v is not the user's input when parsing fails.
+		} else {
+			maxModuleZipSize = v * mib
+		}
+	}
+}
+
+var zipLoadShedder = loadShedder{maxSizeInFlight: math.MaxUint64} // Default limit is the largest uint64; init may lower it.
+
+func init() { // Reads GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI (a count of Mi) into zipLoadShedder.maxSizeInFlight.
+	ctx := context.Background()
+	m := os.Getenv("GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI")
+	if m != "" {
+		mebis, err := strconv.ParseUint(m, 10, 64)
+		if err != nil {
+			log.Fatalf(ctx, "could not parse GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI value %q", m) // A bad value here goes to log.Fatalf, not log.Errorf.
+		} else if mebis == 0 {
+			log.Fatalf(ctx, "bad value for GO_DISCOVERY_MAX_IN_FLIGHT_ZIP_MI: %d. Must be >= 1.", mebis)
+		} else {
+			log.Infof(ctx, "shedding load over %dMi", mebis)
+			zipLoadShedder.maxSizeInFlight = mebis * mib // Stored in bytes.
+		}
+	}
+}
+
+// ZipLoadShedStats returns a snapshot of the current LoadShedStats for zip files, taken from zipLoadShedder.
+func ZipLoadShedStats() LoadShedStats {
+	return zipLoadShedder.stats()
+}
diff --git a/internal/fetch/load_test.go b/internal/fetch/load_test.go
new file mode 100644
index 0000000..94f90d2
--- /dev/null
+++ b/internal/fetch/load_test.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fetch
+
+import (
+ "archive/zip"
+ "bytes"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "golang.org/x/pkgsite/internal/testing/testhelper"
+)
+
+func TestMatchingFiles(t *testing.T) { // Checks which zipped files matchingFiles keeps for a given GOOS/GOARCH.
+	plainGoBody := `
+		package plain
+		type Value int`
+	jsGoBody := `
+		// +build js,wasm
+
+		// Package js only works with wasm.
+		package js
+		type Value int`
+
+	plainContents := map[string]string{
+		"README.md":      "THIS IS A README",
+		"LICENSE.md":     testhelper.MITLicense,
+		"plain/plain.go": plainGoBody,
+	}
+
+	jsContents := map[string]string{
+		"README.md":  "THIS IS A README",
+		"LICENSE.md": testhelper.MITLicense,
+		"js/js.go":   jsGoBody,
+	}
+	for _, test := range []struct {
+		name         string
+		goos, goarch string
+		contents     map[string]string
+		want         map[string][]byte // Expected result, keyed by base file name.
+	}{
+		{
+			name:     "plain-linux",
+			goos:     "linux",
+			goarch:   "amd64",
+			contents: plainContents,
+			want: map[string][]byte{
+				"plain.go": []byte(plainGoBody),
+			},
+		},
+		{
+			name:     "plain-js",
+			goos:     "js",
+			goarch:   "wasm",
+			contents: plainContents,
+			want: map[string][]byte{
+				"plain.go": []byte(plainGoBody),
+			},
+		},
+		{
+			name:     "wasm-linux",
+			goos:     "linux",
+			goarch:   "amd64",
+			contents: jsContents,
+			want:     map[string][]byte{}, // js.go is constrained to js,wasm, so nothing matches linux/amd64.
+		},
+		{
+			name:     "wasm-js",
+			goos:     "js",
+			goarch:   "wasm",
+			contents: jsContents,
+			want: map[string][]byte{
+				"js.go": []byte(jsGoBody),
+			},
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			data, err := testhelper.ZipContents(test.contents)
+			if err != nil {
+				t.Fatal(err)
+			}
+			r, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
+			if err != nil {
+				t.Fatal(err)
+			}
+			got, err := matchingFiles(test.goos, test.goarch, r.File) // Every zip entry is passed in, including the non-.go files.
+			if err != nil {
+				t.Fatal(err)
+			}
+			if diff := cmp.Diff(test.want, got); diff != "" {
+				t.Errorf("mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
diff --git a/internal/fetch/package.go b/internal/fetch/package.go
new file mode 100644
index 0000000..1b6bfa4
--- /dev/null
+++ b/internal/fetch/package.go
@@ -0,0 +1,250 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package fetch provides a way to fetch modules from a proxy.
+package fetch
+
+import (
+ "archive/zip"
+ "context"
+ "errors"
+ "fmt"
+ "net/http"
+ "path"
+ "runtime/debug"
+ "strings"
+
+ "go.opencensus.io/trace"
+ "golang.org/x/mod/module"
+ "golang.org/x/pkgsite/internal"
+ "golang.org/x/pkgsite/internal/derrors"
+ "golang.org/x/pkgsite/internal/fetch/dochtml"
+ "golang.org/x/pkgsite/internal/licenses"
+ "golang.org/x/pkgsite/internal/log"
+ "golang.org/x/pkgsite/internal/source"
+)
+
+// extractPackagesFromZip returns a slice of packages from the module zip r.
+// It matches against the given licenses to determine the subset of licenses
+// that applies to each package.
+// The second return value records a status for each package directory, including
+// "incomplete" ones that contained .go files but couldn't be processed due to
+// current limitations of this site. The limitations are:
+// * a maximum file size (MaxFileSize)
+// * the particular set of build contexts we consider (goEnvs)
+// * whether the import path is valid.
+func extractPackagesFromZip(ctx context.Context, modulePath, resolvedVersion string, r *zip.Reader, d *licenses.Detector, sourceInfo *source.Info) (_ []*internal.LegacyPackage, _ []*internal.PackageVersionState, err error) {
+	ctx, span := trace.StartSpan(ctx, "fetch.extractPackagesFromZip")
+	defer span.End()
+	defer func() {
+		if e := recover(); e != nil {
+			// The package processing code performs some sanity checks along the way.
+			// None of the panics should occur, but if they do, we want to log them and
+			// be able to find them. So, convert internal panics to internal errors here.
+			err = fmt.Errorf("internal panic: %v\n\n%s", e, debug.Stack())
+		}
+	}()
+
+	// The high-level approach is to split the processing of the zip file
+	// into two phases:
+	//
+	// 	1. loop over all files, looking at file metadata only
+	// 	2. process all files by reading their contents
+	//
+	// During phase 1, we populate the dirs map for each directory
+	// that contains at least one .go file.
+
+	var (
+		// modulePrefix is the "<module>@<resolvedVersion>/" prefix that all files
+		// are expected to have according to the zip archive layout specification
+		// at the bottom of https://golang.org/cmd/go/#hdr-Module_proxy_protocol.
+		modulePrefix = moduleVersionDir(modulePath, resolvedVersion) + "/"
+
+		// dirs is the set of directories with at least one .go file,
+		// to be populated during phase 1 and used during phase 2.
+		//
+		// The map key is the directory path, with the modulePrefix trimmed.
+		// The map value is a slice of all .go files, and no other files.
+		dirs = make(map[string][]*zip.File)
+
+		// modInfo contains all the module information a package in the module
+		// needs to render its documentation, to be populated during phase 1
+		// and used during phase 2.
+		modInfo = &dochtml.ModuleInfo{
+			ModulePath:      modulePath,
+			ResolvedVersion: resolvedVersion,
+			ModulePackages:  make(map[string]bool),
+		}
+
+		// incompleteDirs tracks directories for which we have incomplete
+		// information, due to a problem processing one of the go files contained
+		// therein. We use this so that a single unprocessable package does not
+		// prevent processing of other packages in the module.
+		incompleteDirs       = make(map[string]bool)
+		packageVersionStates = []*internal.PackageVersionState{}
+	)
+
+	// Phase 1.
+	// Loop over zip files preemptively and check for problems
+	// that can be detected by looking at metadata alone.
+	// We'll be looking at file contents starting with phase 2 only,
+	// only after we're sure this phase passed without errors.
+	for _, f := range r.File {
+		if f.Mode().IsDir() {
+			// While "go mod download" will never put a directory in a zip, anyone can serve their
+			// own zips. Example: go.felesatra.moe/binpack@v0.1.0.
+			// Directory entries are harmless, so we just ignore them.
+			continue
+		}
+		if !strings.HasPrefix(f.Name, modulePrefix) {
+			// Well-formed module zips have all files under modulePrefix.
+			return nil, nil, fmt.Errorf("expected file to have prefix %q; got = %q: %w",
+				modulePrefix, f.Name, errMalformedZip)
+		}
+		innerPath := path.Dir(f.Name[len(modulePrefix):])
+		if incompleteDirs[innerPath] {
+			// We already know this directory cannot be processed, so skip.
+			continue
+		}
+		importPath := path.Join(modulePath, innerPath)
+		if ignoredByGoTool(importPath) || isVendored(importPath) {
+			// File is in a directory we're not looking to process at this time, so skip it.
+			continue
+		}
+		if !strings.HasSuffix(f.Name, ".go") {
+			// We care about .go files only.
+			continue
+		}
+		// It's possible to have a Go package in a directory that does not result in a valid import path.
+		// That package cannot be imported, but that may be fine if it's a main package, intended to be built
+		// and run from that directory.
+		// Example:  https://github.com/postmannen/go-learning/blob/master/concurrency/01-sending%20numbers%20and%20receving%20numbers%20from%20a%20channel/main.go
+		// We're not set up to handle invalid import paths, so skip these packages.
+		if err := module.CheckImportPath(importPath); err != nil {
+			incompleteDirs[innerPath] = true
+			packageVersionStates = append(packageVersionStates, &internal.PackageVersionState{
+				ModulePath:  modulePath,
+				PackagePath: importPath,
+				Version:     resolvedVersion,
+				Status:      derrors.ToStatus(derrors.PackageBadImportPath),
+				Error:       err.Error(),
+			})
+			continue
+		}
+		if f.UncompressedSize64 > MaxFileSize {
+			incompleteDirs[innerPath] = true
+			status := derrors.ToStatus(derrors.PackageMaxFileSizeLimitExceeded)
+			err := fmt.Sprintf("Unable to process %s: file size %d exceeds max limit %d", // Note: a string that shadows the named error result in this branch.
+				f.Name, f.UncompressedSize64, MaxFileSize)
+			packageVersionStates = append(packageVersionStates, &internal.PackageVersionState{
+				ModulePath:  modulePath,
+				PackagePath: importPath,
+				Version:     resolvedVersion,
+				Status:      status,
+				Error:       err,
+			})
+			continue
+		}
+		dirs[innerPath] = append(dirs[innerPath], f)
+		if len(dirs) > maxPackagesPerModule {
+			return nil, nil, fmt.Errorf("%d packages found in %q; exceeds limit %d for maxPackagePerModule", len(dirs), modulePath, maxPackagesPerModule)
+		}
+	}
+	for pkgName := range dirs { // pkgName is the directory's inner path, not a Go package name.
+		modInfo.ModulePackages[path.Join(modulePath, pkgName)] = true
+	}
+
+	// Phase 2.
+	// If we got this far, the file metadata was okay.
+	// Start reading the file contents now to extract information
+	// about Go packages.
+	var pkgs []*internal.LegacyPackage
+	for innerPath, goFiles := range dirs {
+		if incompleteDirs[innerPath] {
+			// Something went wrong when processing this directory, so we skip.
+			log.Infof(ctx, "Skipping %q because it is incomplete", innerPath)
+			continue
+		}
+
+		var (
+			status error
+			errMsg string
+		)
+		pkg, err := loadPackage(ctx, goFiles, innerPath, sourceInfo, modInfo)
+		if bpe := (*BadPackageError)(nil); errors.As(err, &bpe) {
+			incompleteDirs[innerPath] = true
+			status = derrors.PackageInvalidContents
+			errMsg = err.Error()
+		} else if errors.Is(err, dochtml.ErrTooLarge) {
+			status = derrors.PackageDocumentationHTMLTooLarge
+			errMsg = err.Error()
+		} else if err != nil {
+			return nil, nil, fmt.Errorf("unexpected error loading package: %v", err)
+		}
+
+		var pkgPath string
+		if pkg == nil {
+			// No package.
+			if len(goFiles) > 0 {
+				// There were go files but no package was produced. NOTE(review): this also replaces a PackageInvalidContents status set above; confirm intended.
+				incompleteDirs[innerPath] = true
+				status = derrors.PackageBuildContextNotSupported
+			}
+			pkgPath = path.Join(modulePath, innerPath)
+		} else {
+			if d != nil { //  should only be nil for tests
+				isRedist, lics := d.PackageInfo(innerPath)
+				pkg.IsRedistributable = isRedist
+				for _, l := range lics {
+					pkg.Licenses = append(pkg.Licenses, l.Metadata)
+				}
+			}
+			pkgs = append(pkgs, pkg)
+			pkgPath = pkg.Path
+		}
+		code := http.StatusOK // Reported when no status error was recorded for this directory.
+		if status != nil {
+			code = derrors.ToStatus(status)
+		}
+		packageVersionStates = append(packageVersionStates, &internal.PackageVersionState{
+			ModulePath:  modulePath,
+			PackagePath: pkgPath,
+			Version:     resolvedVersion,
+			Status:      code,
+			Error:       errMsg,
+		})
+	}
+	if len(pkgs) == 0 {
+		return nil, packageVersionStates, errModuleContainsNoPackages // The states are still returned alongside the error.
+	}
+	return pkgs, packageVersionStates, nil
+}
+
+// ignoredByGoTool reports whether the given import path corresponds
+// to a directory that would be ignored by the go tool.
+//
+// The logic of the go tool for ignoring directories is documented at
+// https://golang.org/cmd/go/#hdr-Package_lists_and_patterns:
+//
+//	Directory and file names that begin with "." or "_" are ignored
+//	by the go tool, as are directories named "testdata".
+//
+func ignoredByGoTool(importPath string) bool {
+	for _, el := range strings.Split(importPath, "/") {
+		if strings.HasPrefix(el, ".") || strings.HasPrefix(el, "_") || el == "testdata" {
+			return true // One matching path element is enough to ignore the whole path.
+		}
+	}
+	return false
+}
+
+// isVendored reports whether the given import path corresponds
+// to a Go package that is inside a vendor directory.
+//
+// The logic for what is considered a vendor directory is documented at
+// https://golang.org/cmd/go/#hdr-Vendor_Directories.
+func isVendored(importPath string) bool {
+	return strings.HasPrefix(importPath, "vendor/") || // The path starts inside a top-level vendor directory,
+		strings.Contains(importPath, "/vendor/") // or passes through a vendor directory further down.
+}
diff --git a/internal/fetch/readme.go b/internal/fetch/readme.go
new file mode 100644
index 0000000..9b07828
--- /dev/null
+++ b/internal/fetch/readme.go
@@ -0,0 +1,50 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package fetch provides a way to fetch modules from a proxy.
+package fetch
+
+import (
+ "archive/zip"
+ "fmt"
+ "path"
+ "strings"
+
+ "golang.org/x/pkgsite/internal"
+)
+
+// extractReadmesFromZip returns the file path and contents of all files from r
+// that are README files, failing if any of them is larger than MaxFileSize.
+func extractReadmesFromZip(modulePath, resolvedVersion string, r *zip.Reader) ([]*internal.Readme, error) {
+	var readmes []*internal.Readme
+	for _, zipFile := range r.File {
+		if isReadme(zipFile.Name) {
+			if zipFile.UncompressedSize64 > MaxFileSize {
+				return nil, fmt.Errorf("file size %d exceeds max limit %d", zipFile.UncompressedSize64, MaxFileSize)
+			}
+			c, err := readZipFile(zipFile, MaxFileSize)
+			if err != nil {
+				return nil, err
+			}
+			readmes = append(readmes, &internal.Readme{
+				Filepath: strings.TrimPrefix(zipFile.Name, moduleVersionDir(modulePath, resolvedVersion)+"/"), // Strip the leading module-version directory.
+				Contents: string(c),
+			})
+
+		}
+	}
+	return readmes, nil // nil when the zip holds no README files.
+}
+
+var excludedReadmeExts = map[string]bool{".go": true, ".vendor": true} // Extensions that disqualify a README-named file; matched case-sensitively.
+
+// isReadme reports whether the base name of file, with its final extension (if
+// any) removed, equals "README" ignoring case. Files whose extension is in
+// excludedReadmeExts, such as README.go, return false. It operates on '/'-separated paths.
+func isReadme(file string) bool {
+	const expectedFile = "README"
+	base := path.Base(file)
+	ext := path.Ext(base)
+	return !excludedReadmeExts[ext] && strings.EqualFold(strings.TrimSuffix(base, ext), expectedFile)
+}
diff --git a/internal/fetch/readme_test.go b/internal/fetch/readme_test.go
new file mode 100644
index 0000000..8522378
--- /dev/null
+++ b/internal/fetch/readme_test.go
@@ -0,0 +1,167 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fetch
+
+import (
+ "archive/zip"
+ "context"
+ "sort"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "golang.org/x/pkgsite/internal"
+ "golang.org/x/pkgsite/internal/proxy"
+ "golang.org/x/pkgsite/internal/stdlib"
+)
+
+func TestExtractReadmesFromZip(t *testing.T) { // Checks the READMEs found in a stdlib zip and in proxy-served module zips.
+	stdlib.UseTestData = true // Presumably makes stdlib.Zip use bundled test data; confirm in the stdlib package.
+
+	ctx, cancel := context.WithTimeout(context.Background(), testTimeout)
+	defer cancel()
+
+	sortReadmes := func(readmes []*internal.Readme) { // Orders by Filepath so the comparison ignores zip entry order.
+		sort.Slice(readmes, func(i, j int) bool {
+			return readmes[i].Filepath < readmes[j].Filepath
+		})
+	}
+
+	for _, test := range []struct {
+		modulePath, version string
+		files               map[string]string
+		want                []*internal.Readme
+	}{
+		{
+			modulePath: stdlib.ModulePath,
+			version:    "v1.12.5",
+			want: []*internal.Readme{
+				{
+					Filepath: "README.md",
+					Contents: "# The Go Programming Language\n",
+				},
+				{
+					Filepath: "cmd/pprof/README",
+					Contents: "This directory is the copy of Google's pprof shipped as part of the Go distribution.\n",
+				},
+			},
+		},
+		{
+			modulePath: "github.com/my/module",
+			version:    "v1.0.0",
+			files: map[string]string{
+				"README.md":  "README FILE FOR TESTING.",
+				"foo/README": "Another README",
+			},
+			want: []*internal.Readme{
+				{
+					Filepath: "README.md",
+					Contents: "README FILE FOR TESTING.",
+				},
+				{
+					Filepath: "foo/README",
+					Contents: "Another README",
+				},
+			},
+		},
+		{
+			modulePath: "emp.ty/module",
+			version:    "v1.0.0",
+			files:      map[string]string{}, // No files, so want stays nil.
+		},
+	} {
+		t.Run(test.modulePath, func(t *testing.T) {
+			var (
+				reader *zip.Reader
+				err    error
+			)
+			if test.modulePath == stdlib.ModulePath {
+				reader, _, err = stdlib.Zip(test.version)
+				if err != nil {
+					t.Fatal(err)
+				}
+			} else {
+				proxyClient, teardownProxy := proxy.SetupTestClient(t, []*proxy.Module{
+					{ModulePath: test.modulePath, Files: test.files}})
+				defer teardownProxy()
+				reader, err = proxyClient.GetZip(ctx, test.modulePath, "v1.0.0") // NOTE(review): hard-coded; equals test.version for every non-stdlib case above.
+				if err != nil {
+					t.Fatal(err)
+				}
+			}
+
+			got, err := extractReadmesFromZip(test.modulePath, test.version, reader)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			sortReadmes(test.want)
+			sortReadmes(got)
+			if diff := cmp.Diff(test.want, got); diff != "" {
+				t.Errorf("mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestIsReadme(t *testing.T) { // Table-driven check of isReadme's name, case and extension handling.
+	for _, test := range []struct {
+		name, file string
+		want       bool
+	}{
+		{
+			name: "README in nested dir returns true",
+			file: "github.com/my/module@v1.0.0/README.md",
+			want: true,
+		},
+		{
+			name: "case insensitive",
+			file: "rEaDme",
+			want: true,
+		},
+		{
+			name: "random extension returns true",
+			file: "README.FOO",
+			want: true,
+		},
+		{
+			name: "{prefix}readme will return false",
+			file: "FOO_README",
+			want: false,
+		},
+		{
+			file: "README_FOO",
+			name: "readme{suffix} will return false",
+			want: false,
+		},
+		{
+			file: "README.FOO.FOO", // Only the last extension is stripped, leaving README.FOO.
+			name: "README file with multiple extensions will return false",
+			want: false,
+		},
+		{
+			file: "readme.go",
+			name: ".go README file will return false",
+			want: false,
+		},
+		{
+			file: "readme.vendor",
+			name: ".vendor README file will return false",
+			want: false,
+		},
+		{
+			file: "",
+			name: "empty filename returns false",
+			want: false,
+		},
+	} {
+		{
+			t.Run(test.file, func(t *testing.T) { // Subtests are keyed by file, not by the descriptive name field.
+				if got := isReadme(test.file); got != test.want {
+					t.Errorf("isReadme(%q) = %t: %t", test.file, got, test.want)
+				}
+			})
+		}
+	}
+}