internal/fetch: use FS for large forks

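Replace ZipSignature, which took a *zip.Reader and a filename prefix, with FSSignature, which takes an fs.FS. Callers now use fs.Sub to select the module@version directory before computing the signature.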

For golang/go#47834

Change-Id: Ie6cc6053a21260b3e63a1b47359a95731d1be63b
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/343951
Trust: Jonathan Amsterdam <jba@google.com>
Run-TryBot: Jonathan Amsterdam <jba@google.com>
Reviewed-by: Julie Qiu <julie@golang.org>
diff --git a/internal/fetch/fetch.go b/internal/fetch/fetch.go
index 91806f7..b691e31 100644
--- a/internal/fetch/fetch.go
+++ b/internal/fetch/fetch.go
@@ -10,6 +10,7 @@
 	"context"
 	"errors"
 	"fmt"
+	"io/fs"
 	"net/http"
 	"path"
 	"sort"
@@ -254,7 +255,11 @@
 	// see if this is a fork. The intent is to avoid processing certain known
 	// large modules, not to find every fork.
 	if !fr.HasGoMod {
-		forkedModule, err := forkedFrom(zipReader, fr.ModulePath, fr.ResolvedVersion)
+		contentsDir, err := fs.Sub(zipReader, fr.ModulePath+"@"+fr.ResolvedVersion)
+		if err != nil {
+			return fi, err
+		}
+		forkedModule, err := forkedFrom(contentsDir, fr.ModulePath, fr.ResolvedVersion)
 		if err != nil {
 			return fi, err
 		}
diff --git a/internal/fetch/gen_zip_signatures.go b/internal/fetch/gen_zip_signatures.go
index 904abb6..6ee1553 100644
--- a/internal/fetch/gen_zip_signatures.go
+++ b/internal/fetch/gen_zip_signatures.go
@@ -18,6 +18,7 @@
 	"flag"
 	"fmt"
 	"go/format"
+	"io/fs"
 	"io/ioutil"
 	"log"
 	"sort"
@@ -141,7 +142,11 @@
 	if err != nil {
 		return "", err
 	}
-	sig, err := fetch.ZipSignature(zr, mv.String())
+	contentsDir, err := fs.Sub(zr, mv.String())
+	if err != nil {
+		return "", err
+	}
+	sig, err := fetch.FSSignature(contentsDir)
 	if err != nil {
 		return "", err
 	}
diff --git a/internal/fetch/largefork.go b/internal/fetch/largefork.go
index ca766c0..d632a4b 100644
--- a/internal/fetch/largefork.go
+++ b/internal/fetch/largefork.go
@@ -9,29 +9,43 @@
 //go:generate go run gen_zip_signatures.go -v
 
 import (
-	"archive/zip"
 	"crypto/sha256"
+	"errors"
 	"fmt"
 	"io"
+	"io/fs"
 	"sort"
-	"strings"
 )
 
-// ZipSignature calculates a signature that uniquely identifies a zip file.
-// It hashes every filename and its contents. Filenames must begin with prefix,
-// which is not included in the hash.
-func ZipSignature(r *zip.Reader, prefix string) (string, error) {
-	files := make([]*zip.File, len(r.File))
-	copy(files, r.File)
-	sort.Slice(files, func(i, j int) bool { return files[i].Name < files[j].Name })
-	h := sha256.New()
-	for _, f := range files {
-		if !strings.HasPrefix(f.Name, prefix) {
-			return "", fmt.Errorf("zip file %q does not have prefix %q", f.Name, prefix)
+// FSSignature calculates a signature that uniquely identifies a filesystem.
+// It hashes every filename and its contents.
+func FSSignature(fsys fs.FS) (string, error) {
+	// To match the behavior of the old ZipSignature function that this is
+	// based on, sort the paths from fs.WalkDir. Although fs.WalkDir traverses
+	// the files in lexical order within each directory, that is not the same
+	// order as sorting all the paths. For example, fs.WalkDir will return
+	// ["a/b", "a#b"], but because '#' comes before '/', sorting the paths
+	// swaps them.
+	var paths []string
+	err := fs.WalkDir(fsys, ".", func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
 		}
-		io.WriteString(h, f.Name[len(prefix):])
+		if !d.IsDir() {
+			paths = append(paths, path)
+		}
+		return nil
+	})
+	if err != nil && !errors.Is(err, fs.ErrNotExist) { // we can get NotExist on an empty FS
+		return "", err
+	}
+	sort.Strings(paths)
+
+	h := sha256.New()
+	for _, path := range paths {
+		io.WriteString(h, "/"+path) // slash needed to match ZipSignature
 		h.Write([]byte{0})
-		rc, err := f.Open()
+		rc, err := fsys.Open(path)
 		if err != nil {
 			return "", err
 		}
@@ -42,11 +56,11 @@
 }
 
 // forkedFrom returns a module that the current one has been forked from. It
-// consults a built-in list of modules and their zip signatures, and returns a
-// module path from that list if its zip file and version are identical to the
+// consults a built-in list of modules and their signatures, and returns a
+// module path from that list if its contents and version are identical to the
 // given ones. If there is no matching module, it returns the empty string.
-func forkedFrom(z *zip.Reader, module, version string) (string, error) {
-	sig, err := ZipSignature(z, module+"@"+version)
+func forkedFrom(moduleContents fs.FS, module, version string) (string, error) {
+	sig, err := FSSignature(moduleContents)
 	if err != nil {
 		return "", err
 	}
diff --git a/internal/fetch/largefork_test.go b/internal/fetch/largefork_test.go
index 436ee66..ac5fbf5 100644
--- a/internal/fetch/largefork_test.go
+++ b/internal/fetch/largefork_test.go
@@ -11,36 +11,36 @@
 	"testing"
 )
 
-func TestZipSignature(t *testing.T) {
+func TestFSSignature(t *testing.T) {
 	zip1 := newzip(t, [][2]string{
-		{"p/file1", "abc"},
-		{"p/file2", "def"},
+		{"file1", "abc"},
+		{"file2", "def"},
 	})
 	zip2 := newzip(t, [][2]string{ // same files, different order, different prefix
-		{"q/file2", "def"},
-		{"q/file1", "abc"},
+		{"file2", "def"},
+		{"file1", "abc"},
 	})
 	zip3 := newzip(t, [][2]string{ // different files
-		{"r/file1", "abc"},
-		{"r/file2d", "ef"},
+		{"file1", "abc"},
+		{"file2d", "ef"},
 	})
 
-	sig := func(z []byte, prefix string) string {
+	sig := func(z []byte) string {
 		r, err := zip.NewReader(bytes.NewReader(z), int64(len(z)))
 		if err != nil {
 			t.Fatal(err)
 		}
-		s, err := ZipSignature(r, prefix)
+		s, err := FSSignature(r)
 		if err != nil {
 			t.Fatal(err)
 		}
 		return s
 	}
 
-	if sig(zip1, "p") != sig(zip2, "q") {
+	if sig(zip1) != sig(zip2) {
 		t.Error("same files, different order: got different signatures, want same")
 	}
-	if sig(zip1, "p") == sig(zip3, "r") {
+	if sig(zip1) == sig(zip3) {
 		t.Error("different files: got same signatures, wanted different")
 	}
 }