sumdb/dirhash: directory tree hash algorithm

Copied from cmd/go/internal/dirhash, with additional doc comments.

For golang/go#31761.

Change-Id: Id56c1bbb6f27e69194f383d49b9def6876f948be
Reviewed-on: https://go-review.googlesource.com/c/mod/+/176464
Reviewed-by: Hyang-Ah Hana Kim <hyangah@gmail.com>
diff --git a/sumdb/dirhash/hash.go b/sumdb/dirhash/hash.go
new file mode 100644
index 0000000..ef5df6f
--- /dev/null
+++ b/sumdb/dirhash/hash.go
@@ -0,0 +1,132 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package dirhash defines hashes over directory trees.
+// These hashes are recorded in go.sum files and in the Go checksum database,
+// to allow verifying that a newly-downloaded module has the expected content.
+package dirhash
+
+import (
+	"archive/zip"
+	"crypto/sha256"
+	"encoding/base64"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// DefaultHash is the default hash function used in new go.sum entries.
+var DefaultHash Hash = Hash1 // currently the "h1:" SHA-256 based hash
+
+// A Hash is a directory hash function.
+// It accepts a list of file names along with a function that opens the content of a named file.
+// It opens, reads, hashes, and closes each file and returns the overall directory hash.
+type Hash func(files []string, open func(string) (io.ReadCloser, error)) (string, error)
+
+// Hash1 is the "h1:" directory hash function, using SHA-256.
+//
+// The result is "h1:" followed by the base64-encoded SHA-256 hash of a
+// summary prepared as if by the Unix command:
+//
+//	sha256sum $(find . -type f | sort) | sha256sum
+//
+// That is, the summary holds one line per listed file, ordered by
+// sort.Strings on the file names; each line is the hexadecimal SHA-256
+// hash of the file content, two spaces (U+0020), the file name, and a
+// newline (U+000A). File names containing newlines are rejected, and the
+// caller's files slice is left unmodified.
+func Hash1(files []string, open func(string) (io.ReadCloser, error)) (string, error) {
+	sorted := make([]string, len(files))
+	copy(sorted, files)
+	sort.Strings(sorted)
+	summary := sha256.New()
+	for _, name := range sorted {
+		if strings.ContainsRune(name, '\n') {
+			return "", errors.New("dirhash: filenames with newlines are not supported")
+		}
+		rc, err := open(name)
+		if err != nil {
+			return "", err
+		}
+		content := sha256.New()
+		_, copyErr := io.Copy(content, rc)
+		rc.Close()
+		if copyErr != nil {
+			return "", copyErr
+		}
+		fmt.Fprintf(summary, "%x  %s\n", content.Sum(nil), name)
+	}
+	return "h1:" + base64.StdEncoding.EncodeToString(summary.Sum(nil)), nil
+}
+
+// HashDir computes the hash of the local file system directory dir.
+// In the file names passed to the hash function, the directory name
+// itself is replaced with prefix.
+func HashDir(dir, prefix string, hash Hash) (string, error) {
+	names, err := DirFiles(dir, prefix)
+	if err != nil {
+		return "", err
+	}
+	// Map each prefixed name back to its location under dir before opening it.
+	return hash(names, func(name string) (io.ReadCloser, error) {
+		return os.Open(filepath.Join(dir, strings.TrimPrefix(name, prefix)))
+	})
+}
+
+// DirFiles returns the list of files in the tree rooted at dir,
+// replacing the directory name dir with prefix in each name.
+// The resulting names always use forward slashes.
+// It returns an error, rather than panicking, if dir is not a directory.
+func DirFiles(dir, prefix string) ([]string, error) {
+	var files []string
+	dir = filepath.Clean(dir)
+	err := filepath.Walk(dir, func(file string, info os.FileInfo, err error) error {
+		if err != nil || info.IsDir() {
+			return err // the walk error, or nil to descend into a directory
+		}
+		if file == dir {
+			return fmt.Errorf("%s is not a directory", dir)
+		}
+		rel, err := filepath.Rel(dir, file) // also correct when dir is "." or a root
+		if err != nil {
+			return err
+		}
+		files = append(files, filepath.ToSlash(filepath.Join(prefix, rel)))
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+	return files, nil
+}
+
+// HashZip opens the zip file named zipfile and returns the hash of its entries.
+// The hash covers only the entry names and their uncompressed contents;
+// the zip encoding details, compression method, per-file modification
+// times, and other metadata do not affect the result.
+func HashZip(zipfile string, hash Hash) (string, error) {
+	rc, err := zip.OpenReader(zipfile)
+	if err != nil {
+		return "", err
+	}
+	defer rc.Close()
+	var names []string
+	byName := make(map[string]*zip.File)
+	for _, entry := range rc.File {
+		names = append(names, entry.Name)
+		byName[entry.Name] = entry
+	}
+	// Entries are opened on demand, when the hash function asks for them by name.
+	return hash(names, func(name string) (io.ReadCloser, error) {
+		entry := byName[name]
+		if entry == nil {
+			return nil, fmt.Errorf("file %q not found in zip", name) // should never happen
+		}
+		return entry.Open()
+	})
+}
diff --git a/sumdb/dirhash/hash_test.go b/sumdb/dirhash/hash_test.go
new file mode 100644
index 0000000..ed463c1
--- /dev/null
+++ b/sumdb/dirhash/hash_test.go
@@ -0,0 +1,135 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package dirhash
+
+import (
+	"archive/zip"
+	"crypto/sha256"
+	"encoding/base64"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func h(s string) string {
+	return fmt.Sprintf("%x", sha256.Sum256([]byte(s))) // lowercase hex SHA-256 digest of s
+}
+
+func htop(k string, s string) string {
+	sum := sha256.Sum256([]byte(s))
+	return k + ":" + base64.StdEncoding.EncodeToString(sum[:]) // k, a colon, then base64 of SHA-256(s)
+}
+
+// TestHash1 checks that Hash1 sorts names before hashing and rejects newlines.
+func TestHash1(t *testing.T) {
+	open := func(name string) (io.ReadCloser, error) {
+		return ioutil.NopCloser(strings.NewReader("data for " + name)), nil
+	}
+	// Names are supplied unsorted; the expected summary lists them sorted.
+	summary := h("data for abc") + "  abc\n" + h("data for xyz") + "  xyz\n"
+	want := htop("h1", summary)
+	out, err := Hash1([]string{"xyz", "abc"}, open)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if out != want {
+		t.Errorf("Hash1(...) = %s, want %s", out, want)
+	}
+	if _, err := Hash1([]string{"xyz", "a\nbc"}, open); err == nil {
+		t.Error("Hash1: expected error on newline in filenames")
+	}
+}
+
+// TestHashDir hashes a two-file temporary directory and checks the prefixed names.
+func TestHashDir(t *testing.T) {
+	dir, err := ioutil.TempDir("", "dirhash-test-")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(dir)
+	for _, name := range []string{"xyz", "abc"} {
+		if err := ioutil.WriteFile(filepath.Join(dir, name), []byte("data for "+name), 0666); err != nil {
+			t.Fatal(err)
+		}
+	}
+	summary := h("data for abc") + "  prefix/abc\n" + h("data for xyz") + "  prefix/xyz\n"
+	out, err := HashDir(dir, "prefix", Hash1)
+	if err != nil {
+		t.Fatalf("HashDir: %v", err)
+	}
+	if want := htop("h1", summary); out != want {
+		t.Errorf("HashDir(...) = %s, want %s", out, want)
+	}
+}
+
+// TestHashZip writes a two-entry zip file and checks the h1 hash HashZip reports for it.
+func TestHashZip(t *testing.T) {
+	f, err := ioutil.TempFile("", "dirhash-test-")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.Remove(f.Name())
+	defer f.Close()
+
+	z := zip.NewWriter(f)
+	for _, name := range []string{"xyz", "abc"} {
+		w, err := z.Create("prefix/" + name)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if _, err := w.Write([]byte("data for " + name)); err != nil {
+			t.Fatal(err)
+		}
+	}
+	if err := z.Close(); err != nil {
+		t.Fatal(err)
+	}
+	if err := f.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	want := htop("h1", fmt.Sprintf("%s  %s\n%s  %s\n", h("data for abc"), "prefix/abc", h("data for xyz"), "prefix/xyz"))
+	out, err := HashZip(f.Name(), Hash1)
+	if err != nil {
+		t.Fatalf("HashZip: %v", err)
+	}
+	if out != want {
+		t.Errorf("HashZip(...) = %s, want %s", out, want)
+	}
+}
+
+// TestDirFiles checks that DirFiles reports every regular file in the tree,
+// including files in subdirectories, under the replacement prefix.
+func TestDirFiles(t *testing.T) {
+	dir, err := ioutil.TempDir("", "dirfiles-test-")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(dir)
+	if err := os.Mkdir(filepath.Join(dir, "subdir"), 0777); err != nil {
+		t.Fatal(err)
+	}
+	for _, name := range []string{"xyz", "abc", "subdir/xyz"} {
+		path := filepath.Join(dir, filepath.FromSlash(name))
+		if err := ioutil.WriteFile(path, []byte("data for "+name), 0666); err != nil {
+			t.Fatal(err)
+		}
+	}
+	prefix := "foo/bar@v2.3.4"
+	out, err := DirFiles(dir, prefix)
+	if err != nil {
+		t.Fatalf("DirFiles: %v", err)
+	}
+	// filepath.Walk visits names in lexical order, so the result order is fixed.
+	// Comparing whole lists also catches a missing or unexpected entry.
+	want := []string{prefix + "/abc", prefix + "/subdir/xyz", prefix + "/xyz"}
+	if strings.Join(out, "\n") != strings.Join(want, "\n") {
+		t.Errorf("DirFiles(...) = %q, want %q", out, want)
+	}
+}