crfs/stargz: add start of package

Basic API, format, tests.

Good enough checkpoint.

Updates golang/go#30829

Change-Id: Iaec5b205314d64fca5056f6b19a7bae52e5cef94
Reviewed-on: https://go-review.googlesource.com/c/build/+/167769
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/crfs/stargz/stargz.go b/crfs/stargz/stargz.go
new file mode 100644
index 0000000..dd6d143
--- /dev/null
+++ b/crfs/stargz/stargz.go
@@ -0,0 +1,345 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The stargz package reads & writes tar.gz ("tarball") files in a
+// seekable, indexed format called "stargz". A stargz file is still a
+// valid tarball, but it's slightly bigger with new gzip streams for
+// each new file & throughout large files, and has an index in a magic
+// file at the end.
+package stargz
+
+import (
+	"archive/tar"
+	"bufio"
+	"bytes"
+	"compress/gzip"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"strconv"
+	"time"
+)
+
+// TOCTarName is the name of the JSON file in the tar archive in the
+// table of contents gzip stream. It is written by Writer.Close as the
+// archive's final tar entry and located by Open via the offset
+// recorded in the footer.
+const TOCTarName = "stargz.index.json"
+
+// FooterSize is the number of bytes in the stargz footer.
+//
+// The footer is an empty gzip stream with no compression and an Extra
+// header of the form "%016xSTARGZ", where the 64 bit hex-encoded
+// number is the offset to the gzip stream of JSON TOC.
+//
+// 47 comes from:
+//
+//   10 byte gzip header +
+//   2 byte (LE16) length of extra, encoding 22 (16 hex digits + len("STARGZ")) == "\x16\x00" +
+//   22 bytes of extra (fmt.Sprintf("%016xSTARGZ", tocGzipOffset))
+//   5 byte flate header
+//   8 byte gzip footer (two little endian uint32s: digest, size)
+const FooterSize = 47
+
+// A Reader permits random access reads from a stargz file.
+//
+// Use Open to create a Reader; Open parses the footer and TOC at the
+// end of the file and populates the TOC field.
+type Reader struct {
+	sr  *io.SectionReader // the underlying stargz file
+	TOC *TOC              // table of contents, decoded by Open
+}
+
+// Open opens a stargz file for reading.
+//
+// It reads the fixed-size footer at the end of sr to find the offset
+// of the TOC gzip stream, then decodes the TOC JSON from that stream.
+// Open does not read any file contents.
+func Open(sr *io.SectionReader) (*Reader, error) {
+	if sr.Size() < FooterSize {
+		return nil, fmt.Errorf("stargz size %d is smaller than the stargz footer size", sr.Size())
+	}
+	// TODO: read a bigger chunk (1MB?) at once here to hopefully
+	// get the TOC + footer in one go.
+	var footer [FooterSize]byte
+	if _, err := sr.ReadAt(footer[:], sr.Size()-FooterSize); err != nil {
+		return nil, fmt.Errorf("error reading footer: %v", err)
+	}
+	tocOff, ok := parseFooter(footer[:])
+	if !ok {
+		return nil, fmt.Errorf("error parsing footer")
+	}
+	// The TOC targz occupies everything between tocOff and the footer.
+	tocTargz := make([]byte, sr.Size()-tocOff-FooterSize)
+	if _, err := sr.ReadAt(tocTargz, tocOff); err != nil {
+		return nil, fmt.Errorf("error reading %d byte TOC targz: %v", len(tocTargz), err)
+	}
+	zr, err := gzip.NewReader(bytes.NewReader(tocTargz))
+	if err != nil {
+		return nil, fmt.Errorf("malformed TOC gzip header: %v", err)
+	}
+	// The TOC is its own gzip stream; don't read past it into the
+	// footer's (empty) stream.
+	zr.Multistream(false)
+	tr := tar.NewReader(zr)
+	h, err := tr.Next()
+	if err != nil {
+		return nil, fmt.Errorf("failed to find tar header in TOC gzip stream: %v", err)
+	}
+	if h.Name != TOCTarName {
+		return nil, fmt.Errorf("TOC tar entry had name %q; expected %q", h.Name, TOCTarName)
+	}
+	toc := new(TOC)
+	if err := json.NewDecoder(tr).Decode(&toc); err != nil {
+		return nil, fmt.Errorf("error decoding TOC JSON: %v", err)
+	}
+	return &Reader{sr: sr, TOC: toc}, nil
+}
+
+// TOCEntry is an entry in the stargz file's TOC (Table of Contents).
+//
+// Entries are produced by Writer.AppendTar from the headers of the
+// input tar.
+type TOCEntry struct {
+	Offset   int64  `json:"offset,omitempty"` // offset to gzip stream of tar entry (for regular files only)
+	Name     string `json:"name"`
+	Type     string `json:"type"` // "dir", "reg", or "symlink"
+	Size     int64  `json:"size,omitempty"`
+	LinkName string `json:"linkName,omitempty"`  // for symlinks
+	Mode     int64  `json:"mode,omitempty"`      // Permission and mode bits
+	Uid      int    `json:"uid,omitempty"`       // User ID of owner
+	Gid      int    `json:"gid,omitempty"`       // Group ID of owner
+	Uname    string `json:"userName,omitempty"`  // User name of owner
+	Gname    string `json:"groupName,omitempty"` // Group name of owner
+	ModTime  string `json:"modtime,omitempty"`   // RFC 3339 (optionally with nanoseconds); "" for zero/epoch times (see formatModtime)
+
+	// ChunkOffset is non-zero if this is a chunk of a large,
+	// regular file. If so, the Offset is where the gzip header of
+	// ChunkSize bytes at ChunkOffset in Name begin. If both
+	// ChunkOffset and ChunkSize are zero, the file contents are
+	// completely represented at the tar gzip stream starting at
+	// Offset.
+	ChunkOffset int64 `json:"chunkOffset,omitempty"`
+	ChunkSize   int64 `json:"chunkSize,omitempty"`
+}
+
+// TOC is the table of contents index of the files in the stargz file.
+//
+// It is serialized as JSON into the tar entry named TOCTarName.
+type TOC struct {
+	Version int        `json:"version"` // currently 1 (set by NewWriter)
+	Entries []TOCEntry `json:"entries"`
+}
+
+// A Writer writes stargz files.
+//
+// Use NewWriter to create a new Writer.
+type Writer struct {
+	bw  *bufio.Writer // buffers writes to the user-supplied writer
+	cw  *countWriter  // wraps bw; tracks the absolute output offset for TOC/footer offsets
+	toc *TOC          // accumulated table of contents, written on Close
+
+	closed bool         // set once Close has run
+	gz     *gzip.Writer // current output gzip stream; nil between streams
+}
+
+// NewWriter returns a new stargz writer writing to w.
+//
+// The writer must be closed to write its trailing table of contents.
+func NewWriter(w io.Writer) *Writer {
+	bw := bufio.NewWriter(w)
+	// cw wraps bw so the writer can record absolute byte offsets
+	// for the TOC entries and the footer.
+	cw := &countWriter{w: bw}
+	return &Writer{
+		bw:  bw,
+		cw:  cw,
+		toc: &TOC{Version: 1},
+	}
+}
+
+// Close writes the stargz's table of contents and flushes all the
+// buffers, returning any error.
+//
+// Calling Close again after it has run once returns nil.
+func (w *Writer) Close() error {
+	if w.closed {
+		return nil
+	}
+	defer func() { w.closed = true }()
+
+	// Finish the gzip stream of the last appended entries, if any,
+	// so w.cw.n reflects the true TOC offset below.
+	if err := w.closeGz(); err != nil {
+		return err
+	}
+
+	// Write the TOC index.
+	tocOff := w.cw.n
+	// Error ignored: BestCompression is a valid level, so
+	// gzip.NewWriterLevel cannot fail here.
+	w.gz, _ = gzip.NewWriterLevel(w.cw, gzip.BestCompression)
+	tw := tar.NewWriter(w.gz)
+	tocJSON, err := json.MarshalIndent(w.toc, "", "\t")
+	if err != nil {
+		return err
+	}
+	if err := tw.WriteHeader(&tar.Header{
+		Typeflag: tar.TypeReg,
+		Name:     TOCTarName,
+		Size:     int64(len(tocJSON)),
+	}); err != nil {
+		return err
+	}
+	if _, err := tw.Write(tocJSON); err != nil {
+		return err
+	}
+
+	if err := tw.Close(); err != nil {
+		return err
+	}
+	if err := w.closeGz(); err != nil {
+		return err
+	}
+
+	// And a little footer with pointer to the TOC gzip stream.
+	if _, err := w.bw.Write(footerBytes(tocOff)); err != nil {
+		return err
+	}
+
+	if err := w.bw.Flush(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// closeGz finishes the current gzip stream, if any, flushing its
+// buffered compressed bytes and trailer to the underlying writer and
+// clearing w.gz. It errors if the Writer has already been closed.
+func (w *Writer) closeGz() error {
+	if w.closed {
+		// NOTE(review): the message says "write" although this
+		// guards a close operation; consider rewording.
+		return errors.New("write on closed Writer")
+	}
+	if w.gz != nil {
+		if err := w.gz.Close(); err != nil {
+			return err
+		}
+		w.gz = nil
+	}
+	return nil
+}
+
+// AppendTar reads the tar or tar.gz file from r and appends
+// each of its contents to w.
+//
+// The input r can optionally be gzip compressed but the output will
+// always be gzip compressed.
+//
+// Each regular file begins a fresh gzip stream in the output, and its
+// TOCEntry.Offset records where that stream starts. Hardlinks are not
+// yet supported and cause an error.
+func (w *Writer) AppendTar(r io.Reader) error {
+	br := bufio.NewReader(r)
+	var tr *tar.Reader
+	if isGzip(br) {
+		// NewReader can't fail if isGzip returned true.
+		zr, _ := gzip.NewReader(br)
+		tr = tar.NewReader(zr)
+	} else {
+		tr = tar.NewReader(br)
+	}
+	for {
+		h, err := tr.Next()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return fmt.Errorf("error reading from source tar: tar.Reader.Next: %v", err)
+		}
+		ent := TOCEntry{
+			Name:    h.Name,
+			Mode:    h.Mode,
+			Uid:     h.Uid,
+			Gid:     h.Gid,
+			Uname:   h.Uname,
+			Gname:   h.Gname,
+			ModTime: formatModtime(h.ModTime),
+		}
+		switch h.Typeflag {
+		case tar.TypeLink:
+			return fmt.Errorf("TODO: unsupported hardlink %q => %q", h.Name, h.Linkname)
+		case tar.TypeSymlink:
+			ent.Type = "symlink"
+			ent.LinkName = h.Linkname
+		case tar.TypeDir:
+			ent.Type = "dir"
+		case tar.TypeReg:
+			ent.Type = "reg"
+			ent.Size = h.Size
+
+			// Start a new gzip stream for regular files. Close the
+			// current stream FIRST: gz.Close flushes its buffered
+			// compressed bytes and writes the 8-byte gzip trailer
+			// to w.cw, so only after closeGz is w.cw.n the true
+			// offset at which this file's gzip stream will begin.
+			// (Recording the offset before closing pointed into
+			// the middle of the previous stream.)
+			if err := w.closeGz(); err != nil {
+				return err
+			}
+			ent.Offset = w.cw.n
+		default:
+			return fmt.Errorf("unsupported input tar entry %q", h.Typeflag)
+		}
+		w.toc.Entries = append(w.toc.Entries, ent)
+		if w.gz == nil {
+			w.gz, err = gzip.NewWriterLevel(w.cw, gzip.BestCompression)
+			if err != nil {
+				return err
+			}
+		}
+		// A fresh tar.Writer per entry; Flush (not Close) is used so
+		// no tar end-of-archive marker is emitted between entries.
+		tw := tar.NewWriter(w.gz)
+		if err := tw.WriteHeader(h); err != nil {
+			return err
+		}
+		if _, err := io.Copy(tw, tr); err != nil {
+			return err
+		}
+		if err := tw.Flush(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// footerBytes returns the 47 byte footer: an empty, uncompressed gzip
+// stream whose Extra header field encodes tocOff as
+// fmt.Sprintf("%016xSTARGZ", tocOff). See the FooterSize doc for the
+// byte-level breakdown.
+func footerBytes(tocOff int64) []byte {
+	buf := bytes.NewBuffer(make([]byte, 0, FooterSize))
+	// Error ignored: NoCompression is a valid level.
+	gz, _ := gzip.NewWriterLevel(buf, gzip.NoCompression)
+	gz.Header.Extra = []byte(fmt.Sprintf("%016xSTARGZ", tocOff))
+	gz.Close()
+	// The footer must be exactly FooterSize bytes so readers can
+	// find it at a fixed distance from the end of the file.
+	if buf.Len() != FooterSize {
+		panic(fmt.Sprintf("footer buffer = %d, not %d", buf.Len(), FooterSize))
+	}
+	return buf.Bytes()
+}
+
+// parseFooter parses the FooterSize bytes from the end of a stargz
+// file, returning the offset of the TOC gzip stream encoded in the
+// gzip Extra header, and whether p was a well-formed footer.
+func parseFooter(p []byte) (tocOffset int64, ok bool) {
+	if len(p) != FooterSize {
+		return 0, false
+	}
+	zr, err := gzip.NewReader(bytes.NewReader(p))
+	if err != nil {
+		return 0, false
+	}
+	// The Extra field is "%016xSTARGZ": 16 hex digits followed by
+	// the magic suffix.
+	extra := zr.Header.Extra
+	if len(extra) != 16+len("STARGZ") {
+		return 0, false
+	}
+	if string(extra[16:]) != "STARGZ" {
+		return 0, false
+	}
+	tocOffset, err = strconv.ParseInt(string(extra[:16]), 16, 64)
+	return tocOffset, err == nil
+}
+
+// formatModtime formats t in UTC as RFC 3339, using the nanosecond
+// form only when t has a sub-second component. It returns "" for
+// zero or Unix-epoch times, which then get dropped from the TOC JSON
+// via omitempty.
+func formatModtime(t time.Time) string {
+	if t.IsZero() || t.Unix() == 0 {
+		return ""
+	}
+	// t is already UTC below; the previous code redundantly called
+	// t.UTC() again before each Format.
+	t = t.UTC()
+	if t.Equal(t.Round(time.Second)) {
+		// Whole second: use the compact form without fractions.
+		return t.Format(time.RFC3339)
+	}
+	return t.Format(time.RFC3339Nano)
+}
+
+// countWriter counts how many bytes have been written to its wrapped
+// io.Writer.
+type countWriter struct {
+	w io.Writer
+	n int64 // total bytes written so far
+}
+
+// Write forwards p to the wrapped writer and adds however many bytes
+// were actually written to the running count.
+func (cw *countWriter) Write(p []byte) (n int, err error) {
+	n, err = cw.w.Write(p)
+	cw.n += int64(n)
+	return
+}
+
+// isGzip reports whether br is positioned right before an upcoming gzip stream.
+// It does not consume any bytes from br.
+func isGzip(br *bufio.Reader) bool {
+	const (
+		gzipID1     = 0x1f
+		gzipID2     = 0x8b
+		gzipDeflate = 8
+	)
+	// Peek at the gzip magic number and compression method (RFC 1952).
+	// Peek error is ignored; a short peek fails the length check below.
+	peek, _ := br.Peek(3)
+	return len(peek) >= 3 && peek[0] == gzipID1 && peek[1] == gzipID2 && peek[2] == gzipDeflate
+}
diff --git a/crfs/stargz/stargz_test.go b/crfs/stargz/stargz_test.go
new file mode 100644
index 0000000..0250649
--- /dev/null
+++ b/crfs/stargz/stargz_test.go
@@ -0,0 +1,292 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stargz
+
+import (
+	"archive/tar"
+	"bufio"
+	"bytes"
+	"compress/gzip"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"strings"
+	"testing"
+)
+
+// Tests 47 byte footer encoding, size, and parsing.
+func TestFooter(t *testing.T) {
+	// Sweep a range of offsets (step 1023 to vary the hex digits)
+	// and check footerBytes/parseFooter round-trip.
+	for off := int64(0); off <= 200000; off += 1023 {
+		footer := footerBytes(off)
+		if len(footer) != FooterSize {
+			t.Fatalf("for offset %v, footer length was %d, not expected %d. got bytes: %q", off, len(footer), FooterSize, footer)
+		}
+		got, ok := parseFooter(footer)
+		if !ok {
+			t.Fatalf("failed to parse footer for offset %d, footer: %q", off, footer)
+		}
+		if got != off {
+			t.Fatalf("parseFooter(footerBytes(offset %d)) = %d; want %d", off, got, off)
+
+		}
+	}
+}
+
+// TestWriteAndOpen builds a stargz from synthetic tar input, checks
+// the number of gzip streams produced, then re-opens the result and
+// runs per-case TOC checks.
+func TestWriteAndOpen(t *testing.T) {
+	tests := []struct {
+		name      string
+		in        []tarEntry
+		want      []stargzCheck
+		wantNumGz int // expected number of gzip streams
+	}{
+		{
+			name:      "empty",
+			in:        tarOf(),
+			wantNumGz: 2, // TOC + footer
+			want: checks(
+				numTOCEntries(0),
+			),
+		},
+		{
+			name: "1dir_1file",
+			in: tarOf(
+				dir("foo/"),
+				file("foo/bar.txt", "Some contents"),
+			),
+			wantNumGz: 4, // foo dir, bar.txt alone, TOC, footer
+			want: checks(
+				numTOCEntries(2),
+				hasDir("foo/"),
+				hasFileLen("foo/bar.txt", len("Some contents")),
+			),
+		},
+		{
+			name: "2meta_2file",
+			in: tarOf(
+				dir("bar/"),
+				dir("foo/"),
+				file("foo/bar.txt", "Some contents"),
+			),
+			wantNumGz: 4, // both dirs, foo.txt alone, TOC, footer
+			want: checks(
+				numTOCEntries(3),
+				hasDir("bar/"),
+				hasDir("foo/"),
+				hasFileLen("foo/bar.txt", len("Some contents")),
+			),
+		},
+		{
+			name: "symlink",
+			in: tarOf(
+				dir("foo/"),
+				symlink("foo/bar", "../../x"),
+			),
+			wantNumGz: 3, // metas + TOC + footer
+			want: checks(
+				numTOCEntries(2),
+				hasSymlink("foo/bar", "../../x"),
+			),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tr, cancel := buildTarGz(t, tt.in)
+			defer cancel()
+			var stargzBuf bytes.Buffer
+			w := NewWriter(&stargzBuf)
+			if err := w.AppendTar(tr); err != nil {
+				t.Fatalf("Append: %v", err)
+			}
+			if err := w.Close(); err != nil {
+				t.Fatalf("Writer.Close: %v", err)
+			}
+			b := stargzBuf.Bytes()
+
+			got := countGzStreams(t, b)
+			if got != tt.wantNumGz {
+				t.Errorf("number of gzip streams = %d; want %d", got, tt.wantNumGz)
+			}
+
+			// Re-open the produced bytes and verify the TOC.
+			r, err := Open(io.NewSectionReader(bytes.NewReader(b), 0, int64(len(b))))
+			if err != nil {
+				t.Fatalf("stargz.Open: %v", err)
+			}
+			for _, want := range tt.want {
+				want.check(t, r)
+			}
+		})
+	}
+}
+
+// countGzStreams returns the number of concatenated gzip streams in b,
+// reading each stream to completion with Multistream disabled so the
+// stream boundaries are observed individually.
+func countGzStreams(t *testing.T, b []byte) (numStreams int) {
+	br := bufio.NewReader(bytes.NewReader(b))
+	zr := new(gzip.Reader)
+	for {
+		// Reset returns io.EOF once the input is exhausted.
+		if err := zr.Reset(br); err != nil {
+			if err == io.EOF {
+				return
+			}
+			t.Fatalf("countGzStreams, Reset: %v", err)
+		}
+		zr.Multistream(false)
+		_, err := io.Copy(ioutil.Discard, zr)
+		if err != nil {
+			t.Fatalf("countGzStreams, Copy: %v", err)
+		}
+		numStreams++
+	}
+}
+
+// numTOCEntries is a stargzCheck asserting the exact number of TOC
+// entries in the opened stargz.
+type numTOCEntries int
+
+func (n numTOCEntries) check(t *testing.T, r *Reader) {
+	if r.TOC == nil {
+		t.Fatal("nil TOC")
+	}
+	if got, want := len(r.TOC.Entries), int(n); got != want {
+		t.Errorf("got %d TOC entries; want %d", got, want)
+		// Dump the entries to make mismatches easy to diagnose.
+		t.Logf("got TOC entries:")
+		for i, ent := range r.TOC.Entries {
+			entj, _ := json.Marshal(ent)
+			t.Logf("  [%d]: %s\n", i, entj)
+		}
+		t.FailNow()
+	}
+}
+
+// tarOf is sugar for building a slice of tar entries.
+func tarOf(s ...tarEntry) []tarEntry { return s }
+
+// checks is sugar for building a slice of stargz checks.
+func checks(s ...stargzCheck) []stargzCheck { return s }
+
+// stargzCheck is one assertion run against an opened stargz Reader.
+type stargzCheck interface {
+	check(t *testing.T, r *Reader)
+}
+
+// stargzCheckFn adapts a plain function to the stargzCheck interface.
+type stargzCheckFn func(*testing.T, *Reader)
+
+func (f stargzCheckFn) check(t *testing.T, r *Reader) { f(t, r) }
+
+// hasFileLen returns a check that the TOC contains a regular file
+// entry named file with the given size.
+func hasFileLen(file string, wantLen int) stargzCheck {
+	return stargzCheckFn(func(t *testing.T, r *Reader) {
+		for _, ent := range r.TOC.Entries {
+			if ent.Name == file {
+				if ent.Type != "reg" {
+					t.Errorf("file type of %q is %q; want \"reg\"", file, ent.Type)
+				} else if ent.Size != int64(wantLen) {
+					t.Errorf("file size of %q = %d; want %d", file, ent.Size, wantLen)
+				}
+				return
+			}
+		}
+		t.Errorf("file %q not found", file)
+	})
+}
+
+// hasDir returns a check that the TOC contains a directory entry with
+// the given name.
+func hasDir(file string) stargzCheck {
+	return stargzCheckFn(func(t *testing.T, r *Reader) {
+		for _, ent := range r.TOC.Entries {
+			if ent.Name == file {
+				if ent.Type != "dir" {
+					t.Errorf("file type of %q is %q; want \"dir\"", file, ent.Type)
+				}
+				return
+			}
+		}
+		t.Errorf("directory %q not found", file)
+	})
+}
+
+// hasSymlink returns a check that the TOC contains a symlink entry
+// named file pointing at target.
+func hasSymlink(file, target string) stargzCheck {
+	return stargzCheckFn(func(t *testing.T, r *Reader) {
+		for _, ent := range r.TOC.Entries {
+			if ent.Name == file {
+				if ent.Type != "symlink" {
+					t.Errorf("file type of %q is %q; want \"symlink\"", file, ent.Type)
+				} else if ent.LinkName != target {
+					t.Errorf("link target of symlink %q is %q; want %q", file, ent.LinkName, target)
+				}
+				return
+			}
+		}
+		t.Errorf("symlink %q not found", file)
+	})
+}
+
+// tarEntry writes one entry into a tar.Writer when building test input.
+type tarEntry interface {
+	appendTar(*tar.Writer) error
+}
+
+// tarEntryFunc adapts a plain function to the tarEntry interface.
+type tarEntryFunc func(*tar.Writer) error
+
+func (f tarEntryFunc) appendTar(tw *tar.Writer) error { return f(tw) }
+
+// buildTarGz streams a tar archive containing ents through a pipe.
+// NOTE(review): despite the name, the produced stream is plain tar,
+// not gzipped; AppendTar accepts either.
+//
+// The returned cancel func closes both pipe ends to unblock the
+// writer goroutine if the reader stops early.
+func buildTarGz(t *testing.T, ents []tarEntry) (r io.Reader, cancel func()) {
+	pr, pw := io.Pipe()
+	go func() {
+		tw := tar.NewWriter(pw)
+		for _, ent := range ents {
+			if err := ent.appendTar(tw); err != nil {
+				t.Errorf("building input tar: %v", err)
+				pw.Close()
+				return
+			}
+		}
+		if err := tw.Close(); err != nil {
+			t.Errorf("closing write of input tar: %v", err)
+		}
+		pw.Close()
+		return
+	}()
+	return pr, func() { go pr.Close(); go pw.Close() }
+}
+
+// dir returns a tarEntry for a directory; d must end in a slash.
+func dir(d string) tarEntry {
+	return tarEntryFunc(func(tw *tar.Writer) error {
+		name := string(d)
+		if !strings.HasSuffix(name, "/") {
+			panic(fmt.Sprintf("missing trailing slash in dir %q ", name))
+		}
+		return tw.WriteHeader(&tar.Header{
+			Typeflag: tar.TypeDir,
+			Name:     name,
+			Mode:     0755,
+		})
+	})
+}
+
+// file returns a tarEntry for a regular file with the given contents.
+// extraAttr is reserved for future use and currently must be empty.
+func file(name, contents string, extraAttr ...interface{}) tarEntry {
+	return tarEntryFunc(func(tw *tar.Writer) error {
+		if len(extraAttr) > 0 {
+			return errors.New("unsupported extraAttr")
+		}
+		if strings.HasSuffix(name, "/") {
+			return fmt.Errorf("bogus trailing slash in file %q", name)
+		}
+		if err := tw.WriteHeader(&tar.Header{
+			Typeflag: tar.TypeReg,
+			Name:     name,
+			Mode:     0644,
+			Size:     int64(len(contents)),
+		}); err != nil {
+			return err
+		}
+		_, err := io.WriteString(tw, contents)
+		return err
+	})
+}
+
+// symlink returns a tarEntry for a symlink named name pointing at target.
+func symlink(name, target string) tarEntry {
+	return tarEntryFunc(func(tw *tar.Writer) error {
+		return tw.WriteHeader(&tar.Header{
+			Typeflag: tar.TypeSymlink,
+			Name:     name,
+			Linkname: target,
+			Mode:     0644,
+		})
+	})
+}