crfs/stargz: add start of package
Basic API, format, tests.
Good enough checkpoint.
Updates golang/go#30829
Change-Id: Iaec5b205314d64fca5056f6b19a7bae52e5cef94
Reviewed-on: https://go-review.googlesource.com/c/build/+/167769
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/crfs/stargz/stargz.go b/crfs/stargz/stargz.go
new file mode 100644
index 0000000..dd6d143
--- /dev/null
+++ b/crfs/stargz/stargz.go
@@ -0,0 +1,345 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package stargz reads and writes tar.gz ("tarball") files in a
+// seekable, indexed format called "stargz". A stargz file is still a
+// valid tarball, but it's slightly bigger with new gzip streams for
+// each new file & throughout large files, and has an index in a magic
+// file at the end.
+package stargz
+
+import (
+ "archive/tar"
+ "bufio"
+ "bytes"
+ "compress/gzip"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "strconv"
+ "time"
+)
+
+// TOCTarName is the name of the JSON file in the tar archive in the
+// table of contents gzip stream.
+const TOCTarName = "stargz.index.json"
+
+// FooterSize is the number of bytes in the stargz footer.
+//
+// The footer is an empty gzip stream with no compression and an Extra
+// header of the form "%016xSTARGZ", where the 64 bit hex-encoded
+// number is the offset to the gzip stream of JSON TOC.
+//
+// 47 comes from:
+//
+// 10 byte gzip header +
+// 2 byte (LE16) length of extra, encoding 22 (16 hex digits + len("STARGZ")) == "\x16\x00" +
+// 22 bytes of extra (fmt.Sprintf("%016xSTARGZ", tocGzipOffset))
+// 5 byte flate header
+// 8 byte gzip footer (two little endian uint32s: digest, size)
+//
+// The footer is produced by footerBytes and decoded by parseFooter.
+const FooterSize = 47
+
+// A Reader permits random access reads from a stargz file.
+//
+// Open parses the TOC eagerly and stores it in TOC; the underlying
+// SectionReader is retained in sr.
+type Reader struct {
+	sr  *io.SectionReader
+	TOC *TOC
+}
+
+// Open opens a stargz file for reading.
+//
+// It reads the fixed-size footer at the end of sr to locate the TOC's
+// gzip stream, then decodes the TOC JSON from that stream.
+func Open(sr *io.SectionReader) (*Reader, error) {
+	if sr.Size() < FooterSize {
+		return nil, fmt.Errorf("stargz size %d is smaller than the stargz footer size", sr.Size())
+	}
+	// TODO: read a bigger chunk (1MB?) at once here to hopefully
+	// get the TOC + footer in one go.
+	var footer [FooterSize]byte
+	if _, err := sr.ReadAt(footer[:], sr.Size()-FooterSize); err != nil {
+		return nil, fmt.Errorf("error reading footer: %v", err)
+	}
+	// The footer's gzip Extra field records the offset of the TOC's
+	// gzip stream.
+	tocOff, ok := parseFooter(footer[:])
+	if !ok {
+		return nil, fmt.Errorf("error parsing footer")
+	}
+	// Everything between the TOC offset and the footer is the TOC targz.
+	tocTargz := make([]byte, sr.Size()-tocOff-FooterSize)
+	if _, err := sr.ReadAt(tocTargz, tocOff); err != nil {
+		return nil, fmt.Errorf("error reading %d byte TOC targz: %v", len(tocTargz), err)
+	}
+	zr, err := gzip.NewReader(bytes.NewReader(tocTargz))
+	if err != nil {
+		return nil, fmt.Errorf("malformed TOC gzip header: %v", err)
+	}
+	// The TOC is its own gzip stream; stop at its end rather than
+	// reading into any following stream.
+	zr.Multistream(false)
+	tr := tar.NewReader(zr)
+	h, err := tr.Next()
+	if err != nil {
+		return nil, fmt.Errorf("failed to find tar header in TOC gzip stream: %v", err)
+	}
+	if h.Name != TOCTarName {
+		return nil, fmt.Errorf("TOC tar entry had name %q; expected %q", h.Name, TOCTarName)
+	}
+	toc := new(TOC)
+	if err := json.NewDecoder(tr).Decode(&toc); err != nil {
+		return nil, fmt.Errorf("error decoding TOC JSON: %v", err)
+	}
+	return &Reader{sr: sr, TOC: toc}, nil
+}
+
+// TOCEntry is an entry in the stargz file's TOC (Table of Contents).
+type TOCEntry struct {
+	Offset   int64  `json:"offset,omitempty"` // offset to gzip stream of tar entry (for regular files only)
+	Name     string `json:"name"`
+	Type     string `json:"type"` // "dir", "reg", TODO
+	Size     int64  `json:"size,omitempty"`
+	LinkName string `json:"linkName,omitempty"` // for symlinks
+	Mode     int64  `json:"mode,omitempty"` // Permission and mode bits
+	Uid      int    `json:"uid,omitempty"` // User ID of owner
+	Gid      int    `json:"gid,omitempty"` // Group ID of owner
+	Uname    string `json:"userName,omitempty"` // User name of owner
+	Gname    string `json:"groupName,omitempty"` // Group name of owner
+	// ModTime is written by formatModtime: RFC3339 in UTC, or empty
+	// for zero/Unix-epoch modification times.
+	ModTime string `json:"modtime,omitempty"`
+
+	// ChunkOffset is non-zero if this is a chunk of a large,
+	// regular file. If so, the Offset is where the gzip header of
+	// ChunkSize bytes at ChunkOffset in Name begin. If both
+	// ChunkOffset and ChunkSize are zero, the file contents are
+	// completely represented at the tar gzip stream starting at
+	// Offset.
+	ChunkOffset int64 `json:"chunkOffset,omitempty"`
+	ChunkSize   int64 `json:"chunkSize,omitempty"`
+}
+
+// TOC is the table of contents index of the files in the stargz file.
+type TOC struct {
+	Version int        `json:"version"` // currently 1 (set by NewWriter)
+	Entries []TOCEntry `json:"entries"`
+}
+
+// A Writer writes stargz files.
+//
+// Use NewWriter to create a new Writer.
+type Writer struct {
+	bw  *bufio.Writer
+	cw  *countWriter
+	toc *TOC
+
+	// closed reports whether Close has been called; closeGz then
+	// refuses further writes.
+	closed bool
+	// gz is the currently-open output gzip stream, or nil between
+	// streams.
+	gz *gzip.Writer
+}
+
+// NewWriter returns a new stargz writer writing to w.
+//
+// The writer must be closed to write its trailing table of contents.
+func NewWriter(w io.Writer) *Writer {
+	bw := bufio.NewWriter(w)
+	// cw tracks the absolute output offset so TOC entries and the
+	// footer can record gzip stream positions.
+	cw := &countWriter{w: bw}
+	return &Writer{
+		bw:  bw,
+		cw:  cw,
+		toc: &TOC{Version: 1},
+	}
+}
+
+// Close writes the stargz's table of contents and flushes all the
+// buffers, returning any error.
+func (w *Writer) Close() error {
+	if w.closed {
+		return nil
+	}
+	defer func() { w.closed = true }()
+
+	// End any in-progress content gzip stream before starting the
+	// TOC's own stream.
+	if err := w.closeGz(); err != nil {
+		return err
+	}
+
+	// Write the TOC index.
+	tocOff := w.cw.n
+	w.gz, _ = gzip.NewWriterLevel(w.cw, gzip.BestCompression) // BestCompression is a valid level; can't fail
+	tw := tar.NewWriter(w.gz)
+	tocJSON, err := json.MarshalIndent(w.toc, "", "\t")
+	if err != nil {
+		return err
+	}
+	if err := tw.WriteHeader(&tar.Header{
+		Typeflag: tar.TypeReg,
+		Name:     TOCTarName,
+		Size:     int64(len(tocJSON)),
+	}); err != nil {
+		return err
+	}
+	if _, err := tw.Write(tocJSON); err != nil {
+		return err
+	}
+
+	// Closing tw (unlike the per-entry tar writers in AppendTar,
+	// which are only Flushed) writes the tar end-of-archive marker.
+	if err := tw.Close(); err != nil {
+		return err
+	}
+	if err := w.closeGz(); err != nil {
+		return err
+	}
+
+	// And a little footer with pointer to the TOC gzip stream.
+	if _, err := w.bw.Write(footerBytes(tocOff)); err != nil {
+		return err
+	}
+
+	if err := w.bw.Flush(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// closeGz ends the current output gzip stream, if any, flushing its
+// trailer to the underlying writer. It errors if the Writer has
+// already been Closed.
+func (w *Writer) closeGz() error {
+	if w.closed {
+		return errors.New("write on closed Writer")
+	}
+	if w.gz != nil {
+		if err := w.gz.Close(); err != nil {
+			return err
+		}
+		w.gz = nil
+	}
+	return nil
+}
+
+// AppendTar reads the tar or tar.gz file from r and appends
+// each of its contents to w.
+//
+// The input r can optionally be gzip compressed but the output will
+// always be gzip compressed.
+func (w *Writer) AppendTar(r io.Reader) error {
+	br := bufio.NewReader(r)
+	var tr *tar.Reader
+	if isGzip(br) {
+		// NewReader can't fail if isGzip returned true.
+		zr, _ := gzip.NewReader(br)
+		tr = tar.NewReader(zr)
+	} else {
+		tr = tar.NewReader(br)
+	}
+	for {
+		h, err := tr.Next()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return fmt.Errorf("error reading from source tar: tar.Reader.Next: %v", err)
+		}
+		// Record the entry's metadata in the TOC.
+		ent := TOCEntry{
+			Name:    h.Name,
+			Mode:    h.Mode,
+			Uid:     h.Uid,
+			Gid:     h.Gid,
+			Uname:   h.Uname,
+			Gname:   h.Gname,
+			ModTime: formatModtime(h.ModTime),
+		}
+		switch h.Typeflag {
+		case tar.TypeLink:
+			return fmt.Errorf("TODO: unsupported hardlink %q => %q", h.Name, h.Linkname)
+		case tar.TypeSymlink:
+			ent.Type = "symlink"
+			ent.LinkName = h.Linkname
+		case tar.TypeDir:
+			ent.Type = "dir"
+		case tar.TypeReg:
+			// Regular files record the offset of the gzip stream
+			// holding their tar entry so readers can seek to it.
+			ent.Offset = w.cw.n
+			ent.Type = "reg"
+			ent.Size = h.Size
+
+			// Start a new gzip stream for regular files.
+			if err := w.closeGz(); err != nil {
+				return err
+			}
+		default:
+			return fmt.Errorf("unsupported input tar entry %q", h.Typeflag)
+		}
+		w.toc.Entries = append(w.toc.Entries, ent)
+		if w.gz == nil {
+			w.gz, err = gzip.NewWriterLevel(w.cw, gzip.BestCompression)
+			if err != nil {
+				return err
+			}
+		}
+		// Copy the tar header and body into the current gzip stream.
+		// The per-entry tar writer is Flushed, not Closed, so no tar
+		// end-of-archive marker is emitted between entries.
+		tw := tar.NewWriter(w.gz)
+		if err := tw.WriteHeader(h); err != nil {
+			return err
+		}
+		if _, err := io.Copy(tw, tr); err != nil {
+			return err
+		}
+		if err := tw.Flush(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// footerBytes returns the 47 byte footer: an empty, uncompressed gzip
+// stream whose Extra header encodes tocOff as "%016xSTARGZ". See the
+// FooterSize docs for the byte-level layout.
+func footerBytes(tocOff int64) []byte {
+	buf := bytes.NewBuffer(make([]byte, 0, FooterSize))
+	gz, _ := gzip.NewWriterLevel(buf, gzip.NoCompression) // NoCompression is a valid level; can't fail
+	gz.Header.Extra = []byte(fmt.Sprintf("%016xSTARGZ", tocOff))
+	gz.Close()
+	if buf.Len() != FooterSize {
+		panic(fmt.Sprintf("footer buffer = %d, not %d", buf.Len(), FooterSize))
+	}
+	return buf.Bytes()
+}
+
+// parseFooter parses the FooterSize bytes at the end of a stargz file
+// and returns the offset of the TOC's gzip stream, as recorded in the
+// footer's gzip Extra header ("%016xSTARGZ").
+func parseFooter(p []byte) (tocOffset int64, ok bool) {
+	if len(p) != FooterSize {
+		return 0, false
+	}
+	zr, err := gzip.NewReader(bytes.NewReader(p))
+	if err != nil {
+		return 0, false
+	}
+	extra := zr.Header.Extra
+	if len(extra) != 16+len("STARGZ") {
+		return 0, false
+	}
+	if string(extra[16:]) != "STARGZ" {
+		return 0, false
+	}
+	// The first 16 bytes are the zero-padded hex TOC offset.
+	tocOffset, err = strconv.ParseInt(string(extra[:16]), 16, 64)
+	return tocOffset, err == nil
+}
+
+// formatModtime renders t in UTC as RFC3339 (RFC3339Nano when t has
+// sub-second precision). Zero and Unix-epoch times become "", letting
+// the TOC's omitempty drop the field.
+func formatModtime(t time.Time) string {
+	if t.IsZero() || t.Unix() == 0 {
+		return ""
+	}
+	t = t.UTC()
+	if t.Equal(t.Round(time.Second)) {
+		return t.UTC().Format(time.RFC3339)
+	}
+	return t.UTC().Format(time.RFC3339Nano)
+}
+
+// countWriter counts how many bytes have been written to its wrapped
+// io.Writer.
+type countWriter struct {
+	w io.Writer
+	n int64 // total bytes written so far
+}
+
+// Write forwards to the wrapped writer and accumulates the number of
+// bytes actually written.
+func (cw *countWriter) Write(p []byte) (n int, err error) {
+	n, err = cw.w.Write(p)
+	cw.n += int64(n)
+	return
+}
+
+// isGzip reports whether br is positioned right before an upcoming gzip stream.
+// It does not consume any bytes from br.
+func isGzip(br *bufio.Reader) bool {
+	const (
+		gzipID1     = 0x1f
+		gzipID2     = 0x8b
+		gzipDeflate = 8
+	)
+	// Check the two gzip magic-number bytes and the deflate method byte.
+	peek, _ := br.Peek(3)
+	return len(peek) >= 3 && peek[0] == gzipID1 && peek[1] == gzipID2 && peek[2] == gzipDeflate
+}
diff --git a/crfs/stargz/stargz_test.go b/crfs/stargz/stargz_test.go
new file mode 100644
index 0000000..0250649
--- /dev/null
+++ b/crfs/stargz/stargz_test.go
@@ -0,0 +1,292 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stargz
+
+import (
+ "archive/tar"
+ "bufio"
+ "bytes"
+ "compress/gzip"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "strings"
+ "testing"
+)
+
+// Tests 47 byte footer encoding, size, and parsing.
+func TestFooter(t *testing.T) {
+	// Round-trip a spread of offsets through footerBytes/parseFooter.
+	for off := int64(0); off <= 200000; off += 1023 {
+		footer := footerBytes(off)
+		if len(footer) != FooterSize {
+			t.Fatalf("for offset %v, footer length was %d, not expected %d. got bytes: %q", off, len(footer), FooterSize, footer)
+		}
+		got, ok := parseFooter(footer)
+		if !ok {
+			t.Fatalf("failed to parse footer for offset %d, footer: %q", off, footer)
+		}
+		if got != off {
+			t.Fatalf("parseFooter(footerBytes(offset %d)) = %d; want %d", off, got, off)
+
+		}
+	}
+}
+
+// TestWriteAndOpen builds stargz files from sample tarballs, verifies
+// the expected gzip stream count, then re-opens them and runs the TOC
+// checks.
+func TestWriteAndOpen(t *testing.T) {
+	tests := []struct {
+		name      string
+		in        []tarEntry
+		want      []stargzCheck
+		wantNumGz int // expected number of gzip streams
+	}{
+		{
+			name:      "empty",
+			in:        tarOf(),
+			wantNumGz: 2, // TOC + footer
+			want: checks(
+				numTOCEntries(0),
+			),
+		},
+		{
+			name: "1dir_1file",
+			in: tarOf(
+				dir("foo/"),
+				file("foo/bar.txt", "Some contents"),
+			),
+			wantNumGz: 4, // foo dir, foo.txt alone, TOC, footer
+			want: checks(
+				numTOCEntries(2),
+				hasDir("foo/"),
+				hasFileLen("foo/bar.txt", len("Some contents")),
+			),
+		},
+		{
+			name: "2meta_2file",
+			in: tarOf(
+				dir("bar/"),
+				dir("foo/"),
+				file("foo/bar.txt", "Some contents"),
+			),
+			wantNumGz: 4, // both dirs, foo.txt alone, TOC, footer
+			want: checks(
+				numTOCEntries(3),
+				hasDir("bar/"),
+				hasDir("foo/"),
+				hasFileLen("foo/bar.txt", len("Some contents")),
+			),
+		},
+		{
+			name: "symlink",
+			in: tarOf(
+				dir("foo/"),
+				symlink("foo/bar", "../../x"),
+			),
+			wantNumGz: 3, // metas + TOC + footer
+			want: checks(
+				numTOCEntries(2),
+				hasSymlink("foo/bar", "../../x"),
+			),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Build the input tarball and convert it to stargz.
+			tr, cancel := buildTarGz(t, tt.in)
+			defer cancel()
+			var stargzBuf bytes.Buffer
+			w := NewWriter(&stargzBuf)
+			if err := w.AppendTar(tr); err != nil {
+				t.Fatalf("Append: %v", err)
+			}
+			if err := w.Close(); err != nil {
+				t.Fatalf("Writer.Close: %v", err)
+			}
+			b := stargzBuf.Bytes()
+
+			// Verify the expected gzip stream structure.
+			got := countGzStreams(t, b)
+			if got != tt.wantNumGz {
+				t.Errorf("number of gzip streams = %d; want %d", got, tt.wantNumGz)
+			}
+
+			// Re-open the stargz and run the per-case TOC checks.
+			r, err := Open(io.NewSectionReader(bytes.NewReader(b), 0, int64(len(b))))
+			if err != nil {
+				t.Fatalf("stargz.Open: %v", err)
+			}
+			for _, want := range tt.want {
+				want.check(t, r)
+			}
+		})
+	}
+}
+
+// countGzStreams returns how many consecutive gzip streams b contains,
+// reading them one at a time via Reset + Multistream(false).
+func countGzStreams(t *testing.T, b []byte) (numStreams int) {
+	br := bufio.NewReader(bytes.NewReader(b))
+	zr := new(gzip.Reader)
+	for {
+		// Reset positions zr at the next stream; io.EOF from Reset
+		// means the previous stream ended exactly at end of input.
+		if err := zr.Reset(br); err != nil {
+			if err == io.EOF {
+				return
+			}
+			t.Fatalf("countGzStreams, Reset: %v", err)
+		}
+		zr.Multistream(false)
+		_, err := io.Copy(ioutil.Discard, zr)
+		if err != nil {
+			t.Fatalf("countGzStreams, Copy: %v", err)
+		}
+		numStreams++
+	}
+}
+
+// numTOCEntries is a stargzCheck asserting the TOC has exactly that
+// many entries; on mismatch it logs every entry before failing.
+type numTOCEntries int
+
+func (n numTOCEntries) check(t *testing.T, r *Reader) {
+	if r.TOC == nil {
+		t.Fatal("nil TOC")
+	}
+	if got, want := len(r.TOC.Entries), int(n); got != want {
+		t.Errorf("got %d TOC entries; want %d", got, want)
+		t.Logf("got TOC entries:")
+		for i, ent := range r.TOC.Entries {
+			entj, _ := json.Marshal(ent)
+			t.Logf(" [%d]: %s\n", i, entj)
+		}
+		t.FailNow()
+	}
+}
+
+// tarOf collects the tar entries forming a test case's input.
+func tarOf(s ...tarEntry) []tarEntry { return s }
+
+// checks collects the assertions forming a test case's expectations.
+func checks(s ...stargzCheck) []stargzCheck { return s }
+
+// stargzCheck is one assertion run against an opened stargz Reader.
+type stargzCheck interface {
+	check(t *testing.T, r *Reader)
+}
+
+// stargzCheckFn adapts a plain func to the stargzCheck interface.
+type stargzCheckFn func(*testing.T, *Reader)
+
+func (f stargzCheckFn) check(t *testing.T, r *Reader) { f(t, r) }
+
+// hasFileLen asserts the TOC records file as a regular file ("reg")
+// of length wantLen.
+func hasFileLen(file string, wantLen int) stargzCheck {
+	return stargzCheckFn(func(t *testing.T, r *Reader) {
+		for _, ent := range r.TOC.Entries {
+			if ent.Name == file {
+				if ent.Type != "reg" {
+					t.Errorf("file type of %q is %q; want \"reg\"", file, ent.Type)
+				} else if ent.Size != int64(wantLen) {
+					t.Errorf("file size of %q = %d; want %d", file, ent.Size, wantLen)
+				}
+				return
+			}
+		}
+		t.Errorf("file %q not found", file)
+	})
+}
+
+// hasDir asserts the TOC records file as a directory entry.
+func hasDir(file string) stargzCheck {
+	return stargzCheckFn(func(t *testing.T, r *Reader) {
+		for _, ent := range r.TOC.Entries {
+			if ent.Name == file {
+				if ent.Type != "dir" {
+					t.Errorf("file type of %q is %q; want \"dir\"", file, ent.Type)
+				}
+				return
+			}
+		}
+		t.Errorf("directory %q not found", file)
+	})
+}
+
+// hasSymlink asserts the TOC records file as a symlink pointing at target.
+func hasSymlink(file, target string) stargzCheck {
+	return stargzCheckFn(func(t *testing.T, r *Reader) {
+		for _, ent := range r.TOC.Entries {
+			if ent.Name == file {
+				if ent.Type != "symlink" {
+					t.Errorf("file type of %q is %q; want \"symlink\"", file, ent.Type)
+				} else if ent.LinkName != target {
+					t.Errorf("link target of symlink %q is %q; want %q", file, ent.LinkName, target)
+				}
+				return
+			}
+		}
+		t.Errorf("symlink %q not found", file)
+	})
+}
+
+// tarEntry is one entry to write into the test input tarball.
+type tarEntry interface {
+	appendTar(*tar.Writer) error
+}
+
+// tarEntryFunc adapts a plain func to the tarEntry interface.
+type tarEntryFunc func(*tar.Writer) error
+
+func (f tarEntryFunc) appendTar(tw *tar.Writer) error { return f(tw) }
+
+// buildTarGz streams a tarball of ents through a pipe; cancel tears
+// the pipe down. (Despite the name, no gzip.Writer is used here, so
+// the stream is an uncompressed tar — AppendTar accepts either.)
+func buildTarGz(t *testing.T, ents []tarEntry) (r io.Reader, cancel func()) {
+	pr, pw := io.Pipe()
+	go func() {
+		tw := tar.NewWriter(pw)
+		for _, ent := range ents {
+			if err := ent.appendTar(tw); err != nil {
+				t.Errorf("building input tar: %v", err)
+				// NOTE(review): pw.CloseWithError(err) would surface
+				// the failure to the reader instead of a plain EOF.
+				pw.Close()
+				return
+			}
+		}
+		if err := tw.Close(); err != nil {
+			t.Errorf("closing write of input tar: %v", err)
+		}
+		pw.Close()
+		return
+	}()
+	return pr, func() { go pr.Close(); go pw.Close() }
+}
+
+// dir returns a tarEntry for a directory; d must end in "/".
+func dir(d string) tarEntry {
+	return tarEntryFunc(func(tw *tar.Writer) error {
+		name := string(d) // NOTE(review): d is already a string; conversion is redundant
+		if !strings.HasSuffix(name, "/") {
+			panic(fmt.Sprintf("missing trailing slash in dir %q ", name))
+		}
+		return tw.WriteHeader(&tar.Header{
+			Typeflag: tar.TypeDir,
+			Name:     name,
+			Mode:     0755,
+		})
+	})
+}
+
+// file returns a tarEntry for a regular file with the given contents.
+// extraAttr is reserved for future use and must currently be empty.
+func file(name, contents string, extraAttr ...interface{}) tarEntry {
+	return tarEntryFunc(func(tw *tar.Writer) error {
+		if len(extraAttr) > 0 {
+			return errors.New("unsupported extraAttr")
+		}
+		if strings.HasSuffix(name, "/") {
+			return fmt.Errorf("bogus trailing slash in file %q", name)
+		}
+		if err := tw.WriteHeader(&tar.Header{
+			Typeflag: tar.TypeReg,
+			Name:     name,
+			Mode:     0644,
+			Size:     int64(len(contents)),
+		}); err != nil {
+			return err
+		}
+		_, err := io.WriteString(tw, contents)
+		return err
+	})
+}
+
+// symlink returns a tarEntry for a symlink from name to target.
+func symlink(name, target string) tarEntry {
+	return tarEntryFunc(func(tw *tar.Writer) error {
+		return tw.WriteHeader(&tar.Header{
+			Typeflag: tar.TypeSymlink,
+			Name:     name,
+			Linkname: target,
+			Mode:     0644,
+		})
+	})
+}