| // Copyright 2026 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package zip |
| |
| import ( |
| "cmp" |
| "compress/gzip" |
| "errors" |
| "fmt" |
| "io" |
| "os" |
| "slices" |
| ) |
| |
// A sparseFile represents an archive as a sequence of non-zero byte spans
// (the LFH headers, the Central Directory, the EOCD records, and any
// non-zero compressed bodies) plus a total length. Bytes outside any span
// are implicitly zero. This is the storage format used for goldens under
// testdata/zip64/ (suffix .zsparse) and the in-memory shape produced by
// the writer-reproduction harness.
//
// On-disk layout (all little-endian):
//
//	uint64 size
//	uint32 numSpans
//	for each span:
//		uint64 offset
//		uint32 dataLen
//		dataLen bytes
//
// Spans are sorted by offset and non-overlapping.
type sparseFile struct {
	// Size is the total length in bytes of the conceptual file; reads at
	// or beyond it report io.EOF.
	Size int64
	// Spans holds the explicitly stored byte ranges.
	Spans []sparseSpan
}

// A sparseSpan is one explicitly stored byte range of a sparseFile.
type sparseSpan struct {
	// Offset is the position of Data[0] within the conceptual file.
	Offset int64
	// Data is the stored content of the range [Offset, Offset+len(Data)).
	Data []byte
}

// ReadAt implements [io.ReaderAt] by serving the underlying spans and
// synthesizing zero bytes for any gap inside [0, Size).
//
// It returns an error for a negative offset, io.EOF with zero bytes for an
// offset at or past Size, and io.EOF together with the partial count when
// the request extends past Size.
func (f *sparseFile) ReadAt(p []byte, off int64) (int, error) {
	if off < 0 {
		return 0, errors.New("sparseFile: negative offset")
	}
	if off >= f.Size {
		return 0, io.EOF
	}
	// Derive the count from the bytes remaining rather than from
	// off+len(p): that sum can wrap around when Size is close to the int64
	// maximum, which would make a read crossing the end look complete.
	n := int(min(int64(len(p)), f.Size-off))
	end := off + int64(n)
	clear(p[:n])
	for i := range f.Spans {
		s := &f.Spans[i]
		sEnd := s.Offset + int64(len(s.Data))
		if sEnd <= off || s.Offset >= end {
			continue // no intersection with [off, end)
		}
		from := max(s.Offset, off)
		to := min(sEnd, end)
		copy(p[from-off:to-off], s.Data[from-s.Offset:to-s.Offset])
	}
	if n < len(p) {
		return n, io.EOF
	}
	return n, nil
}

// materializeTail returns the last keep bytes of the conceptual file as a
// plain byte slice, suitable for [parseCD], along with the file offset of
// the first returned byte. keep is clamped to the range [0, Size].
func (f *sparseFile) materializeTail(keep int64) (data []byte, baseOff uint64) {
	keep = max(0, min(keep, f.Size))
	base := f.Size - keep
	buf := make([]byte, keep)
	// The request ends exactly at Size, so for keep > 0 ReadAt fills buf
	// completely and returns nil; for keep == 0 it reports io.EOF with
	// nothing to read. Neither outcome needs handling.
	_, _ = f.ReadAt(buf, base)
	return buf, uint64(base)
}
| |
// sparseChunk is the size in bytes (4 KiB) of the pieces that scanSparse
// and sparseBuffer test for being all-zero.
const sparseChunk = 4 << 10
| |
| // scanSparse stream-reads r and builds a sparseFile, treating any contiguous |
| // run of zero bytes (rounded to sparseChunk boundaries) as a gap. Adjacent |
| // non-zero chunks are coalesced into one span. |
| func scanSparse(r io.Reader) (*sparseFile, error) { |
| f := &sparseFile{} |
| var cur *sparseSpan |
| buf := make([]byte, sparseChunk) |
| for { |
| n, err := io.ReadFull(r, buf) |
| if n > 0 { |
| chunk := buf[:n] |
| if isAllZero(chunk) { |
| if cur != nil { |
| f.Spans = append(f.Spans, *cur) |
| cur = nil |
| } |
| } else { |
| if cur == nil { |
| cur = &sparseSpan{Offset: f.Size} |
| } |
| cur.Data = append(cur.Data, chunk...) |
| } |
| f.Size += int64(n) |
| } |
| if err != nil { |
| if err == io.EOF || err == io.ErrUnexpectedEOF { |
| break |
| } |
| return nil, err |
| } |
| } |
| if cur != nil { |
| f.Spans = append(f.Spans, *cur) |
| } |
| return f, nil |
| } |
| |
| // writeSparse serializes f to w in the on-disk format described on |
| // [sparseFile]. |
| func writeSparse(w io.Writer, f *sparseFile) error { |
| var hdr [12]byte |
| le.PutUint64(hdr[:8], uint64(f.Size)) |
| le.PutUint32(hdr[8:12], uint32(len(f.Spans))) |
| if _, err := w.Write(hdr[:]); err != nil { |
| return err |
| } |
| for _, s := range f.Spans { |
| var b [12]byte |
| le.PutUint64(b[:8], uint64(s.Offset)) |
| le.PutUint32(b[8:12], uint32(len(s.Data))) |
| if _, err := w.Write(b[:]); err != nil { |
| return err |
| } |
| if _, err := w.Write(s.Data); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| // readSparse parses the on-disk format from r. |
| func readSparse(r io.Reader) (*sparseFile, error) { |
| var hdr [12]byte |
| if _, err := io.ReadFull(r, hdr[:]); err != nil { |
| return nil, err |
| } |
| f := &sparseFile{ |
| Size: int64(le.Uint64(hdr[:8])), |
| } |
| n := le.Uint32(hdr[8:12]) |
| if n > 1<<20 { |
| return nil, fmt.Errorf("sparseFile: implausible span count %d", n) |
| } |
| f.Spans = make([]sparseSpan, n) |
| for i := range f.Spans { |
| var b [12]byte |
| if _, err := io.ReadFull(r, b[:]); err != nil { |
| return nil, err |
| } |
| f.Spans[i].Offset = int64(le.Uint64(b[:8])) |
| sz := le.Uint32(b[8:12]) |
| f.Spans[i].Data = make([]byte, sz) |
| if _, err := io.ReadFull(r, f.Spans[i].Data); err != nil { |
| return nil, err |
| } |
| } |
| if !slices.IsSortedFunc(f.Spans, func(a, b sparseSpan) int { |
| return cmp.Compare(a.Offset, b.Offset) |
| }) { |
| return nil, errors.New("sparseFile: spans not sorted") |
| } |
| return f, nil |
| } |
| |
| // readSparseFile reads a sparse file from path. The file is expected to be |
| // gzip-compressed; the outer gzip wrap shrinks goldens that contain non-zero |
| // compressed bodies (e.g., the deflate-zeros entries) by 100x because |
| // deflate-of-zeros is highly repetitive. Small Store goldens benefit too: |
| // gzip's header overhead is ~30 bytes, well under the bytes saved on a 4 KB |
| // sparse representation. |
| func readSparseFile(path string) (*sparseFile, error) { |
| f, err := os.Open(path) |
| if err != nil { |
| return nil, err |
| } |
| defer f.Close() |
| zr, err := gzip.NewReader(f) |
| if err != nil { |
| return nil, err |
| } |
| defer zr.Close() |
| return readSparse(zr) |
| } |
| |
// isAllZero reports whether b contains no non-zero byte. It returns true
// for an empty or nil slice.
func isAllZero(b []byte) bool {
	for i := 0; i < len(b); i++ {
		if b[i] != 0 {
			return false
		}
	}
	return true
}
| |
| // sparseBuffer accumulates writes into a [sparseFile], dropping any |
| // chunkSize-byte chunk that is all-zero. This makes capturing the result |
| // of pushing multi-GiB streams of zeros through the writer almost free — |
| // the only bytes that end up retained are the LFHs, the Central |
| // Directory, the EOCD records, and any non-zero compressed body. |
| type sparseBuffer struct { |
| f sparseFile |
| cur *sparseSpan |
| } |
| |
| func (t *sparseBuffer) Write(p []byte) (int, error) { |
| n := len(p) |
| for len(p) > 0 { |
| k := len(p) |
| if k > sparseChunk { |
| k = sparseChunk |
| } |
| chunk := p[:k] |
| if isAllZero(chunk) { |
| t.cur = nil |
| } else { |
| if t.cur == nil { |
| t.f.Spans = append(t.f.Spans, sparseSpan{Offset: t.f.Size}) |
| t.cur = &t.f.Spans[len(t.f.Spans)-1] |
| } |
| t.cur.Data = append(t.cur.Data, chunk...) |
| } |
| t.f.Size += int64(k) |
| p = p[k:] |
| } |
| return n, nil |
| } |