| // Copyright 2021 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package gitfs |
| |
| import ( |
| "bytes" |
| "compress/zlib" |
| "crypto/sha1" |
| "encoding/binary" |
| "fmt" |
| "io/ioutil" |
| ) |
| |
| // unpack parses data, which is a Git pack-formatted archive, |
| // writing every object it contains to the store s. |
| // |
| // See https://git-scm.com/docs/pack-format for format documentation. |
| func unpack(s *store, data []byte) error { |
| // If the store is empty, pre-allocate the length of data. |
| // This should be about the right order of magnitude for the eventual data, |
| // avoiding many growing steps during append. |
| if len(s.data) == 0 { |
| s.data = make([]byte, 0, len(data)) |
| } |
| |
| // Pack data starts with 12-byte header: "PACK" version[4] nobj[4]. |
| if len(data) < 12+20 { |
| return fmt.Errorf("malformed git pack: too short") |
| } |
| hdr := data[:12] |
| vers := binary.BigEndian.Uint32(hdr[4:8]) |
| nobj := binary.BigEndian.Uint32(hdr[8:12]) |
| if string(hdr[:4]) != "PACK" || vers != 2 && vers != 3 || len(data) < 12+20 || int64(nobj) >= int64(len(data)) { |
| return fmt.Errorf("malformed git pack") |
| } |
| if vers == 3 { |
| return fmt.Errorf("cannot read git pack v3") |
| } |
| |
| // Pack data ends with SHA1 of the entire pack. |
| sum := sha1.Sum(data[:len(data)-20]) |
| if !bytes.Equal(sum[:], data[len(data)-20:]) { |
| return fmt.Errorf("malformed git pack: bad checksum") |
| } |
| |
| // Object data is everything between hdr and ending SHA1. |
| // Unpack every object into the store. |
| objs := data[12 : len(data)-20] |
| off := 0 |
| for i := 0; i < int(nobj); i++ { |
| _, _, _, encSize, err := unpackObject(s, objs, off) |
| if err != nil { |
| return fmt.Errorf("unpack: malformed git pack: %v", err) |
| } |
| off += encSize |
| } |
| if off != len(objs) { |
| return fmt.Errorf("malformed git pack: junk after objects") |
| } |
| return nil |
| } |
| |
| // unpackObject unpacks the object at objs[off:] and writes it to the store s. |
| // It returns the type, hash, and content of the object, as well as the encoded size, |
| // meaning the number of bytes at the start of objs[off:] that this record occupies. |
| func unpackObject(s *store, objs []byte, off int) (typ objType, h Hash, content []byte, encSize int, err error) { |
| fail := func(err error) (objType, Hash, []byte, int, error) { |
| return 0, Hash{}, nil, 0, err |
| } |
| if off < 0 || off >= len(objs) { |
| return fail(fmt.Errorf("invalid object offset")) |
| } |
| |
| // Object starts with varint-encoded type and length n. |
| // (The length n is the length of the compressed data that follows, |
| // not the length of the actual object.) |
| u, size := binary.Uvarint(objs[off:]) |
| if size <= 0 { |
| return fail(fmt.Errorf("invalid object: bad varint header")) |
| } |
| typ = objType((u >> 4) & 7) |
| n := int(u&15 | u>>7<<4) |
| |
| // Git often stores objects that differ very little (different revs of a file). |
| // It can save space by encoding one as "start with this other object and apply these diffs". |
| // There are two ways to specify "this other object": an object ref (20-byte SHA1) |
| // or as a relative offset to an earlier position in the objs slice. |
| // For either of these, we need to fetch the other object's type and data (deltaTyp and deltaBase). |
| // The Git docs call this the "deltified representation". |
| var deltaTyp objType |
| var deltaBase []byte |
| switch typ { |
| case objRefDelta: |
| if len(objs)-(off+size) < 20 { |
| return fail(fmt.Errorf("invalid object: bad delta ref")) |
| } |
| // Base block identified by SHA1 of an already unpacked hash. |
| var h Hash |
| copy(h[:], objs[off+size:]) |
| size += 20 |
| deltaTyp, deltaBase = s.object(h) |
| if deltaTyp == 0 { |
| return fail(fmt.Errorf("invalid object: unknown delta ref %v", h)) |
| } |
| |
| case objOfsDelta: |
| i := off + size |
| if len(objs)-i < 20 { |
| return fail(fmt.Errorf("invalid object: too short")) |
| } |
| // Base block identified by relative offset to earlier position in objs, |
| // using a varint-like but not-quite-varint encoding. |
| // Look for "offset encoding:" in https://git-scm.com/docs/pack-format. |
| d := int64(objs[i] & 0x7f) |
| for objs[i]&0x80 != 0 { |
| i++ |
| if i-(off+size) > 10 { |
| return fail(fmt.Errorf("invalid object: malformed delta offset")) |
| } |
| d = d<<7 | int64(objs[i]&0x7f) |
| d += 1 << 7 |
| } |
| i++ |
| size = i - off |
| |
| // Re-unpack the object at the earlier offset to find its type and content. |
| if d == 0 || d > int64(off) { |
| return fail(fmt.Errorf("invalid object: bad delta offset")) |
| } |
| var err error |
| deltaTyp, _, deltaBase, _, err = unpackObject(s, objs, off-int(d)) |
| if err != nil { |
| return fail(fmt.Errorf("invalid object: bad delta offset")) |
| } |
| } |
| |
| // The main encoded data is a zlib-compressed stream. |
| br := bytes.NewReader(objs[off+size:]) |
| zr, err := zlib.NewReader(br) |
| if err != nil { |
| return fail(fmt.Errorf("invalid object deflate: %v", err)) |
| } |
| data, err := ioutil.ReadAll(zr) |
| if err != nil { |
| return fail(fmt.Errorf("invalid object: bad deflate: %v", err)) |
| } |
| if len(data) != n { |
| return fail(fmt.Errorf("invalid object: deflate size %d != %d", len(data), n)) |
| } |
| encSize = len(objs[off:]) - br.Len() |
| |
| // If we fetched a base object above, the stream is an encoded delta. |
| // Otherwise it is the raw data. |
| switch typ { |
| default: |
| return fail(fmt.Errorf("invalid object: unknown object type")) |
| case objCommit, objTree, objBlob, objTag: |
| // ok |
| case objRefDelta, objOfsDelta: |
| // Actual object type is the type of the base object. |
| typ = deltaTyp |
| |
| // Delta encoding starts with size of base object and size of new object. |
| baseSize, s := binary.Uvarint(data) |
| data = data[s:] |
| if baseSize != uint64(len(deltaBase)) { |
| return fail(fmt.Errorf("invalid object: mismatched delta src size")) |
| } |
| targSize, s := binary.Uvarint(data) |
| data = data[s:] |
| |
| // Apply delta to base object, producing new object. |
| targ := make([]byte, targSize) |
| if err := applyDelta(targ, deltaBase, data); err != nil { |
| return fail(fmt.Errorf("invalid object: %v", err)) |
| } |
| data = targ |
| } |
| |
| h, data = s.add(typ, data) |
| return typ, h, data, encSize, nil |
| } |
| |
| // applyDelta applies the delta encoding to src, producing dst, |
| // which has already been allocated to the expected final size. |
| // See https://git-scm.com/docs/pack-format#_deltified_representation for docs. |
| func applyDelta(dst, src, delta []byte) error { |
| for len(delta) > 0 { |
| // Command byte says what comes next. |
| cmd := delta[0] |
| delta = delta[1:] |
| switch { |
| case cmd == 0: |
| // cmd == 0 is reserved. |
| return fmt.Errorf("invalid delta cmd") |
| |
| case cmd&0x80 != 0: |
| // Copy from base object, 4-byte offset, 3-byte size. |
| // But any zero byte in the offset or size can be omitted. |
| // The bottom 7 bits of cmd say which offset/size bytes are present. |
| var off, size int64 |
| for i := uint(0); i < 4; i++ { |
| if cmd&(1<<i) != 0 { |
| off |= int64(delta[0]) << (8 * i) |
| delta = delta[1:] |
| } |
| } |
| for i := uint(0); i < 3; i++ { |
| if cmd&(0x10<<i) != 0 { |
| size |= int64(delta[0]) << (8 * i) |
| delta = delta[1:] |
| } |
| } |
| // Size 0 means size 0x10000 for some reason. (!) |
| if size == 0 { |
| size = 0x10000 |
| } |
| copy(dst[:size], src[off:off+size]) |
| dst = dst[size:] |
| |
| default: |
| // Up to 0x7F bytes of literal data, length in bottom 7 bits of cmd. |
| n := int(cmd) |
| copy(dst[:n], delta[:n]) |
| dst = dst[n:] |
| delta = delta[n:] |
| } |
| } |
| if len(dst) != 0 { |
| return fmt.Errorf("delta encoding too short") |
| } |
| return nil |
| } |