| // Copyright 2017 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package cache implements a build artifact cache. |
| package cache |
| |
| import ( |
| "bytes" |
| "crypto/sha256" |
| "encoding/hex" |
| "errors" |
| "fmt" |
| "io" |
| "io/ioutil" |
| "os" |
| "path/filepath" |
| "strconv" |
| "strings" |
| "time" |
| ) |
| |
| // An ActionID is a cache action key, the hash of a complete description of a |
| // repeatable computation (command line, environment variables, |
| // input file contents, executable contents). |
| type ActionID [HashSize]byte |
| |
| // An OutputID is a cache output key, the hash of an output of a computation. |
| type OutputID [HashSize]byte |
| |
| // A Cache is a package cache, backed by a file system directory tree. |
| type Cache struct { |
| dir string |
| log *os.File |
| now func() time.Time |
| } |
| |
| // Open opens and returns the cache in the given directory. |
| // |
| // It is safe for multiple processes on a single machine to use the |
| // same cache directory in a local file system simultaneously. |
| // They will coordinate using operating system file locks and may |
| // duplicate effort but will not corrupt the cache. |
| // |
| // However, it is NOT safe for multiple processes on different machines |
| // to share a cache directory (for example, if the directory were stored |
| // in a network file system). File locking is notoriously unreliable in |
| // network file systems and may not suffice to protect the cache. |
| // |
| func Open(dir string) (*Cache, error) { |
| info, err := os.Stat(dir) |
| if err != nil { |
| return nil, err |
| } |
| if !info.IsDir() { |
| return nil, &os.PathError{Op: "open", Path: dir, Err: fmt.Errorf("not a directory")} |
| } |
| for i := 0; i < 256; i++ { |
| name := filepath.Join(dir, fmt.Sprintf("%02x", i)) |
| if err := os.MkdirAll(name, 0777); err != nil { |
| return nil, err |
| } |
| } |
| f, err := os.OpenFile(filepath.Join(dir, "log.txt"), os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666) |
| if err != nil { |
| return nil, err |
| } |
| c := &Cache{ |
| dir: dir, |
| log: f, |
| now: time.Now, |
| } |
| return c, nil |
| } |
| |
| // fileName returns the name of the file corresponding to the given id. |
| func (c *Cache) fileName(id [HashSize]byte, key string) string { |
| return filepath.Join(c.dir, fmt.Sprintf("%02x", id[0]), fmt.Sprintf("%x", id)+"-"+key) |
| } |
| |
| var errMissing = errors.New("cache entry not found") |
| |
| const ( |
| // action entry file is "v1 <hex id> <hex out> <decimal size space-padded to 20 bytes> <unixnano space-padded to 20 bytes>\n" |
| hexSize = HashSize * 2 |
| entrySize = 2 + 1 + hexSize + 1 + hexSize + 1 + 20 + 1 + 20 + 1 |
| ) |
| |
| // verify controls whether to run the cache in verify mode. |
| // In verify mode, the cache always returns errMissing from Get |
| // but then double-checks in Put that the data being written |
| // exactly matches any existing entry. This provides an easy |
| // way to detect program behavior that would have been different |
| // had the cache entry been returned from Get. |
| // |
| // verify is enabled by setting the environment variable |
| // GODEBUG=gocacheverify=1. |
| var verify = false |
| |
| // DebugTest is set when GODEBUG=gocachetest=1 is in the environment. |
| var DebugTest = false |
| |
| func init() { initEnv() } |
| |
| func initEnv() { |
| verify = false |
| debugHash = false |
| debug := strings.Split(os.Getenv("GODEBUG"), ",") |
| for _, f := range debug { |
| if f == "gocacheverify=1" { |
| verify = true |
| } |
| if f == "gocachehash=1" { |
| debugHash = true |
| } |
| if f == "gocachetest=1" { |
| DebugTest = true |
| } |
| } |
| } |
| |
| // Get looks up the action ID in the cache, |
| // returning the corresponding output ID and file size, if any. |
| // Note that finding an output ID does not guarantee that the |
| // saved file for that output ID is still available. |
| func (c *Cache) Get(id ActionID) (Entry, error) { |
| if verify { |
| return Entry{}, errMissing |
| } |
| return c.get(id) |
| } |
| |
| type Entry struct { |
| OutputID OutputID |
| Size int64 |
| Time time.Time |
| } |
| |
| // get is Get but does not respect verify mode, so that Put can use it. |
| func (c *Cache) get(id ActionID) (Entry, error) { |
| missing := func() (Entry, error) { |
| fmt.Fprintf(c.log, "%d miss %x\n", c.now().Unix(), id) |
| return Entry{}, errMissing |
| } |
| f, err := os.Open(c.fileName(id, "a")) |
| if err != nil { |
| return missing() |
| } |
| defer f.Close() |
| entry := make([]byte, entrySize+1) // +1 to detect whether f is too long |
| if n, err := io.ReadFull(f, entry); n != entrySize || err != io.ErrUnexpectedEOF { |
| return missing() |
| } |
| if entry[0] != 'v' || entry[1] != '1' || entry[2] != ' ' || entry[3+hexSize] != ' ' || entry[3+hexSize+1+hexSize] != ' ' || entry[3+hexSize+1+hexSize+1+20] != ' ' || entry[entrySize-1] != '\n' { |
| return missing() |
| } |
| eid, entry := entry[3:3+hexSize], entry[3+hexSize:] |
| eout, entry := entry[1:1+hexSize], entry[1+hexSize:] |
| esize, entry := entry[1:1+20], entry[1+20:] |
| etime, entry := entry[1:1+20], entry[1+20:] |
| var buf [HashSize]byte |
| if _, err := hex.Decode(buf[:], eid); err != nil || buf != id { |
| return missing() |
| } |
| if _, err := hex.Decode(buf[:], eout); err != nil { |
| return missing() |
| } |
| i := 0 |
| for i < len(esize) && esize[i] == ' ' { |
| i++ |
| } |
| size, err := strconv.ParseInt(string(esize[i:]), 10, 64) |
| if err != nil || size < 0 { |
| return missing() |
| } |
| i = 0 |
| for i < len(etime) && etime[i] == ' ' { |
| i++ |
| } |
| tm, err := strconv.ParseInt(string(etime[i:]), 10, 64) |
| if err != nil || size < 0 { |
| return missing() |
| } |
| |
| fmt.Fprintf(c.log, "%d get %x\n", c.now().Unix(), id) |
| |
| c.used(c.fileName(id, "a")) |
| |
| return Entry{buf, size, time.Unix(0, tm)}, nil |
| } |
| |
| // GetBytes looks up the action ID in the cache and returns |
| // the corresponding output bytes. |
| // GetBytes should only be used for data that can be expected to fit in memory. |
| func (c *Cache) GetBytes(id ActionID) ([]byte, Entry, error) { |
| entry, err := c.Get(id) |
| if err != nil { |
| return nil, entry, err |
| } |
| data, _ := ioutil.ReadFile(c.OutputFile(entry.OutputID)) |
| if sha256.Sum256(data) != entry.OutputID { |
| return nil, entry, errMissing |
| } |
| return data, entry, nil |
| } |
| |
| // OutputFile returns the name of the cache file storing output with the given OutputID. |
| func (c *Cache) OutputFile(out OutputID) string { |
| file := c.fileName(out, "d") |
| c.used(file) |
| return file |
| } |
| |
| // Time constants for cache expiration. |
| // |
| // We set the mtime on a cache file on each use, but at most one per mtimeInterval (1 hour), |
| // to avoid causing many unnecessary inode updates. The mtimes therefore |
| // roughly reflect "time of last use" but may in fact be older by at most an hour. |
| // |
| // We scan the cache for entries to delete at most once per trimInterval (1 day). |
| // |
| // When we do scan the cache, we delete entries that have not been used for |
| // at least trimLimit (5 days). Statistics gathered from a month of usage by |
| // Go developers found that essentially all reuse of cached entries happened |
| // within 5 days of the previous reuse. See golang.org/issue/22990. |
| const ( |
| mtimeInterval = 1 * time.Hour |
| trimInterval = 24 * time.Hour |
| trimLimit = 5 * 24 * time.Hour |
| ) |
| |
| // used makes a best-effort attempt to update mtime on file, |
| // so that mtime reflects cache access time. |
| // |
| // Because the reflection only needs to be approximate, |
| // and to reduce the amount of disk activity caused by using |
| // cache entries, used only updates the mtime if the current |
| // mtime is more than an hour old. This heuristic eliminates |
| // nearly all of the mtime updates that would otherwise happen, |
| // while still keeping the mtimes useful for cache trimming. |
| func (c *Cache) used(file string) { |
| info, err := os.Stat(file) |
| if err == nil && c.now().Sub(info.ModTime()) < mtimeInterval { |
| return |
| } |
| os.Chtimes(file, c.now(), c.now()) |
| } |
| |
| // Trim removes old cache entries that are likely not to be reused. |
| func (c *Cache) Trim() { |
| now := c.now() |
| |
| // We maintain in dir/trim.txt the time of the last completed cache trim. |
| // If the cache has been trimmed recently enough, do nothing. |
| // This is the common case. |
| data, _ := ioutil.ReadFile(filepath.Join(c.dir, "trim.txt")) |
| t, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) |
| if err == nil && now.Sub(time.Unix(t, 0)) < trimInterval { |
| return |
| } |
| |
| // Trim each of the 256 subdirectories. |
| // We subtract an additional mtimeInterval |
| // to account for the imprecision of our "last used" mtimes. |
| cutoff := now.Add(-trimLimit - mtimeInterval) |
| for i := 0; i < 256; i++ { |
| subdir := filepath.Join(c.dir, fmt.Sprintf("%02x", i)) |
| c.trimSubdir(subdir, cutoff) |
| } |
| |
| ioutil.WriteFile(filepath.Join(c.dir, "trim.txt"), []byte(fmt.Sprintf("%d", now.Unix())), 0666) |
| } |
| |
| // trimSubdir trims a single cache subdirectory. |
| func (c *Cache) trimSubdir(subdir string, cutoff time.Time) { |
| // Read all directory entries from subdir before removing |
| // any files, in case removing files invalidates the file offset |
| // in the directory scan. Also, ignore error from f.Readdirnames, |
| // because we don't care about reporting the error and we still |
| // want to process any entries found before the error. |
| f, err := os.Open(subdir) |
| if err != nil { |
| return |
| } |
| names, _ := f.Readdirnames(-1) |
| f.Close() |
| |
| for _, name := range names { |
| // Remove only cache entries (xxxx-a and xxxx-d). |
| if !strings.HasSuffix(name, "-a") && !strings.HasSuffix(name, "-d") { |
| continue |
| } |
| entry := filepath.Join(subdir, name) |
| info, err := os.Stat(entry) |
| if err == nil && info.ModTime().Before(cutoff) { |
| os.Remove(entry) |
| } |
| } |
| } |
| |
| // putIndexEntry adds an entry to the cache recording that executing the action |
| // with the given id produces an output with the given output id (hash) and size. |
| func (c *Cache) putIndexEntry(id ActionID, out OutputID, size int64, allowVerify bool) error { |
| // Note: We expect that for one reason or another it may happen |
| // that repeating an action produces a different output hash |
| // (for example, if the output contains a time stamp or temp dir name). |
| // While not ideal, this is also not a correctness problem, so we |
| // don't make a big deal about it. In particular, we leave the action |
| // cache entries writable specifically so that they can be overwritten. |
| // |
| // Setting GODEBUG=gocacheverify=1 does make a big deal: |
| // in verify mode we are double-checking that the cache entries |
| // are entirely reproducible. As just noted, this may be unrealistic |
| // in some cases but the check is also useful for shaking out real bugs. |
| entry := []byte(fmt.Sprintf("v1 %x %x %20d %20d\n", id, out, size, time.Now().UnixNano())) |
| if verify && allowVerify { |
| old, err := c.get(id) |
| if err == nil && (old.OutputID != out || old.Size != size) { |
| // panic to show stack trace, so we can see what code is generating this cache entry. |
| msg := fmt.Sprintf("go: internal cache error: cache verify failed: id=%x changed:<<<\n%s\n>>>\nold: %x %d\nnew: %x %d", id, reverseHash(id), out, size, old.OutputID, old.Size) |
| panic(msg) |
| } |
| } |
| file := c.fileName(id, "a") |
| if err := ioutil.WriteFile(file, entry, 0666); err != nil { |
| os.Remove(file) |
| return err |
| } |
| os.Chtimes(file, c.now(), c.now()) // mainly for tests |
| |
| fmt.Fprintf(c.log, "%d put %x %x %d\n", c.now().Unix(), id, out, size) |
| return nil |
| } |
| |
| // Put stores the given output in the cache as the output for the action ID. |
| // It may read file twice. The content of file must not change between the two passes. |
| func (c *Cache) Put(id ActionID, file io.ReadSeeker) (OutputID, int64, error) { |
| return c.put(id, file, true) |
| } |
| |
| // PutNoVerify is like Put but disables the verify check |
| // when GODEBUG=goverifycache=1 is set. |
| // It is meant for data that is OK to cache but that we expect to vary slightly from run to run, |
| // like test output containing times and the like. |
| func (c *Cache) PutNoVerify(id ActionID, file io.ReadSeeker) (OutputID, int64, error) { |
| return c.put(id, file, false) |
| } |
| |
| func (c *Cache) put(id ActionID, file io.ReadSeeker, allowVerify bool) (OutputID, int64, error) { |
| // Compute output ID. |
| h := sha256.New() |
| if _, err := file.Seek(0, 0); err != nil { |
| return OutputID{}, 0, err |
| } |
| size, err := io.Copy(h, file) |
| if err != nil { |
| return OutputID{}, 0, err |
| } |
| var out OutputID |
| h.Sum(out[:0]) |
| |
| // Copy to cached output file (if not already present). |
| if err := c.copyFile(file, out, size); err != nil { |
| return out, size, err |
| } |
| |
| // Add to cache index. |
| return out, size, c.putIndexEntry(id, out, size, allowVerify) |
| } |
| |
| // PutBytes stores the given bytes in the cache as the output for the action ID. |
| func (c *Cache) PutBytes(id ActionID, data []byte) error { |
| _, _, err := c.Put(id, bytes.NewReader(data)) |
| return err |
| } |
| |
| // copyFile copies file into the cache, expecting it to have the given |
| // output ID and size, if that file is not present already. |
| func (c *Cache) copyFile(file io.ReadSeeker, out OutputID, size int64) error { |
| name := c.fileName(out, "d") |
| info, err := os.Stat(name) |
| if err == nil && info.Size() == size { |
| // Check hash. |
| if f, err := os.Open(name); err == nil { |
| h := sha256.New() |
| io.Copy(h, f) |
| f.Close() |
| var out2 OutputID |
| h.Sum(out2[:0]) |
| if out == out2 { |
| return nil |
| } |
| } |
| // Hash did not match. Fall through and rewrite file. |
| } |
| |
| // Copy file to cache directory. |
| mode := os.O_RDWR | os.O_CREATE |
| if err == nil && info.Size() > size { // shouldn't happen but fix in case |
| mode |= os.O_TRUNC |
| } |
| f, err := os.OpenFile(name, mode, 0666) |
| if err != nil { |
| return err |
| } |
| defer f.Close() |
| if size == 0 { |
| // File now exists with correct size. |
| // Only one possible zero-length file, so contents are OK too. |
| // Early return here makes sure there's a "last byte" for code below. |
| return nil |
| } |
| |
| // From here on, if any of the I/O writing the file fails, |
| // we make a best-effort attempt to truncate the file f |
| // before returning, to avoid leaving bad bytes in the file. |
| |
| // Copy file to f, but also into h to double-check hash. |
| if _, err := file.Seek(0, 0); err != nil { |
| f.Truncate(0) |
| return err |
| } |
| h := sha256.New() |
| w := io.MultiWriter(f, h) |
| if _, err := io.CopyN(w, file, size-1); err != nil { |
| f.Truncate(0) |
| return err |
| } |
| // Check last byte before writing it; writing it will make the size match |
| // what other processes expect to find and might cause them to start |
| // using the file. |
| buf := make([]byte, 1) |
| if _, err := file.Read(buf); err != nil { |
| f.Truncate(0) |
| return err |
| } |
| h.Write(buf) |
| sum := h.Sum(nil) |
| if !bytes.Equal(sum, out[:]) { |
| f.Truncate(0) |
| return fmt.Errorf("file content changed underfoot") |
| } |
| |
| // Commit cache file entry. |
| if _, err := f.Write(buf); err != nil { |
| f.Truncate(0) |
| return err |
| } |
| if err := f.Close(); err != nil { |
| // Data might not have been written, |
| // but file may look like it is the right size. |
| // To be extra careful, remove cached file. |
| os.Remove(name) |
| return err |
| } |
| os.Chtimes(name, c.now(), c.now()) // mainly for tests |
| |
| return nil |
| } |