| // Copyright 2017 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package maintner |
| |
| import ( |
| "bufio" |
| "bytes" |
| "context" |
| "encoding/hex" |
| "errors" |
| "fmt" |
| "log" |
| "os/exec" |
| "strconv" |
| "strings" |
| "time" |
| |
| "golang.org/x/build/maintner/maintpb" |
| ) |
| |
// GitHash is a git commit hash held as raw bytes rather than as hex
// text. Hashes are currently always 20 bytes long (SHA-1 refs),
// though that may change in the future.
type GitHash string

// String returns h encoded as lowercase hexadecimal.
func (h GitHash) String() string {
	return fmt.Sprintf("%x", []byte(h))
}
| |
| // requires c.mu be held for writing |
| func (c *Corpus) gitHashFromHexStr(s string) GitHash { |
| if len(s) != 40 { |
| panic(fmt.Sprintf("bogus git hash %q", s)) |
| } |
| var buf [40]byte |
| copy(buf[:], s) |
| _, err := hex.Decode(buf[:20], buf[:]) // aliasing is safe |
| if err != nil { |
| panic(fmt.Sprintf("bogus git hash %q: %v", s, err)) |
| } |
| return GitHash(c.strb(buf[:20])) |
| } |
| |
| // requires c.mu be held for writing |
| func (c *Corpus) gitHashFromHex(s []byte) GitHash { |
| if len(s) != 40 { |
| panic(fmt.Sprintf("bogus git hash %q", s)) |
| } |
| var buf [20]byte |
| _, err := hex.Decode(buf[:], s) |
| if err != nil { |
| panic(fmt.Sprintf("bogus git hash %q: %v", s, err)) |
| } |
| return GitHash(c.strb(buf[:20])) |
| } |
| |
// GitCommit represents a single commit in a git repository.
//
// Values are built by processGitCommit from the raw commit object and
// its diff-tree data.
type GitCommit struct {
	// Hash is the hash of the commit itself.
	Hash GitHash

	// Tree is the hash from the commit's "tree" header line.
	Tree GitHash

	// Parents holds the hashes from the commit's "parent" header
	// lines, in the order they appear.
	Parents []GitHash

	// Author and AuthorTime come from the "author" header line.
	Author *GitPerson

	AuthorTime time.Time

	// Committer and CommitTime come from the "committer" header line.
	Committer *GitPerson

	CommitTime time.Time

	// Msg is the commit message subject and body: everything after
	// the first blank line of the raw commit.
	Msg string

	// Files is the per-file change list taken from the commit's
	// diff-tree data, if any was recorded.
	Files []*maintpb.GitDiffTreeFile
}
| |
// GitPerson is a person in a git commit.
type GitPerson struct {
	Str string // "Foo Bar <foo@bar.com>"
}

// Email returns the GitPerson's email address only, without the name
// or angle brackets. It returns "" if p.Str contains no '<', or no
// '>' after the first '<'.
func (p *GitPerson) Email() string {
	lt := strings.IndexByte(p.Str, '<')
	if lt < 0 {
		return ""
	}
	// Look for the closing bracket only after the opening one, so a
	// stray '>' earlier in the string (for example in the display
	// name) does not hide the address.
	addr := p.Str[lt+1:]
	gt := strings.IndexByte(addr, '>')
	if gt < 0 {
		return ""
	}
	return addr[:gt]
}

// Name returns the part of p.Str before the first '<', with
// surrounding whitespace trimmed, or all of p.Str if it contains
// no '<'.
func (p *GitPerson) Name() string {
	i := strings.IndexByte(p.Str, '<')
	if i < 0 {
		return p.Str
	}
	return strings.TrimSpace(p.Str[:i])
}
| |
| // requires c.mu be held for writing. |
| func (c *Corpus) enqueueCommitLocked(h GitHash) { |
| if _, ok := c.gitCommit[h]; ok { |
| return |
| } |
| if c.gitCommitTodo == nil { |
| c.gitCommitTodo = map[GitHash]bool{} |
| } |
| c.gitCommitTodo[h] = true |
| } |
| |
| // syncGitCommits polls for git commits in a directory. |
| func (c *Corpus) syncGitCommits(ctx context.Context, conf polledGitCommits, loop bool) error { |
| cmd := exec.CommandContext(ctx, "git", "show-ref", "refs/remotes/origin/master") |
| cmd.Dir = conf.dir |
| out, err := cmd.Output() |
| if err != nil { |
| log.Fatal(err) |
| } |
| outs := strings.TrimSpace(string(out)) |
| if outs == "" { |
| return fmt.Errorf("no remote found for refs/remotes/origin/master") |
| } |
| ref := strings.Fields(outs)[0] |
| c.mu.Lock() |
| refHash := c.gitHashFromHexStr(ref) |
| c.enqueueCommitLocked(refHash) |
| c.mu.Unlock() |
| |
| idle := false |
| for { |
| hash := c.gitCommitToIndex() |
| if hash == "" { |
| if !loop { |
| return nil |
| } |
| if !idle { |
| log.Printf("All git commits index for %v; idle.", conf.repo) |
| idle = true |
| } |
| time.Sleep(5 * time.Second) |
| continue |
| } |
| if err := c.indexCommit(conf, hash); err != nil { |
| log.Printf("Error indexing %v: %v", hash, err) |
| select { |
| case <-ctx.Done(): |
| return ctx.Err() |
| // TODO: temporary vs permanent failure? reschedule? fail hard? |
| // For now just loop with a sleep. |
| case <-time.After(5 * time.Second): |
| } |
| } |
| } |
| } |
| |
| // returns nil if no work. |
| func (c *Corpus) gitCommitToIndex() GitHash { |
| c.mu.RLock() |
| defer c.mu.RUnlock() |
| for hash := range c.gitCommitTodo { |
| if _, ok := c.gitCommit[hash]; !ok { |
| return hash |
| } |
| log.Printf("Warning: git commit %v in todo map, but already known; ignoring", hash) |
| } |
| return "" |
| } |
| |
var (
	// nlnl is the blank line that separates a raw commit's header
	// from its message.
	nlnl = []byte("\n\n")

	// Prefixes of the commit header lines that processGitCommit
	// recognizes.
	treeSpace      = []byte("tree ")
	parentSpace    = []byte("parent ")
	authorSpace    = []byte("author ")
	committerSpace = []byte("committer ")
	gpgSigSpace    = []byte("gpgsig ")
	encodingSpace  = []byte("encoding ")
	golangHgSpace  = []byte("golang-hg ")

	// space is the prefix of header lines that begin with a space.
	space = []byte(" ")
)
| |
| func parseCommitFromGit(dir string, hash GitHash) (*maintpb.GitCommit, error) { |
| cmd := exec.Command("git", "cat-file", "commit", hash.String()) |
| cmd.Dir = dir |
| catFile, err := cmd.Output() |
| if err != nil { |
| return nil, fmt.Errorf("git cat-file -p %v: %v", hash, err) |
| } |
| cmd = exec.Command("git", "diff-tree", "--numstat", hash.String()) |
| cmd.Dir = dir |
| diffTreeOut, err := cmd.Output() |
| if err != nil { |
| return nil, fmt.Errorf("git diff-tree --numstat %v: %v", hash, err) |
| } |
| |
| diffTree := &maintpb.GitDiffTree{} |
| bs := bufio.NewScanner(bytes.NewReader(diffTreeOut)) |
| lineNum := 0 |
| for bs.Scan() { |
| line := strings.TrimSpace(bs.Text()) |
| lineNum++ |
| if lineNum == 1 && line == hash.String() { |
| continue |
| } |
| f := strings.Fields(line) |
| // A line is like: <added> WS+ <deleted> WS+ <filename> |
| // Where <added> or <deleted> can be '-' to mean binary. |
| // The filename could contain spaces. |
| // 49 8 maintner/maintner.go |
| // Or: |
| // 49 8 some/name with spaces.txt |
| if len(f) < 3 { |
| continue |
| } |
| binary := f[0] == "-" || f[1] == "-" |
| added, _ := strconv.ParseInt(f[0], 10, 64) |
| deleted, _ := strconv.ParseInt(f[1], 10, 64) |
| file := strings.TrimPrefix(line, f[0]) |
| file = strings.TrimSpace(file) |
| file = strings.TrimPrefix(file, f[1]) |
| file = strings.TrimSpace(file) |
| |
| diffTree.File = append(diffTree.File, &maintpb.GitDiffTreeFile{ |
| File: file, |
| Added: added, |
| Deleted: deleted, |
| Binary: binary, |
| }) |
| } |
| if err := bs.Err(); err != nil { |
| return nil, err |
| } |
| commit := &maintpb.GitCommit{ |
| Raw: catFile, |
| DiffTree: diffTree, |
| } |
| switch len(hash) { |
| case 20: |
| commit.Sha1 = hash.String() |
| default: |
| return nil, fmt.Errorf("unsupported git hash %q", hash.String()) |
| } |
| return commit, nil |
| } |
| |
| func (c *Corpus) indexCommit(conf polledGitCommits, hash GitHash) error { |
| if conf.repo == nil { |
| panic("bogus config; nil repo") |
| } |
| commit, err := parseCommitFromGit(conf.dir, hash) |
| if err != nil { |
| return err |
| } |
| m := &maintpb.Mutation{ |
| Git: &maintpb.GitMutation{ |
| Repo: conf.repo, |
| Commit: commit, |
| }, |
| } |
| c.addMutation(m) |
| return nil |
| } |
| |
| // c.mu is held for writing. |
| func (c *Corpus) processGitMutation(m *maintpb.GitMutation) { |
| commit := m.Commit |
| if commit == nil { |
| return |
| } |
| // TODO: care about m.Repo? |
| c.processGitCommit(commit) |
| } |
| |
| // c.mu is held for writing. |
| func (c *Corpus) processGitCommit(commit *maintpb.GitCommit) (*GitCommit, error) { |
| if len(commit.Sha1) != 40 { |
| return nil, fmt.Errorf("bogus git sha1 %q", commit.Sha1) |
| } |
| hash := c.gitHashFromHexStr(commit.Sha1) |
| |
| catFile := commit.Raw |
| i := bytes.Index(catFile, nlnl) |
| if i == 0 { |
| return nil, fmt.Errorf("commit %v lacks double newline", hash) |
| } |
| hdr, msg := catFile[:i], catFile[i+2:] |
| gc := &GitCommit{ |
| Hash: hash, |
| Parents: make([]GitHash, 0, bytes.Count(hdr, parentSpace)), |
| Msg: c.strb(msg), |
| } |
| if commit.DiffTree != nil { |
| gc.Files = commit.DiffTree.File |
| } |
| for _, f := range gc.Files { |
| f.File = c.str(f.File) // intern the string |
| } |
| parents := 0 |
| err := foreachLine(hdr, func(ln []byte) error { |
| if bytes.HasPrefix(ln, parentSpace) { |
| parents++ |
| parentHash := c.gitHashFromHex(ln[len(parentSpace):]) |
| gc.Parents = append(gc.Parents, parentHash) |
| c.enqueueCommitLocked(parentHash) |
| return nil |
| } |
| if bytes.HasPrefix(ln, authorSpace) { |
| p, t, err := c.parsePerson(ln[len(authorSpace):]) |
| if err != nil { |
| return fmt.Errorf("unrecognized author line %q: %v", ln, err) |
| } |
| gc.Author = p |
| gc.AuthorTime = t |
| return nil |
| } |
| if bytes.HasPrefix(ln, committerSpace) { |
| p, t, err := c.parsePerson(ln[len(committerSpace):]) |
| if err != nil { |
| return fmt.Errorf("unrecognized committer line %q: %v", ln, err) |
| } |
| gc.Committer = p |
| gc.CommitTime = t |
| return nil |
| } |
| if bytes.HasPrefix(ln, treeSpace) { |
| gc.Tree = c.gitHashFromHex(ln[len(treeSpace):]) |
| return nil |
| } |
| if bytes.HasPrefix(ln, golangHgSpace) { |
| if c.gitOfHg == nil { |
| c.gitOfHg = map[string]GitHash{} |
| } |
| c.gitOfHg[string(ln[len(golangHgSpace):])] = hash |
| return nil |
| } |
| if bytes.HasPrefix(ln, gpgSigSpace) || bytes.HasPrefix(ln, space) { |
| // Jessie Frazelle is a unique butterfly. |
| return nil |
| } |
| if bytes.HasPrefix(ln, encodingSpace) { |
| // Also ignore this. In practice this has only |
| // been seen to declare that a commit's |
| // metadata is utf-8 when the author name has |
| // non-ASCII. |
| return nil |
| } |
| log.Printf("in commit %s, unrecognized line %q", hash, ln) |
| return nil |
| }) |
| if err != nil { |
| log.Printf("Unparseable commit %q: %v", hash, err) |
| return nil, fmt.Errorf("Unparseable commit %q: %v", hash, err) |
| } |
| if c.gitCommit == nil { |
| c.gitCommit = map[GitHash]*GitCommit{} |
| } |
| c.gitCommit[hash] = gc |
| if c.gitCommitTodo != nil { |
| delete(c.gitCommitTodo, hash) |
| } |
| if c.verbose { |
| now := time.Now() |
| if now.After(c.lastGitCount.Add(time.Second)) { |
| c.lastGitCount = now |
| log.Printf("Num git commits = %v", len(c.gitCommit)) |
| } |
| } |
| return gc, nil |
| } |
| |
// foreachLine calls f on each '\n'-separated line of v, with the
// trailing '\n' removed. Empty lines are passed to f as well. The
// final line need not end in '\n'. Iteration stops at the first
// non-nil error returned by f, and that error is returned.
func foreachLine(v []byte, f func([]byte) error) error {
	rest := v
	for len(rest) != 0 {
		line := rest
		if nl := bytes.IndexByte(rest, '\n'); nl >= 0 {
			line, rest = rest[:nl], rest[nl+1:]
		} else {
			// Unterminated final line: it is the last one.
			rest = nil
		}
		if err := f(line); err != nil {
			return err
		}
	}
	return nil
}
| |
| // parsePerson parses an "author" or "committer" value from "git cat-file -p COMMIT" |
| // The values are like: |
| // Foo Bar <foobar@gmail.com> 1488624439 +0900 |
| // c.mu must be held for writing. |
| func (c *Corpus) parsePerson(v []byte) (*GitPerson, time.Time, error) { |
| v = bytes.TrimSpace(v) |
| |
| lastSpace := bytes.LastIndexByte(v, ' ') |
| if lastSpace < 0 { |
| return nil, time.Time{}, errors.New("failed to match person") |
| } |
| tz := v[lastSpace+1:] // "+0800" |
| v = v[:lastSpace] // now v is "Foo Bar <foobar@gmail.com> 1488624439" |
| |
| lastSpace = bytes.LastIndexByte(v, ' ') |
| if lastSpace < 0 { |
| return nil, time.Time{}, errors.New("failed to match person") |
| } |
| unixTime := v[lastSpace+1:] |
| nameEmail := v[:lastSpace] // now v is "Foo Bar <foobar@gmail.com>" |
| |
| ut, err := strconv.ParseInt(string(unixTime), 10, 64) |
| if err != nil { |
| return nil, time.Time{}, err |
| } |
| t := time.Unix(ut, 0).In(c.gitLocation(tz)) |
| |
| p, ok := c.gitPeople[string(nameEmail)] |
| if !ok { |
| p = &GitPerson{Str: string(nameEmail)} |
| if c.gitPeople == nil { |
| c.gitPeople = map[string]*GitPerson{} |
| } |
| c.gitPeople[p.Str] = p |
| } |
| return p, t, nil |
| |
| } |
| |
| // v is like '[+-]hhmm' |
| // c.mu must be held for writing. |
| func (c *Corpus) gitLocation(v []byte) *time.Location { |
| if loc, ok := c.zoneCache[string(v)]; ok { |
| return loc |
| } |
| s := string(v) |
| h, _ := strconv.Atoi(s[1:3]) |
| m, _ := strconv.Atoi(s[3:5]) |
| east := 1 |
| if v[0] == '-' { |
| east = -1 |
| } |
| loc := time.FixedZone(s, east*(h*3600+m*60)) |
| if c.zoneCache == nil { |
| c.zoneCache = map[string]*time.Location{} |
| } |
| c.zoneCache[s] = loc |
| return loc |
| } |