gosrc: Remove noise packages.

A package will be removed if either it has no commits in two years and
no imports from other packages, or it is created for a quick bug fix,
which means it has one or two commits within a week of creation time and
no other activity since then.

This CL detects such packages on GitHub and Bitbucket, using their
APIs to gather commit information.

This CL also removes the reference checks for GitHub repos; instead,
it checks the most recent commit on the default branch.

This fixes #405

Change-Id: I14b0f0133f31851511aaa63eee8acbfba63e13d2
Reviewed-on: https://go-review.googlesource.com/24513
Reviewed-by: Alan Donovan <adonovan@google.com>
diff --git a/database/database.go b/database/database.go
index d545184..d0c827d 100644
--- a/database/database.go
+++ b/database/database.go
@@ -601,6 +601,21 @@
 func (db *Database) Delete(path string) error {
 	c := db.Pool.Get()
 	defer c.Close()
+
+	if GAESearch { // remove the package's search-index document before the Redis delete
+		ctx := bgCtx()
+		id, err := redis.String(c.Do("HGET", "ids", path))
+		if err == redis.ErrNil { // no id stored for path: return before deleteScript runs
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+		if err := deleteIndex(ctx, id); err != nil { // index failure aborts the delete
+			return err
+		}
+	}
+
 	_, err := deleteScript.Do(c, path)
 	return err
 }
diff --git a/database/indexae.go b/database/indexae.go
index 92f6fc2..4259025 100644
--- a/database/indexae.go
+++ b/database/indexae.go
@@ -172,6 +172,14 @@
 		unicode.IsSymbol(r)
 }
 
+func deleteIndex(c context.Context, id string) error { // drops one document by id
+	idx, err := search.Open("packages") // the index PurgeIndex also opens
+	if err != nil {
+		return err
+	}
+	return idx.Delete(c, id)
+}
+
 // PurgeIndex deletes all the packages from the search index.
 func PurgeIndex(c context.Context) error {
 	idx, err := search.Open("packages")
diff --git a/database/indexae_test.go b/database/indexae_test.go
index 6c5ee0e..42530d7 100644
--- a/database/indexae_test.go
+++ b/database/indexae_test.go
@@ -77,6 +77,7 @@
 		ImportCount: 1,
 		Fork:        true,
 		Stars:       10,
+		Score:       0.99,
 	}
 	if got != wanted {
 		t.Errorf("PutIndex got %v, want %v", got, wanted)
diff --git a/gddo-server/crawl.go b/gddo-server/crawl.go
index fb23fa7..6fd960d 100644
--- a/gddo-server/crawl.go
+++ b/gddo-server/crawl.go
@@ -61,7 +61,7 @@
 			}
 			pdoc = nil
 			err = gosrc.NotFoundError{Message: "no Go files or subdirs"}
-		} else if err != gosrc.ErrNotModified {
+		} else if _, ok := err.(gosrc.NotModifiedError); !ok {
 			pdoc = pdocNew
 		}
 	}
@@ -75,27 +75,54 @@
 		nextCrawl = start.Add(*maxAge * 30)
 	}
 
-	switch {
-	case err == nil:
+	if err == nil {
 		message = append(message, "put:", pdoc.Etag)
 		if err := db.Put(pdoc, nextCrawl, false); err != nil {
 			log.Printf("ERROR db.Put(%q): %v", importPath, err)
 		}
 		return pdoc, nil
-	case err == gosrc.ErrNotModified:
+	} else if e, ok := err.(gosrc.NotModifiedError); ok {
+		if !pdoc.IsCmd && isInactivePkg(importPath, e.Since) {
+			message = append(message, "delete inactive")
+			if err := db.Delete(importPath); err != nil {
+				log.Printf("ERROR db.Delete(%q): %v", importPath, err)
+			}
+			return nil, e
+		}
+		// Touch the package without updating and move on to next one.
 		message = append(message, "touch")
 		if err := db.SetNextCrawlEtag(pdoc.ProjectRoot, pdoc.Etag, nextCrawl); err != nil {
 			log.Printf("ERROR db.SetNextCrawlEtag(%q): %v", importPath, err)
 		}
 		return pdoc, nil
-	case gosrc.IsNotFound(err):
-		message = append(message, "notfound:", err)
+	} else if err == gosrc.ErrQuickFork {
+		message = append(message, "delete", err)
 		if err := db.Delete(importPath); err != nil {
 			log.Printf("ERROR db.Delete(%q): %v", importPath, err)
 		}
 		return nil, err
-	default:
+	} else if e, ok := err.(gosrc.NotFoundError); ok {
+		message = append(message, "notfound:", e)
+		if err := db.Delete(importPath); err != nil {
+			log.Printf("ERROR db.Delete(%q): %v", importPath, err)
+		}
+		return nil, e
+	} else {
 		message = append(message, "ERROR:", err)
 		return nil, err
 	}
 }
+
+// isInactivePkg reports whether pkg has no importers and no commit in the last
+// 2 years. Unknown commit times and failed importer lookups count as active.
+func isInactivePkg(pkg string, lastCommitted time.Time) bool {
+	if lastCommitted.IsZero() || time.Since(lastCommitted) < 2*365*24*time.Hour {
+		return false
+	}
+	n, err := db.ImporterCount(pkg)
+	if err != nil {
+		log.Printf("ERROR db.ImporterCount(%q): %v", pkg, err)
+		return false // never let a lookup failure trigger a delete
+	}
+	return n == 0
+}
diff --git a/gosrc/bitbucket.go b/gosrc/bitbucket.go
index c89feb3..76cea46 100644
--- a/gosrc/bitbucket.go
+++ b/gosrc/bitbucket.go
@@ -7,6 +7,7 @@
 package gosrc
 
 import (
+	"log"
 	"net/http"
 	"path"
 	"regexp"
@@ -34,6 +35,11 @@
 	IsFork    bool `json:"is_fork"`
 }
 
+type bitbucketNode struct {
+	Node      string `json:"node"`         // revision id stored as the tag's commit
+	Timestamp string `json:"utctimestamp"` // parsed with layout "2006-01-02 15:04:05Z07:00"
+}
+
 func getBitbucketDir(client *http.Client, match map[string]string, savedEtag string) (*Directory, error) {
 	var repo *bitbucketRepo
 	c := &httpClient{client: client}
@@ -50,27 +56,35 @@
 	}
 
 	tags := make(map[string]string)
+	timestamps := make(map[string]time.Time)
+
 	for _, nodeType := range []string{"branches", "tags"} {
-		var nodes map[string]struct {
-			Node string
-		}
+		var nodes map[string]bitbucketNode
 		if _, err := c.getJSON(expand("https://api.bitbucket.org/1.0/repositories/{owner}/{repo}/{0}", match, nodeType), &nodes); err != nil {
 			return nil, err
 		}
 		for t, n := range nodes {
 			tags[t] = n.Node
+			const timeFormat = "2006-01-02 15:04:05Z07:00"
+			committed, err := time.Parse(timeFormat, n.Timestamp)
+			if err != nil {
+				log.Println("error parsing timestamp:", n.Timestamp)
+				continue
+			}
+			timestamps[t] = committed
 		}
 	}
 
 	var err error
-	match["tag"], match["commit"], err = bestTag(tags, defaultTags[match["vcs"]])
+	tag, commit, err := bestTag(tags, defaultTags[match["vcs"]])
 	if err != nil {
 		return nil, err
 	}
-
+	match["tag"] = tag
+	match["commit"] = commit
 	etag := expand("{vcs}-{commit}", match)
 	if etag == savedEtag {
-		return nil, ErrNotModified
+		return nil, NotModifiedError{Since: timestamps[tag]}
 	}
 
 	if repo == nil {
diff --git a/gosrc/github.go b/gosrc/github.go
index ad8f4a3..49a00df 100644
--- a/gosrc/github.go
+++ b/gosrc/github.go
@@ -38,6 +38,15 @@
 	ownerRepoPat        = regexp.MustCompile(`^https://api.github.com/repos/([^/]+)/([^/]+)/`)
 )
 
+type githubCommit struct { // one element of the repos/{owner}/{repo}/commits response
+	ID     string `json:"sha"` // compared against the saved etag
+	Commit struct {
+		Committer struct {
+			Date time.Time `json:"date"` // reported as NotModifiedError.Since
+		} `json:"committer"`
+	} `json:"commit"`
+}
+
 func gitHubError(resp *http.Response) error {
 	var e struct {
 		Message string `json:"message"`
@@ -52,53 +61,34 @@
 
 	c := &httpClient{client: client, errFn: gitHubError}
 
-	type refJSON struct {
-		Object struct {
-			Type string
-			Sha  string
-			URL  string
-		}
-		Ref string
-		URL string
+	var repo struct {
+		Fork      bool      `json:"fork"`
+		Stars     int       `json:"stargazers_count"`
+		CreatedAt time.Time `json:"created_at"`
+		PushedAt  time.Time `json:"pushed_at"`
 	}
-	var refs []*refJSON
 
-	resp, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs", match), &refs)
-	if err != nil {
+	if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}", match), &repo); err != nil {
 		return nil, err
 	}
 
-	// If the response contains a Link header, then fallback to requesting "master" and "go1" by name.
-	if resp.Header.Get("Link") != "" {
-		var masterRef refJSON
-		if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs/heads/master", match), &masterRef); err == nil {
-			refs = append(refs, &masterRef)
-		}
-
-		var go1Ref refJSON
-		if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs/tags/go1", match), &go1Ref); err == nil {
-			refs = append(refs, &go1Ref)
-		}
+	var commits []*githubCommit
+	url := expand("https://api.github.com/repos/{owner}/{repo}/commits", match)
+	url += fmt.Sprintf("?since=%s", repo.CreatedAt.Format(time.RFC3339))
+	if match["dir"] != "" {
+		url += fmt.Sprintf("&path=%s", match["dir"])
 	}
-
-	tags := make(map[string]string)
-	for _, ref := range refs {
-		switch {
-		case strings.HasPrefix(ref.Ref, "refs/heads/"):
-			tags[ref.Ref[len("refs/heads/"):]] = ref.Object.Sha
-		case strings.HasPrefix(ref.Ref, "refs/tags/"):
-			tags[ref.Ref[len("refs/tags/"):]] = ref.Object.Sha
-		}
-	}
-
-	var commit string
-	match["tag"], commit, err = bestTag(tags, "master")
-	if err != nil {
+	if _, err := c.getJSON(url, &commits); err != nil {
 		return nil, err
 	}
-
-	if commit == savedEtag {
-		return nil, ErrNotModified
+	if repo.Fork && isQuickFork(commits, repo.CreatedAt) {
+		return nil, ErrQuickFork
+	}
+	if len(commits) == 0 {
+		return nil, NotFoundError{Message: "package directory changed or removed"}
+	}
+	if commits[0].ID == savedEtag {
+		return nil, NotModifiedError{Since: commits[0].Commit.Committer.Date}
 	}
 
 	var contents []*struct {
@@ -108,7 +98,7 @@
 		HTMLURL string `json:"html_url"`
 	}
 
-	if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}?ref={tag}", match), &contents); err != nil {
+	if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}", match), &contents); err != nil {
 		// The GitHub content API returns array values for directories
 		// and object values for files. If there's a type mismatch at
 		// the beginning of the response, then assume that the path is
@@ -157,25 +147,14 @@
 
 	browseURL := expand("https://github.com/{owner}/{repo}", match)
 	if match["dir"] != "" {
-		browseURL = expand("https://github.com/{owner}/{repo}/tree/{tag}{dir}", match)
-	}
-
-	var repo = struct {
-		Fork      bool      `json:"fork"`
-		Stars     int       `json:"stargazers_count"`
-		CreatedAt time.Time `json:"created_at"`
-		PushedAt  time.Time `json:"pushed_at"`
-	}{}
-
-	if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}", match), &repo); err != nil {
-		return nil, err
+		browseURL = expand("https://github.com/{owner}/{repo}/tree{dir}", match)
 	}
 
 	isDeadEndFork := repo.Fork && repo.PushedAt.Before(repo.CreatedAt)
 
 	return &Directory{
 		BrowseURL:      browseURL,
-		Etag:           commit,
+		Etag:           commits[0].ID,
 		Files:          files,
 		LineFmt:        "%s#L%d",
 		ProjectName:    match["repo"],
@@ -189,6 +168,24 @@
 	}, nil
 }
 
+// isQuickFork reports whether commits describe a "quick fork": the repository is at
+// least a week old and has at most 2 commits (possibly none), all within createdAt+1 week.
+func isQuickFork(commits []*githubCommit, createdAt time.Time) bool {
+	if len(commits) > 2 {
+		return false
+	}
+	oneWeekOld := createdAt.Add(7 * 24 * time.Hour)
+	if oneWeekOld.After(time.Now()) {
+		return false // repo is less than a week old: too early to classify
+	}
+	for _, commit := range commits {
+		if commit.Commit.Committer.Date.After(oneWeekOld) {
+			return false
+		}
+	}
+	return true
+}
+
 func getGitHubPresentation(client *http.Client, match map[string]string) (*Presentation, error) {
 	c := &httpClient{client: client, header: gitHubRawHeader}
 
@@ -310,7 +307,7 @@
 	commit := gist.History[0].Version
 
 	if commit == savedEtag {
-		return nil, ErrNotModified
+		return nil, NotModifiedError{}
 	}
 
 	var files []*File
diff --git a/gosrc/golang.go b/gosrc/golang.go
index f3d474d..faed15a 100644
--- a/gosrc/golang.go
+++ b/gosrc/golang.go
@@ -23,12 +23,6 @@
 
 	browseURL := "https://golang.org/src/" + importPath + "/"
 	p, err := c.getBytes(browseURL)
-	if IsNotFound(err) {
-		// Fallback to Go 1.3 directory structure.
-		// TODO(garyburd): Delete fallback after 1.4 is pushed to golang.org.
-		browseURL = "https://golang.org/src/pkg/" + importPath + "/"
-		p, err = c.getBytes(browseURL)
-	}
 	if err != nil {
 		return nil, err
 	}
@@ -40,7 +34,7 @@
 	}
 	etag = strings.Trim(string(m[1]), ". ")
 	if etag == savedEtag {
-		return nil, ErrNotModified
+		return nil, NotModifiedError{}
 	}
 
 	var files []*File
diff --git a/gosrc/google.go b/gosrc/google.go
index 1a47b59..3a2fc36 100644
--- a/gosrc/google.go
+++ b/gosrc/google.go
@@ -75,7 +75,7 @@
 	}
 	etag = expand("{vcs}-{0}", match, string(m[1]))
 	if etag == savedEtag {
-		return nil, ErrNotModified
+		return nil, NotModifiedError{}
 	}
 
 	var subdirs []string
diff --git a/gosrc/gosrc.go b/gosrc/gosrc.go
index b30c360..df33ee1 100644
--- a/gosrc/gosrc.go
+++ b/gosrc/gosrc.go
@@ -16,6 +16,7 @@
 	"path"
 	"regexp"
 	"strings"
+	"time"
 )
 
 // File represents a file.
@@ -112,8 +113,15 @@
 	return e.err.Error()
 }
 
-// ErrNotModified indicates that the directory matches the specified etag.
-var ErrNotModified = errors.New("package not modified")
+type NotModifiedError struct { // returned when the directory still matches the saved etag
+	Since time.Time // commit time supplied by the fetcher; zero when the fetcher has none
+}
+
+func (e NotModifiedError) Error() string { // formats Since even when it is the zero time
+	return fmt.Sprintf("package not modified since %s", e.Since.Format(time.RFC1123))
+}
+
+var ErrQuickFork = errors.New("package is a quick bug-fix fork") // GitHub forks that pass isQuickFork
 
 var errNoMatch = errors.New("no match")
 
diff --git a/gosrc/launchpad.go b/gosrc/launchpad.go
index 5cda037..3d561c8 100644
--- a/gosrc/launchpad.go
+++ b/gosrc/launchpad.go
@@ -119,7 +119,7 @@
 	hash = m.Sum(hash[:0])
 	etag := hex.EncodeToString(hash)
 	if etag == savedEtag {
-		return nil, ErrNotModified
+		return nil, NotModifiedError{}
 	}
 
 	return &Directory{
diff --git a/gosrc/vcs.go b/gosrc/vcs.go
index 6fbb545..bd7cc06 100644
--- a/gosrc/vcs.go
+++ b/gosrc/vcs.go
@@ -148,7 +148,7 @@
 	etag := scheme + "-" + commit
 
 	if etag == savedEtag {
-		return "", "", ErrNotModified
+		return "", "", NotModifiedError{}
 	}
 
 	dir := filepath.Join(TempDir, repo+".git")
@@ -201,7 +201,7 @@
 
 	etag := scheme + "-" + revno
 	if etag == savedEtag {
-		return "", "", ErrNotModified
+		return "", "", NotModifiedError{}
 	}
 
 	dir := filepath.Join(TempDir, repo+".svn")