Merge pull request #231 from garyburd/master

Improve handling of vendored packages, fix issues
diff --git a/database/index.go b/database/index.go
index 91511fb..3986cdf 100644
--- a/database/index.go
+++ b/database/index.go
@@ -103,23 +103,22 @@
 	return result
 }
 
-func isExcludedPath(path string) bool {
-	if strings.HasSuffix(path, ".go") ||
-		strings.HasPrefix(path, "gist.github.com/") ||
-		strings.Contains(path, "/internal/") ||
-		strings.HasSuffix(path, "/internal") ||
-		strings.Contains(path, "/third_party/") {
-		return true
-	}
-	return false
-}
+// vendorPat matches the path of a vendored package.
+var vendorPat = regexp.MustCompile(
+	// match directories used by tools to vendor packages.
+	`/(?:_?third_party|vendors|Godeps/_workspace/src)/` +
+		// match a domain name.
+		`[^./]+\.[^/]+`)
 
 func documentScore(pdoc *doc.Package) float64 {
 	if pdoc.Name == "" ||
-		pdoc.IsCmd ||
 		pdoc.DeadEndFork ||
 		len(pdoc.Errors) > 0 ||
-		isExcludedPath(pdoc.ImportPath) {
+		strings.HasSuffix(pdoc.ImportPath, ".go") ||
+		strings.HasPrefix(pdoc.ImportPath, "gist.github.com/") ||
+		strings.HasSuffix(pdoc.ImportPath, "/internal") ||
+		strings.Contains(pdoc.ImportPath, "/internal/") ||
+		vendorPat.MatchString(pdoc.ImportPath) {
 		return 0
 	}
 
@@ -129,31 +128,48 @@
 		}
 	}
 
-	if !pdoc.IsCmd && !pdoc.Truncated &&
-		len(pdoc.Consts) == 0 &&
-		len(pdoc.Vars) == 0 &&
-		len(pdoc.Funcs) == 0 &&
-		len(pdoc.Types) == 0 &&
-		len(pdoc.Examples) == 0 {
-		return 0
-	}
-
 	r := 1.0
-	if pdoc.IsCmd && !importsGoPackages(pdoc) {
-		// Penalize commands that don't use the "go/*" packages.
-		r *= 0.9
-	}
-	if pdoc.Doc == "" || pdoc.Synopsis == "" {
-		r *= 0.95
-	}
-	if path.Base(pdoc.ImportPath) != pdoc.Name {
-		r *= 0.9
-	}
-	for i := 0; i < strings.Count(pdoc.ImportPath[len(pdoc.ProjectRoot):], "/"); i++ {
-		r *= 0.99
-	}
-	if strings.Index(pdoc.ImportPath[len(pdoc.ProjectRoot):], "/src/") > 0 {
-		r *= 0.95
+	if pdoc.IsCmd {
+		if pdoc.Doc == "" {
+			// Do not include command in index if it does not have documentation.
+			return 0
+		}
+		if !importsGoPackages(pdoc) {
+			// Penalize commands that don't use the "go/*" packages.
+			r *= 0.9
+		}
+	} else {
+		if !pdoc.Truncated &&
+			len(pdoc.Consts) == 0 &&
+			len(pdoc.Vars) == 0 &&
+			len(pdoc.Funcs) == 0 &&
+			len(pdoc.Types) == 0 &&
+			len(pdoc.Examples) == 0 {
+			// Do not include package in index if it does not have exports.
+			return 0
+		}
+		if pdoc.Doc == "" {
+			// Penalty for no documentation.
+			r *= 0.95
+		}
+		if path.Base(pdoc.ImportPath) != pdoc.Name {
+			// Penalty for last element of path != package name.
+			r *= 0.9
+		}
+		for i := 0; i < strings.Count(pdoc.ImportPath[len(pdoc.ProjectRoot):], "/"); i++ {
+			// Penalty for deeply nested packages.
+			r *= 0.99
+		}
+		if strings.Index(pdoc.ImportPath[len(pdoc.ProjectRoot):], "/src/") > 0 {
+			r *= 0.95
+		}
+		for _, p := range pdoc.Imports {
+			if vendorPat.MatchString(p) {
+				// Penalize packages that import vendored packages.
+				r *= 0.1
+				break
+			}
+		}
 	}
 	return r
 }
diff --git a/database/index_test.go b/database/index_test.go
index 7b13dbf..ea40f69 100644
--- a/database/index_test.go
+++ b/database/index_test.go
@@ -100,21 +100,30 @@
 	}
 }
 
-func TestExcludedPath(t *testing.T) {
-	tests := []struct {
-		path     string
-		expected bool
-	}{
-		{"code.google.com/p/go.text/internal/ucd", true},
-		{"code.google.com/p/go.text/internal", true},
-		{"camlistore.org/third_party/bazil.org/fuse", true},
-		{"bazil.org/fuse", false},
-	}
+var vendorPatTests = []struct {
+	path  string
+	match bool
+}{
+	{"camlistore.org/third_party/github.com/user/repo", true},
+	{"camlistore.org/third_party/dir", false},
+	{"camlistore.org/third_party", false},
+	{"camlistore.org/xthird_party/github.com/user/repo", false},
+	{"camlistore.org/third_partyx/github.com/user/repo", false},
 
-	for _, tt := range tests {
-		actual := isExcludedPath(tt.path)
-		if actual != tt.expected {
-			t.Errorf("isExcludedPath=%t, want %t for %s", actual, tt.expected, tt.path)
+	{"example.org/_third_party/github.com/user/repo/dir", true},
+	{"example.org/_third_party/dir", false},
+
+	{"github.com/user/repo/Godeps/_workspace/src/github.com/user/repo", true},
+	{"github.com/user/repo/Godeps/_workspace/src/dir", false},
+
+	{"github.com/user/repo", false},
+}
+
+func TestVendorPat(t *testing.T) {
+	for _, tt := range vendorPatTests {
+		match := vendorPat.MatchString(tt.path)
+		if match != tt.match {
+			t.Errorf("match(%q) = %v, want %v", tt.path, match, match)
 		}
 	}
 }
diff --git a/gddo-server/crawl.go b/gddo-server/crawl.go
index 1a8dd50..5df4a57 100644
--- a/gddo-server/crawl.go
+++ b/gddo-server/crawl.go
@@ -8,7 +8,6 @@
 
 import (
 	"log"
-	"regexp"
 	"strings"
 	"time"
 
@@ -16,16 +15,6 @@
 	"github.com/golang/gddo/gosrc"
 )
 
-var nestedProjectPat = regexp.MustCompile(`/(?:github\.com|launchpad\.net|code\.google\.com/p|bitbucket\.org|labix\.org)/`)
-
-func exists(path string) bool {
-	b, err := db.Exists(path)
-	if err != nil {
-		b = false
-	}
-	return b
-}
-
 // crawlDoc fetches the package documentation from the VCS and updates the database.
 func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bool, nextCrawl time.Time) (*doc.Package, error) {
 	message := []interface{}{source}
@@ -61,9 +50,6 @@
 		// Old import path for Go sub-repository.
 		pdoc = nil
 		err = gosrc.NotFoundError{Message: "old Go sub-repo", Redirect: "golang.org/x/" + importPath[len("code.google.com/p/go."):]}
-	} else if m := nestedProjectPat.FindStringIndex(importPath); m != nil && exists(importPath[m[0]+1:]) {
-		pdoc = nil
-		err = gosrc.NotFoundError{Message: "copy of other project."}
 	} else if blocked, e := db.IsBlocked(importPath); blocked && e == nil {
 		pdoc = nil
 		err = gosrc.NotFoundError{Message: "blocked."}
diff --git a/gosrc/path.go b/gosrc/path.go
index c162df6..e10529a 100644
--- a/gosrc/path.go
+++ b/gosrc/path.go
@@ -15,7 +15,7 @@
 )
 
 var validHost = regexp.MustCompile(`^[-a-z0-9]+(?:\.[-a-z0-9]+)+$`)
-var validPathElement = regexp.MustCompile(`^[-A-Za-z0-9~+][-A-Za-z0-9_.]*$`)
+var validPathElement = regexp.MustCompile(`^[-A-Za-z0-9~+_][-A-Za-z0-9_.]*$`)
 
 func isValidPathElement(s string) bool {
 	return validPathElement.MatchString(s) && s != "testdata"
diff --git a/gosrc/path_test.go b/gosrc/path_test.go
index 024c186..8cad7a7 100644
--- a/gosrc/path_test.go
+++ b/gosrc/path_test.go
@@ -19,6 +19,7 @@
 	"example.com/foo.git",
 	"launchpad.net/~user/foo/trunk",
 	"launchpad.net/~user/+junk/version",
+	"github.com/user/repo/_ok/x",
 }
 
 var badImportPaths = []string{
@@ -28,7 +29,6 @@
 	"favicon.ico",
 	"exmpple.com",
 	"github.com/user/repo/testdata/x",
-	"github.com/user/repo/_ignore/x",
 	"github.com/user/repo/.ignore/x",
 }