database: Customize rank function for the search index.

database.PackageDocument is now merged into database.Package.
database.Package now implements search.FieldLoadSaver, which lets it set
the document Rank according to our default sorting algorithm. This allows
faster search queries by avoiding query-time sorting. It also eliminates
the confusion of having two similar package structs in database.

Two admin handlers are added to reindex and purge the search index. These
handlers are restricted to admin access only in the yaml file.

Change-Id: I63929a50b08b9817e7d62020f9755fba9d4ec8f0
Reviewed-on: https://go-review.googlesource.com/24450
Reviewed-by: Alan Donovan <adonovan@google.com>
Reviewed-by: Andrew Gerrand <adg@golang.org>
diff --git a/database/database.go b/database/database.go
index 2f952ac..d545184 100644
--- a/database/database.go
+++ b/database/database.go
@@ -43,11 +43,13 @@
 	"strconv"
 	"strings"
 	"time"
+	"unicode/utf8"
 
 	"github.com/garyburd/redigo/redis"
 	"github.com/golang/snappy"
 	"golang.org/x/net/context"
 	"google.golang.org/appengine"
+	"google.golang.org/appengine/search"
 
 	"github.com/golang/gddo/doc"
 	"github.com/golang/gddo/gosrc"
@@ -59,12 +61,17 @@
 	}
 }
 
+// Package represents the content of a package both for the search index and
+// for the HTML template. It implements the search.FieldLoadSaver interface
+// to customize the Rank function in the search index.
 type Package struct {
-	Path        string `json:"path"`
-	ImportCount int    `json:"import_count`
-	Synopsis    string `json:"synopsis,omitempty"`
-	Fork        bool   `json:"fork,omitempty"`
-	Stars       int    `json:"stars,omitempty"`
+	Name        string  `json:"name,omitempty"`
+	Path        string  `json:"path"`
+	ImportCount int     `json:"import_count`
+	Synopsis    string  `json:"synopsis,omitempty"`
+	Fork        bool    `json:"fork,omitempty"`
+	Stars       int     `json:"stars,omitempty"`
+	Score       float64 `json:"score,omitempty"`
 }
 
 type byPath []Package
@@ -1150,3 +1157,71 @@
 func (db *Database) IncrementCounter(key string, delta float64) (float64, error) {
 	return db.incrementCounterInternal(key, delta, time.Now())
 }
+
+// Reindex gets all the packages in the database and puts them into the search
+// index. This will update the search index with the path, synopsis, score, and
+// import count of all the packages in the database.
+func (db *Database) Reindex(ctx context.Context) error {
+	c := db.Pool.Get()
+	defer c.Close()
+
+	idx, err := search.Open("packages")
+	if err != nil {
+		return fmt.Errorf("database: failed to open packages: %v", err)
+	}
+	npkgs := 0
+	for {
+		// Get 200 packages from the nextCrawl set each time. Use npkgs as a cursor
+		// to store the current position we actually indexed. Retry from the cursor
+		// position if we received a timeout error from app engine.
+		values, err := redis.Values(c.Do(
+			"SORT", "nextCrawl",
+			"LIMIT", strconv.Itoa(npkgs), "200",
+			"GET", "pkg:*->path",
+			"GET", "pkg:*->synopsis",
+			"GET", "pkg:*->score",
+		))
+		if err != nil {
+			return err
+		}
+		if len(values) == 0 {
+			break // all done
+		}
+
+		// The Search API should support putting batches of up to 200 documents,
+		// but the Go version of this API does not support this yet.
+		// TODO(shantuo): Put packages in batch operations.
+		for ; len(values) > 0; npkgs++ {
+			var pdoc doc.Package
+			var score float64
+			values, err = redis.Scan(values, &pdoc.ImportPath, &pdoc.Synopsis, &score)
+			if err != nil {
+				return err
+			}
+			// There is some corrupted data in our current database
+			// that causes an error when putting the package into the
+			// search index, which only supports UTF-8 encoding.
+			if !utf8.ValidString(pdoc.Synopsis) {
+				pdoc.Synopsis = ""
+			}
+			id, n, err := pkgIDAndImportCount(c, pdoc.ImportPath)
+			if err != nil {
+				return err
+			}
+			if _, err := idx.Put(ctx, id, &Package{
+				Path:        pdoc.ImportPath,
+				Synopsis:    pdoc.Synopsis,
+				Score:       score,
+				ImportCount: n,
+			}); err != nil {
+				if appengine.IsTimeoutError(err) {
+					log.Printf("App Engine timeout: %v. Continue...", err)
+					break
+				}
+				return fmt.Errorf("Failed to put index %s: %v", id, err)
+			}
+		}
+	}
+	log.Printf("%d packages are reindexed", npkgs)
+	return nil
+}
diff --git a/database/database_test.go b/database/database_test.go
index fe5a1da..c9d63b3 100644
--- a/database/database_test.go
+++ b/database/database_test.go
@@ -134,7 +134,7 @@
 	if err != nil {
 		t.Fatalf("db.Importers() returned error %v", err)
 	}
-	expectedImporters := []Package{{"github.com/user/repo/foo/bar", "hello"}}
+	expectedImporters := []Package{{Path: "github.com/user/repo/foo/bar", Synopsis: "hello"}}
 	if !reflect.DeepEqual(actualImporters, expectedImporters) {
 		t.Errorf("db.Importers() = %v, want %v", actualImporters, expectedImporters)
 	}
@@ -147,7 +147,11 @@
 			actualImports[i].Synopsis = ""
 		}
 	}
-	expectedImports := []Package{{"C", ""}, {"errors", ""}, {"github.com/user/repo/foo/bar", "hello"}}
+	expectedImports := []Package{
+		{Path: "C", Synopsis: ""},
+		{Path: "errors", Synopsis: ""},
+		{Path: "github.com/user/repo/foo/bar", Synopsis: "hello"},
+	}
 	if !reflect.DeepEqual(actualImports, expectedImports) {
 		t.Errorf("db.Imports() = %v, want %v", actualImports, expectedImports)
 	}
diff --git a/database/indexae.go b/database/indexae.go
index 877e6ea..92f6fc2 100644
--- a/database/indexae.go
+++ b/database/indexae.go
@@ -10,6 +10,8 @@
 	"bytes"
 	"errors"
 	"fmt"
+	"log"
+	"math"
 	"strings"
 	"unicode"
 
@@ -19,16 +21,67 @@
 	"github.com/golang/gddo/doc"
 )
 
-// PackageDocument defines the data structure used to represent a package document
-// in the search index.
-type PackageDocument struct {
-	Name        search.Atom
-	Path        string
-	Synopsis    string
-	Score       float64
-	ImportCount float64
-	Stars       float64
-	Fork        search.Atom
+func (p *Package) Load(fields []search.Field, meta *search.DocumentMetadata) error {
+	for _, f := range fields {
+		switch f.Name {
+		case "Name":
+			if v, ok := f.Value.(search.Atom); ok {
+				p.Name = string(v)
+			}
+		case "Path":
+			if v, ok := f.Value.(string); ok {
+				p.Path = v
+			}
+		case "Synopsis":
+			if v, ok := f.Value.(string); ok {
+				p.Synopsis = v
+			}
+		case "ImportCount":
+			if v, ok := f.Value.(float64); ok {
+				p.ImportCount = int(v)
+			}
+		case "Stars":
+			if v, ok := f.Value.(float64); ok {
+				p.Stars = int(v)
+			}
+		case "Score":
+			if v, ok := f.Value.(float64); ok {
+				p.Score = v
+			}
+		}
+	}
+	if p.Path == "" {
+		return errors.New("Invalid document: missing Path field")
+	}
+	for _, f := range meta.Facets {
+		if f.Name == "Fork" {
+			p.Fork = f.Value.(search.Atom) == "true"
+		}
+	}
+	return nil
+}
+
+func (p *Package) Save() ([]search.Field, *search.DocumentMetadata, error) {
+	fields := []search.Field{
+		{Name: "Name", Value: search.Atom(p.Name)},
+		{Name: "Path", Value: p.Path},
+		{Name: "Synopsis", Value: p.Synopsis},
+		{Name: "Score", Value: p.Score},
+		{Name: "ImportCount", Value: float64(p.ImportCount)},
+		{Name: "Stars", Value: float64(p.Stars)},
+	}
+	fork := fmt.Sprint(p.Fork) // "true" or "false"
+	meta := &search.DocumentMetadata{
+		// Customize the rank property by the product of the package score and
+		// natural logarithm of the import count. Rank must be a positive integer.
+		// Use 1 as minimum rank and keep 3 digits of precision to distinguish
+		// close ranks.
+		Rank: int(math.Max(1, 1000*p.Score*math.Log(math.E+float64(p.ImportCount)))),
+		Facets: []search.Facet{
+			{Name: "Fork", Value: search.Atom(fork)},
+		},
+	}
+	return fields, meta, nil
 }
 
 // PutIndex creates or updates a package entry in the search index. id identifies the document in the index.
@@ -44,7 +97,7 @@
 		return err
 	}
 
-	var pkg PackageDocument
+	var pkg Package
 	if err := idx.Get(c, id, &pkg); err != nil {
 		if err != search.ErrNoSuchDocument {
 			return err
@@ -57,20 +110,16 @@
 
 	// Update document information accordingly.
 	if pdoc != nil {
-		pkg.Name = search.Atom(pdoc.Name)
+		pkg.Name = pdoc.Name
 		pkg.Path = pdoc.ImportPath
 		pkg.Synopsis = pdoc.Synopsis
-		pkg.Stars = float64(pdoc.Stars)
-		var fork string
-		if forkAvailable(pdoc.ImportPath) {
-			fork = fmt.Sprint(pdoc.Fork) // "true" or "false"
-		}
-		pkg.Fork = search.Atom(fork)
+		pkg.Stars = pdoc.Stars
+		pkg.Fork = pdoc.Fork
 	}
 	if score >= 0 {
 		pkg.Score = score
 	}
-	pkg.ImportCount = float64(importCount)
+	pkg.ImportCount = importCount
 
 	if _, err := idx.Put(c, id, &pkg); err != nil {
 		return err
@@ -78,10 +127,6 @@
 	return nil
 }
 
-func forkAvailable(p string) bool {
-	return strings.HasPrefix(p, "github.com") || strings.HasPrefix(p, "bitbucket.org")
-}
-
 // Search searches the packages index for a given query. A path-like query string
 // will be passed in unchanged, whereas single words will be stemmed.
 func Search(c context.Context, q string) ([]Package, error) {
@@ -91,33 +136,18 @@
 	}
 	var pkgs []Package
 	opt := &search.SearchOptions{
-		Sort: &search.SortOptions{
-			Expressions: []search.SortExpression{
-				{Expr: "Score * log(10 + ImportCount)"},
-			},
-		},
+		Limit: 100,
 	}
 	for it := index.Search(c, parseQuery2(q), opt); ; {
-		var pd PackageDocument
-		_, err := it.Next(&pd)
+		var p Package
+		_, err := it.Next(&p)
 		if err == search.Done {
 			break
 		}
 		if err != nil {
 			return nil, err
 		}
-		pkg := Package{
-			Path:        pd.Path,
-			ImportCount: int(pd.ImportCount),
-			Synopsis:    pd.Synopsis,
-		}
-		if pd.Fork == "true" {
-			pkg.Fork = true
-		}
-		if pd.Stars > 0 {
-			pkg.Stars = int(pd.Stars)
-		}
-		pkgs = append(pkgs, pkg)
+		pkgs = append(pkgs, p)
 	}
 	return pkgs, nil
 }
@@ -141,3 +171,29 @@
 		r != '.' && r != '/' && unicode.IsPunct(r) ||
 		unicode.IsSymbol(r)
 }
+
+// PurgeIndex deletes all the packages from the search index.
+func PurgeIndex(c context.Context) error {
+	idx, err := search.Open("packages")
+	if err != nil {
+		return err
+	}
+	n := 0
+
+	for it := idx.List(c, &search.ListOptions{IDsOnly: true}); ; n++ {
+		var pkg Package
+		id, err := it.Next(&pkg)
+		if err == search.Done {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		if err := idx.Delete(c, id); err != nil {
+			log.Printf("Failed to delete package %s: %v", id, err)
+			continue
+		}
+	}
+	log.Printf("Purged %d packages from the search index.", n)
+	return nil
+}
diff --git a/database/indexae_test.go b/database/indexae_test.go
index 84a636a..6c5ee0e 100644
--- a/database/indexae_test.go
+++ b/database/indexae_test.go
@@ -18,9 +18,11 @@
 )
 
 var pdoc = &doc.Package{
-	ImportPath:  "github.com/golang/test",
-	ProjectName: "test",
-	Synopsis:    "This is a test package.",
+	ImportPath: "github.com/golang/test",
+	Name:       "test",
+	Synopsis:   "This is a test package.",
+	Fork:       true,
+	Stars:      10,
 }
 
 func TestPutIndexWithEmptyId(t *testing.T) {
@@ -64,16 +66,17 @@
 	if err != nil {
 		t.Fatal(err)
 	}
-	var got PackageDocument
+	var got Package
 	if err = idx.Get(c, "12345", &got); err != nil && err != search.ErrNoSuchDocument {
 		t.Fatal(err)
 	}
-	wanted := PackageDocument{
-		Name:        search.Atom(pdoc.ProjectName),
+	wanted := Package{
+		Name:        pdoc.Name,
 		Path:        pdoc.ImportPath,
 		Synopsis:    pdoc.Synopsis,
-		Score:       0.99,
 		ImportCount: 1,
+		Fork:        true,
+		Stars:       10,
 	}
 	if got != wanted {
 		t.Errorf("PutIndex got %v, want %v", got, wanted)
@@ -105,7 +108,7 @@
 	for i := 2; i < 6; i++ {
 		id += strconv.Itoa(i)
 		pdoc.Synopsis = id
-		if err := PutIndex(c, pdoc, id, math.Pow(0.95, float64(i)), 10*i); err != nil {
+		if err := PutIndex(c, pdoc, id, math.Pow(0.9, float64(i)), 10*i); err != nil {
 			t.Fatal(err)
 		}
 	}
@@ -113,10 +116,10 @@
 	if err != nil {
 		t.Fatal(err)
 	}
-	wanted := []string{"1234", "12345", "123", "12"}
+	wanted := []string{"123", "12", "1234", "12345"}
 	for i, p := range got {
 		if p.Synopsis != wanted[i] {
-			t.Errorf("PutIndex got %v, want %v", p, wanted[i])
+			t.Errorf("Search got %v, want %v", p.Synopsis, wanted[i])
 		}
 	}
 }
diff --git a/gddo-server/assets/templates/common.html b/gddo-server/assets/templates/common.html
index 2754b88..a38dee5 100644
--- a/gddo-server/assets/templates/common.html
+++ b/gddo-server/assets/templates/common.html
@@ -51,11 +51,11 @@
       <tr><td>
         {{if .Path|isValidImportPath}}
         <a href="/{{.Path}}">{{.Path|importPath}}</a>
-          <ul class="list-inline">
+          {{if gaeSearch}}<ul class="list-inline">
             <li class="additional-info">{{.ImportCount}} imports</li>
             {{if .Fork}}<li class="additional-info">· fork</li>{{end}}
             {{if .Stars}}<li class="additional-info">· {{.Stars}} stars</li>{{end}}
-          </ul>
+          </ul>{{end}}
         {{else}}{{.Path|importPath}}</td>
         {{end}}
       <td class="synopsis">{{.Synopsis|importPath}}</td></tr>
diff --git a/gddo-server/background.go b/gddo-server/background.go
index d9c8c91..9b3c8c1 100644
--- a/gddo-server/background.go
+++ b/gddo-server/background.go
@@ -11,6 +11,9 @@
 	"log"
 	"time"
 
+	"google.golang.org/appengine"
+
+	"github.com/golang/gddo/database"
 	"github.com/golang/gddo/gosrc"
 )
 
@@ -113,3 +116,17 @@
 	}
 	return nil
 }
+
+func reindex() {
+	c := appengine.BackgroundContext()
+	if err := db.Reindex(c); err != nil {
+		log.Println("reindex:", err)
+	}
+}
+
+func purgeIndex() {
+	c := appengine.BackgroundContext()
+	if err := database.PurgeIndex(c); err != nil {
+		log.Println("purgeIndex:", err)
+	}
+}
diff --git a/gddo-server/main.go b/gddo-server/main.go
index d04f274..e0f62e6 100644
--- a/gddo-server/main.go
+++ b/gddo-server/main.go
@@ -475,6 +475,16 @@
 	})
 }
 
+func runReindex(resp http.ResponseWriter, req *http.Request) {
+	fmt.Fprintln(resp, "Reindexing...")
+	go reindex()
+}
+
+func runPurgeIndex(resp http.ResponseWriter, req *http.Request) {
+	fmt.Fprintln(resp, "Purging the search index...")
+	go purgeIndex()
+}
+
 type byPath struct {
 	pkgs []database.Package
 	rank []int
@@ -994,6 +1004,8 @@
 	mux.Handle("/-/subrepo", handler(serveGoSubrepoIndex))
 	mux.Handle("/-/index", handler(serveIndex))
 	mux.Handle("/-/refresh", handler(serveRefresh))
+	mux.Handle("/-/admin/reindex", http.HandlerFunc(runReindex))
+	mux.Handle("/-/admin/purgeindex", http.HandlerFunc(runPurgeIndex))
 	mux.Handle("/a/index", http.RedirectHandler("/-/index", http.StatusMovedPermanently))
 	mux.Handle("/about", http.RedirectHandler("/-/about", http.StatusMovedPermanently))
 	mux.Handle("/favicon.ico", staticServer.FileHandler("favicon.ico"))
diff --git a/gddo-server/template.go b/gddo-server/template.go
index b7bf32e..2da07a0 100644
--- a/gddo-server/template.go
+++ b/gddo-server/template.go
@@ -25,6 +25,7 @@
 	ttemp "text/template"
 	"time"
 
+	"github.com/golang/gddo/database"
 	"github.com/golang/gddo/doc"
 	"github.com/golang/gddo/gosrc"
 	"github.com/golang/gddo/httputil"
@@ -463,6 +464,10 @@
 	return gaAccount
 }
 
+func gaeSearchFn() bool {
+	return database.GAESearch
+}
+
 func noteTitleFn(s string) string {
 	return strings.Title(strings.ToLower(s))
 }
@@ -517,6 +522,7 @@
 			"comment":           commentFn,
 			"equal":             reflect.DeepEqual,
 			"gaAccount":         gaAccountFn,
+			"gaeSearch":         gaeSearchFn,
 			"host":              hostFn,
 			"htmlComment":       htmlCommentFn,
 			"importPath":        importPathFn,