internal/fetch: convert ast.Files to and from bytes

Add a pair of functions, EncodeASTFiles and DecodeASTFiles, to encode
and decode ast.Files.

We'll use these to store the files in the DB so we
can generate documentation on the frontend.

Originally we stored the full source in a zip file.
That turned out to be too slow.

Then we tried to remove unexported functions and function bodies from
the source before saving it. The problem is that doing so changes the
line numbers, so the source links in the rendered doc point to the
wrong lines. We could perhaps fix that by inserting "//line" compiler
directives in the right places, but it would be tricky.

Instead, we just encode the AST directly, using gob. Gob doesn't
handle cycles, but the fields that are responsible for the cycles
aren't needed for doc generation anyway, so we clear them before
encoding. Doc generation does depend on ast.Object identity, so we
label each Object when encoding and use the labels to restore the
shared pointers when decoding.

Change-Id: I79f49b125adfc4e516f0aea6ad91cd5259b3d012
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/257757
Trust: Jonathan Amsterdam <jba@google.com>
Run-TryBot: Jonathan Amsterdam <jba@google.com>
TryBot-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Julie Qiu <julie@golang.org>
diff --git a/internal/fetch/sourcefiles.go b/internal/fetch/sourcefiles.go
new file mode 100644
index 0000000..c4dec5a
--- /dev/null
+++ b/internal/fetch/sourcefiles.go
@@ -0,0 +1,175 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fetch
+
+import (
+	"bytes"
+	"encoding/gob"
+	"fmt"
+	"go/ast"
+	"go/token"
+	"io"
+)
+
+// encodingType is written at the start of the encoded data and checked when
+// decoding. It identifies the encoding being used, in case we ever use a
+// different one and need to distinguish them when reading from the DB.
+// It should be a four-byte string.
+const encodingType = "AST1"
+
+// init registers the go/ast node types below with gob, so it can encode and decode concrete types stored in interface variables.
+// NOTE(review): some go/ast node types (e.g. ast.SwitchStmt, ast.ChanType, ast.GoStmt) are not listed; confirm that is intended.
+func init() {
+	for _, n := range []interface{}{
+		&ast.ArrayType{},
+		&ast.AssignStmt{},
+		&ast.BasicLit{},
+		&ast.BinaryExpr{},
+		&ast.BlockStmt{},
+		&ast.BranchStmt{},
+		&ast.CallExpr{},
+		&ast.CaseClause{},
+		&ast.CompositeLit{},
+		&ast.DeclStmt{},
+		&ast.DeferStmt{},
+		&ast.Ellipsis{},
+		&ast.ExprStmt{},
+		&ast.ForStmt{},
+		&ast.FuncDecl{},
+		&ast.FuncLit{},
+		&ast.FuncType{},
+		&ast.GenDecl{},
+		&ast.KeyValueExpr{},
+		&ast.IfStmt{},
+		&ast.ImportSpec{},
+		&ast.IncDecStmt{},
+		&ast.IndexExpr{},
+		&ast.InterfaceType{},
+		&ast.MapType{},
+		&ast.ParenExpr{},
+		&ast.RangeStmt{},
+		&ast.ReturnStmt{},
+		&ast.SelectorExpr{},
+		&ast.SliceExpr{},
+		&ast.StarExpr{},
+		&ast.StructType{},
+		&ast.TypeAssertExpr{},
+		&ast.TypeSpec{},
+		&ast.TypeSwitchStmt{},
+		&ast.UnaryExpr{},
+		&ast.ValueSpec{},
+		&ast.Ident{},
+	} {
+		gob.Register(n) // each n is a pointer to a zero-valued node; it only identifies the type
+	}
+}
+
+// EncodeASTFiles encodes fset and files into a byte slice that DecodeASTFiles can read. It modifies files (see removeCycles).
+func EncodeASTFiles(fset *token.FileSet, files []*ast.File) ([]byte, error) {
+	var buf bytes.Buffer
+	io.WriteString(&buf, encodingType) // the data starts with the encoding identifier; the error return is not checked
+	enc := gob.NewEncoder(&buf)
+	// Encode the fset using the Write method it provides.
+	if err := fset.Write(enc.Encode); err != nil {
+		return nil, err
+	}
+	// Encode each file, in order, with the same gob encoder.
+	for _, f := range files {
+		removeCycles(f) // modifies f in place so that gob can encode it
+		if err := enc.Encode(f); err != nil {
+			return nil, err
+		}
+	}
+	return buf.Bytes(), nil
+}
+
+// DecodeASTFiles decodes a byte slice produced by EncodeASTFiles into a FileSet and a list of files.
+func DecodeASTFiles(data []byte) (*token.FileSet, []*ast.File, error) {
+	le := len(encodingType)
+	if len(data) < le || string(data[:le]) != encodingType { // data must begin with the encoding identifier
+		return nil, nil, fmt.Errorf("want initial bytes to be %q but they aren't", encodingType)
+	}
+	dec := gob.NewDecoder(bytes.NewReader(data[le:]))
+	fset := token.NewFileSet()
+	if err := fset.Read(dec.Decode); err != nil { // counterpart of the fset.Write call in EncodeASTFiles
+		return nil, nil, err
+	}
+	var files []*ast.File
+	for { // decode files until the decoder reports io.EOF
+		var f *ast.File
+		err := dec.Decode(&f)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return nil, nil, err
+		}
+		fixupObjects(f) // make Idents that were labeled alike share one ast.Object again
+		files = append(files, f)
+	}
+	return fset, files, nil
+}
+
+// removeCycles modifies f in place so that it contains no cycles. There are
+// two sources of cycles in an ast.File: Scopes and Objects.
+//
+// removeCycles sets f.Scope to nil, since doc generation doesn't use scopes. Doc
+// generation does use Objects, and it needs object identity to be preserved
+// (see internal/fetch/internal/doc/example.go). But it doesn't need the Decl,
+// Data or Type fields of ast.Object, which are responsible for cycles.
+//
+// If we just set those three fields to nil, there would be no cycles, but we
+// wouldn't preserve Object identity when we decoded. For example, if ast.Idents
+// A and B both pointed to the same Object, gob would write them as two separate
+// objects, and decoding would preserve that. (See TestObjectIdentity for
+// a small example of this sort of sharing.)
+//
+// So when clearing Data and Type, we also replace Decl with an integer label,
+// unless Decl already holds an int. (Decl would never normally hold an int.)
+// Labels count up from 0 in traversal order, so those assigned by one call are
+// distinct, and fixupObjects can use them to reconstruct the relationships.
+func removeCycles(f *ast.File) {
+	next := 0 // the next label to assign
+	ast.Inspect(f, func(n ast.Node) bool {
+		switch n := n.(type) {
+		case *ast.File:
+			n.Scope = nil // doc doesn't use scopes
+		case *ast.Ident:
+			if n.Obj != nil {
+				if _, ok := n.Obj.Decl.(int); !ok { // this Object has not been labeled yet
+					n.Obj.Data = nil
+					n.Obj.Type = nil
+					n.Obj.Decl = next
+					next++
+				}
+			}
+		}
+		return true
+	})
+}
+
+// fixupObjects re-establishes the original object relationships of the ast.File f.
+//
+// f is the result of decoding data from EncodeASTFiles, which uses removeCycles (see above)
+// to label each ast.Object with an int in its Decl field. fixupObjects makes all Idents whose
+// Objects carry the same label share one Object. It panics if a non-nil Obj's Decl is not an int.
+func fixupObjects(f *ast.File) {
+	objs := map[int]*ast.Object{} // label -> first Object seen with that label
+	ast.Inspect(f, func(n ast.Node) bool {
+		if id, ok := n.(*ast.Ident); ok {
+			if id.Obj != nil {
+				n := id.Obj.Decl.(int) // the label; this n shadows the node parameter
+				if obj := objs[n]; obj != nil {
+					// If we've seen object n before, then id.Obj should be the same object.
+					id.Obj = obj
+				} else {
+					// If we haven't seen object n before, then remember it.
+					objs[n] = id.Obj
+				}
+			}
+		}
+		return true
+	})
+}
diff --git a/internal/fetch/sourcefiles_test.go b/internal/fetch/sourcefiles_test.go
new file mode 100644
index 0000000..38c9a80
--- /dev/null
+++ b/internal/fetch/sourcefiles_test.go
@@ -0,0 +1,99 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fetch
+
+import (
+	"bytes"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"testing"
+)
+
+// astFilesForDir parses the Go files in dir, including comments, and returns the
+// files of every package found there along with the FileSet used to parse them.
+func astFilesForDir(dir string) (*token.FileSet, []*ast.File, error) {
+	fset := token.NewFileSet()
+	pkgs, err := parser.ParseDir(fset, dir, nil, parser.ParseComments)
+	if err != nil {
+		return nil, nil, err
+	}
+	var files []*ast.File
+	for _, p := range pkgs { // pkgs and p.Files are maps, so the order of files is unspecified
+		for _, f := range p.Files {
+			files = append(files, f)
+		}
+	}
+	return fset, files, nil
+}
+
+func TestEncodeDecodeASTFiles(t *testing.T) {
+	// Verify that encoding the Go files in this directory, decoding the result and encoding again yields identical bytes.
+	fset := token.NewFileSet()
+	pkgs, err := parser.ParseDir(fset, ".", nil, parser.ParseComments) // NOTE(review): this parsing code duplicates astFilesForDir
+	if err != nil {
+		t.Fatal(err)
+	}
+	var files []*ast.File
+	for _, p := range pkgs {
+		for _, f := range p.Files {
+			files = append(files, f)
+		}
+	}
+
+	data, err := EncodeASTFiles(fset, files)
+	if err != nil {
+		t.Fatal(err)
+	}
+	gotFset, gotFiles, err := DecodeASTFiles(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	data2, err := EncodeASTFiles(gotFset, gotFiles) // re-encode what was decoded
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !bytes.Equal(data, data2) { // the round trip must reproduce the first encoding exactly
+		t.Fatal("datas unequal")
+	}
+
+}
+
+func TestObjectIdentity(t *testing.T) {
+	// Check that encoding and decoding preserves object identity.
+	const file = `
+package p
+var a int
+func main() { a = 1 }
+`
+
+	compareObjs := func(f *ast.File) {
+		t.Helper()
+		// We know (from hand-inspecting the output of ast.Fprint) that these two
+		// identifiers point to the same ast.Object in the above program.
+		o1 := f.Decls[0].(*ast.GenDecl).Specs[0].(*ast.ValueSpec).Names[0].Obj // Obj of "a" in "var a int"
+		o2 := f.Decls[1].(*ast.FuncDecl).Body.List[0].(*ast.AssignStmt).Lhs[0].(*ast.Ident).Obj // Obj of "a" in "a = 1"
+		if o1 != o2 {
+			t.Fatal("objects not identical")
+		}
+	}
+
+	fset := token.NewFileSet()
+	f, err := parser.ParseFile(fset, "test.go", file, parser.ParseComments)
+	if err != nil {
+		t.Fatal(err)
+	}
+	compareObjs(f) // sanity check: the freshly parsed file shares the Object
+
+	data, err := EncodeASTFiles(fset, []*ast.File{f})
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, files, err := DecodeASTFiles(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	compareObjs(files[0]) // the decoded file must share it too
+}