internal/gcimporter: preserve column and line in shallow iexport

Historically, export data has discarded source position information.
Originally not even line numbers were available; at some point
line and column numbers were stored, up to some fixed limit, but
more recently column numbers were again discarded.
Offset information has always been incorrect.

gopls is moving toward greater reliance on incremental operation
using a file-based cache of export data and other serializable
records that are similar in character. It is critical that it
be able to accurately recover file position information for
dependencies.

This change causes the iexport function to encode each object's
token.Pos as a pair (file, offset), where file indicates the
token.File and offset is a byte offset within the file.
The token.File is serialized as a filename, a file size, and a
delta-encoded line-offset table. (We discard the lineInfos table that
supports //line directives because gopls no longer uses it.)
The iimport function constructs a token.File and calls SetLines,
and then all token.Pos values work exactly as they would with
source.

This causes about a 74% increase in size of the shallow export
data for the standard library: was 564KB, now 982KB.

token.File has a SetLines method but no corresponding accessor.
This change must therefore resort to... unorthodox methods (an
unsafe cast to a mirror of the struct) to retrieve the lines
field. Suggestions welcome.

This alternative encoding is enabled by using "shallow" mode,
which is effectively a license for gopls to do whatever it wants.
Again, suggestions welcome.

Updates golang/go#57708

Change-Id: I028ed669161e38a9a4672dd8d9cadb268a0cdd07
Reviewed-on: https://go-review.googlesource.com/c/tools/+/461215
Run-TryBot: Alan Donovan <adonovan@google.com>
gopls-CI: kokoro <noreply+kokoro@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Robert Findley <rfindley@google.com>
diff --git a/internal/gcimporter/iexport.go b/internal/gcimporter/iexport.go
index 61e1dd2..93a1846 100644
--- a/internal/gcimporter/iexport.go
+++ b/internal/gcimporter/iexport.go
@@ -21,6 +21,8 @@
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
+	"unsafe"
 
 	"golang.org/x/tools/internal/typeparams"
 )
@@ -100,6 +102,7 @@
 		shallow:     shallow,
 		allPkgs:     map[*types.Package]bool{},
 		stringIndex: map[string]uint64{},
+		fileIndex:   map[*token.File]uint64{},
 		declIndex:   map[types.Object]uint64{},
 		tparamNames: map[types.Object]string{},
 		typIndex:    map[types.Type]uint64{},
@@ -163,11 +166,17 @@
 	}
 	hdr.uint64(uint64(p.version))
 	hdr.uint64(uint64(p.strings.Len()))
+	if p.shallow {
+		hdr.uint64(uint64(p.files.Len()))
+	}
 	hdr.uint64(dataLen)
 
 	// Flush output.
 	io.Copy(out, &hdr)
 	io.Copy(out, &p.strings)
+	if p.shallow {
+		io.Copy(out, &p.files)
+	}
 	io.Copy(out, &p.data0)
 
 	return nil
@@ -255,6 +264,11 @@
 	strings     intWriter
 	stringIndex map[string]uint64
 
+	// In shallow mode, object positions are encoded as (file, offset).
+	// Each file is recorded as a line-number table.
+	files     intWriter
+	fileIndex map[*token.File]uint64
+
 	data0       intWriter
 	declIndex   map[types.Object]uint64
 	tparamNames map[types.Object]string // typeparam->exported name
@@ -286,6 +300,35 @@
 	return off
 }
 
+// fileOff returns the offset in p.files of file's record (name, size,
+// line table), appending the record first if file is not yet present.
+func (p *iexporter) fileOff(file *token.File) uint64 {
+	off, ok := p.fileIndex[file]
+	if !ok {
+		off = uint64(p.files.Len())
+		p.fileIndex[file] = off
+
+		p.files.uint64(p.stringOff(file.Name()))
+		p.files.uint64(uint64(file.Size()))
+
+		// Delta-encode the line offsets, omitting the initial zero.
+		// (An empty file has an empty lines array.)
+		//
+		// TODO(adonovan): opt: use a two-pass approach that
+		// first gathers the set of Pos values and then
+		// encodes only the information necessary for them.
+		// This would allow us to discard the lines after the
+		// last object of interest and to run-length encode the
+		// trivial lines between lines with needed positions.
+		lines := getLines(file)
+		p.files.uint64(uint64(len(lines))) // count includes the unwritten first entry
+		for i := 1; i < len(lines); i++ {
+			p.files.uint64(uint64(lines[i] - lines[i-1]))
+		}
+	}
+	return off
+}
+
 // pushDecl adds n to the declaration work queue, if not already present.
 func (p *iexporter) pushDecl(obj types.Object) {
 	// Package unsafe is known to the compiler and predeclared.
@@ -464,13 +507,29 @@
 }
 
 func (w *exportWriter) pos(pos token.Pos) {
-	if w.p.version >= iexportVersionPosCol {
+	if w.p.shallow {
+		w.posV2(pos)
+	} else if w.p.version >= iexportVersionPosCol {
 		w.posV1(pos)
 	} else {
 		w.posV0(pos)
 	}
 }
 
+// posV2 encoding (used only in shallow mode) records a position as
+// (file, offset): file is 1 plus the offset of the token.File's record
+// in the file table (a lone 0 encodes NoPos), and offset is a byte
+// offset within that file. It effectively ignores //line directives.
+func (w *exportWriter) posV2(pos token.Pos) {
+	if pos == token.NoPos {
+		w.uint64(0)
+		return
+	}
+	file := w.p.fset.File(pos) // fset must be non-nil
+	w.uint64(1 + w.p.fileOff(file))
+	w.uint64(uint64(file.Offset(pos)))
+}
+
 func (w *exportWriter) posV1(pos token.Pos) {
 	if w.p.fset == nil {
 		w.int64(0)
@@ -1060,3 +1119,50 @@
 	q.head++
 	return obj
 }
+
+// getLines returns file's table of line-start offsets, read (unsafely) from its unexported lines field.
+func getLines(file *token.File) []int {
+	// Use this variant once proposal golang/go#57708 is implemented:
+	//
+	// if file, ok := (interface{})(file).(interface{ Lines() []int }); ok {
+	// 	return file.Lines()
+	// }
+
+	// This declaration must match that of token.File.
+	// This creates a risk of dependency skew.
+	// For now we check that the size of the two
+	// declarations is the same, on the (fragile) assumption
+	// that future changes would add fields.
+	type tokenFile119 struct {
+		_     string
+		_     int
+		_     int
+		mu    sync.Mutex // we're not complete monsters
+		lines []int
+		_     []struct{}
+	}
+	type tokenFile118 struct {
+		_ *token.FileSet // deleted in go1.19
+		tokenFile119
+	}
+
+	type uP = unsafe.Pointer
+	switch unsafe.Sizeof(*file) {
+	case unsafe.Sizeof(tokenFile118{}):
+		var ptr *tokenFile118
+		*(*uP)(uP(&ptr)) = uP(file) // point ptr at file, viewed as the mirror struct
+		ptr.mu.Lock()
+		defer ptr.mu.Unlock()
+		return ptr.lines // NOTE: the live slice, not a copy
+
+	case unsafe.Sizeof(tokenFile119{}):
+		var ptr *tokenFile119
+		*(*uP)(uP(&ptr)) = uP(file) // point ptr at file, viewed as the mirror struct
+		ptr.mu.Lock()
+		defer ptr.mu.Unlock()
+		return ptr.lines // NOTE: the live slice, not a copy
+
+	default:
+		panic("unexpected token.File size")
+	}
+}
diff --git a/internal/gcimporter/iimport.go b/internal/gcimporter/iimport.go
index a1c4696..cb75952 100644
--- a/internal/gcimporter/iimport.go
+++ b/internal/gcimporter/iimport.go
@@ -137,12 +137,18 @@
 	}
 
 	sLen := int64(r.uint64())
+	var fLen int64
+	if insert != nil {
+		// shallow mode uses a different position encoding
+		fLen = int64(r.uint64())
+	}
 	dLen := int64(r.uint64())
 
 	whence, _ := r.Seek(0, io.SeekCurrent)
 	stringData := data[whence : whence+sLen]
-	declData := data[whence+sLen : whence+sLen+dLen]
-	r.Seek(sLen+dLen, io.SeekCurrent)
+	fileData := data[whence+sLen : whence+sLen+fLen]
+	declData := data[whence+sLen+fLen : whence+sLen+fLen+dLen]
+	r.Seek(sLen+fLen+dLen, io.SeekCurrent)
 
 	p := iimporter{
 		version: int(version),
@@ -151,6 +157,8 @@
 
 		stringData:  stringData,
 		stringCache: make(map[uint64]string),
+		fileData:    fileData,
+		fileCache:   make(map[uint64]*token.File),
 		pkgCache:    make(map[uint64]*types.Package),
 
 		declData: declData,
@@ -280,6 +288,8 @@
 
 	stringData  []byte
 	stringCache map[uint64]string
+	fileData    []byte
+	fileCache   map[uint64]*token.File
 	pkgCache    map[uint64]*types.Package
 
 	declData    []byte
@@ -352,6 +362,29 @@
 	return s
 }
 
+func (p *iimporter) fileAt(off uint64) *token.File {
+	file, ok := p.fileCache[off]
+	if !ok { // not yet decoded: build a token.File from the record at p.fileData[off:]
+		rd := intReader{bytes.NewReader(p.fileData[off:]), p.ipath}
+		filename := p.stringAt(rd.uint64())
+		size := int(rd.uint64())
+		file = p.fake.fset.AddFile(filename, -1, size) // base -1: let the FileSet pick the next base
+
+		if n := int(rd.uint64()); n > 0 {
+			lines := make([]int, n) // initial element always implicitly zero
+			for i := 1; i < n; i++ {
+				lines[i] = lines[i-1] + int(rd.uint64())
+			}
+			if !file.SetLines(lines) {
+				errorf("SetLines failed") // can't happen
+			}
+		}
+
+		p.fileCache[off] = file
+	}
+	return file
+}
+
 func (p *iimporter) pkgAt(off uint64) *types.Package {
 	if pkg, ok := p.pkgCache[off]; ok {
 		return pkg
@@ -645,6 +678,9 @@
 }
 
 func (r *importReader) pos() token.Pos {
+	if r.p.insert != nil { // shallow mode
+		return r.posv2()
+	}
 	if r.p.version >= iexportVersionPosCol {
 		r.posv1()
 	} else {
@@ -681,6 +717,15 @@
 	}
 }
 
+func (r *importReader) posv2() token.Pos {
+	file := r.uint64()
+	if file == 0 { // a lone 0 encodes NoPos; no offset follows
+		return token.NoPos
+	}
+	tf := r.p.fileAt(file - 1) // file is 1 + the record's offset in the file table
+	return tf.Pos(int(r.uint64()))
+}
+
 func (r *importReader) typ() types.Type {
 	return r.p.typAt(r.uint64(), nil)
 }
diff --git a/internal/gcimporter/shallow_test.go b/internal/gcimporter/shallow_test.go
index 3d8c86a..443bb30 100644
--- a/internal/gcimporter/shallow_test.go
+++ b/internal/gcimporter/shallow_test.go
@@ -10,6 +10,8 @@
 	"go/parser"
 	"go/token"
 	"go/types"
+	"os"
+	"strings"
 	"testing"
 
 	"golang.org/x/sync/errgroup"
@@ -153,6 +155,7 @@
 
 	// Type-check the syntax trees.
 	tpkg, _ := cfg.Check(ppkg.PkgPath, fset, syntax, nil)
+	postTypeCheck(t, fset, tpkg)
 
 	// Save the export data.
 	data, err := gcimporter.IExportShallow(fset, tpkg)
@@ -161,3 +164,30 @@
 	}
 	ppkg.ExportFile = string(data)
 }
+
+// postTypeCheck is called after a package is type checked.
+// We use it to assert additional correctness properties,
+// for example, that the apparent location of "fmt.Println"
+// corresponds to its source location: in other words,
+// export+import preserves high-fidelity positions.
+func postTypeCheck(t *testing.T, fset *token.FileSet, pkg *types.Package) {
+	if pkg.Path() == "fmt" {
+		obj := pkg.Scope().Lookup("Println")
+		posn := fset.Position(obj.Pos())
+		data, err := os.ReadFile(posn.Filename)
+		if err != nil {
+			t.Errorf("can't read source file declaring fmt.Println: %v", err)
+			return
+		}
+		// Check line and column (both are 1-based, hence the -1 adjustments).
+		line := strings.Split(string(data), "\n")[posn.Line-1]
+
+		if id := line[posn.Column-1 : posn.Column-1+len(obj.Name())]; id != "Println" {
+			t.Errorf("%+v: expected declaration of fmt.Println at this line, column; got %q", posn, line)
+		}
+		// Check offset (a 0-based byte offset into the file contents).
+		if id := string(data[posn.Offset : posn.Offset+len(obj.Name())]); id != "Println" {
+			t.Errorf("%+v: expected declaration of fmt.Println at this offset; got %q", posn, id)
+		}
+	}
+}