internal/gcimporter: discard position info for unneeded lines

This change reduces the size of the shallow export data: instead
of saving the entire line-start offset table of each file, it
saves a sparse representation capable of restoring accurate
position information for the entirety of each line that contains
an encoded token.Pos, but no others.
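
For illustration, the per-file encoding implemented below records
the file name, the file size, the line count, and a delta-encoded
list of sparse (line index, line-start offset) pairs. A sketch
with hypothetical values: a 40-byte file whose lines start at
offsets {0, 10, 25}, with a single needed offset of 12 (on the
second line), is encoded as

	<nameOff>  // string-table offset of the file name
	40         // file size
	3          // number of lines
	1          // number of sparse pairs
	1, 10      // pair (1, 10): line index 1 starts at offset 10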

This reduces the total size of export data for the standard library
to 665KB, down 31% from 957KB, and only 18% more than the original
baseline of 564KB.

The shallow_test asserts (for a couple of specific objects) that
the end positions can be derived from the start positions
equivalently before or after converting token.Pos to
token.Position.
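
For example, for fmt.Println, whose identifier starts at some
Position (line L, column C, offset O), the test requires that

	fset.Position(obj.Pos() + token.Pos(len("Println")))

denote exactly (line L, column C+7, offset O+7).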

Change-Id: I5c2a1648398d0bee979dc36ba6224072ea183bec
Reviewed-on: https://go-review.googlesource.com/c/tools/+/462096
gopls-CI: kokoro <noreply+kokoro@google.com>
Run-TryBot: Alan Donovan <adonovan@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Robert Findley <rfindley@google.com>
diff --git a/internal/gcimporter/iexport.go b/internal/gcimporter/iexport.go
index fa6fa3d..5e01184 100644
--- a/internal/gcimporter/iexport.go
+++ b/internal/gcimporter/iexport.go
@@ -147,24 +147,7 @@
 		fileOffset = make([]uint64, len(p.fileInfos))
 		for i, info := range p.fileInfos {
 			fileOffset[i] = uint64(files.Len())
-
-			files.uint64(p.stringOff(info.file.Name()))
-			files.uint64(uint64(info.file.Size()))
-
-			// Delta-encode the line offsets, omitting the initial zero.
-			// (An empty file has an empty lines array.)
-			//
-			// TODO(adonovan): opt: use a two-pass approach that
-			// first gathers the set of Pos values and then
-			// encodes only the information necessary for them.
-			// This would allow us to discard the lines after the
-			// last object of interest and to run-length encode the
-			// trivial lines between lines with needed positions.
-			lines := getLines(info.file)
-			files.uint64(uint64(len(lines)))
-			for i := 1; i < len(lines); i++ {
-				files.uint64(uint64(lines[i] - lines[i-1]))
-			}
+			p.encodeFile(&files, info.file, info.needed)
 		}
 	}
 
@@ -213,6 +196,55 @@
 	return nil
 }
 
+// encodeFile writes to w a representation of the file sufficient to
+// faithfully restore position information about all needed offsets.
+// It sorts, and thereby mutates, the needed slice.
+func (p *iexporter) encodeFile(w *intWriter, file *token.File, needed []uint64) {
+	_ = needed[0] // precondition: needed is non-empty
+
+	w.uint64(p.stringOff(file.Name()))
+
+	size := uint64(file.Size())
+	w.uint64(size)
+
+	// Sort the set of needed offsets. Duplicates are harmless.
+	sort.Slice(needed, func(i, j int) bool { return needed[i] < needed[j] })
+
+	lines := getLines(file) // byte offset of each line start
+	w.uint64(uint64(len(lines)))
+
+	// Rather than record the entire array of line start offsets,
+	// we save only a sparse list of (index, offset) pairs for
+	// the start of each line that contains a needed position.
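+	// For example (hypothetical values): for lines starting at
+	// offsets {0, 10, 25, 40} and needed offsets {12, 30}, the
+	// sparse list is {(1, 10), (2, 25)}, since only the second
+	// and third lines contain a needed offset.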
+	var sparse [][2]int // (index, offset) pairs
+outer:
+	for i, lineStart := range lines {
+		lineEnd := size
+		if i < len(lines)-1 {
+			lineEnd = uint64(lines[i+1])
+		}
+		// Does this line contain a needed offset?
+		if needed[0] < lineEnd {
+			sparse = append(sparse, [2]int{i, lineStart})
+			for needed[0] < lineEnd {
+				needed = needed[1:]
+				if len(needed) == 0 {
+					break outer
+				}
+			}
+		}
+	}
+
+	// Delta-encode each column (line index, line-start offset)
+	// of the sparse table.
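+	// Continuing the example above, the pairs {(1, 10), (2, 25)}
+	// are written as the count 2 followed by the deltas (1, 10)
+	// and (1, 15).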
+	w.uint64(uint64(len(sparse)))
+	var prev [2]int
+	for _, pair := range sparse {
+		w.uint64(uint64(pair[0] - prev[0]))
+		w.uint64(uint64(pair[1] - prev[1]))
+		prev = pair
+	}
+}
+
 // writeIndex writes out an object index. mainIndex indicates whether
 // we're writing out the main index, which is also read by
 // non-compiler tools and includes a complete package description
@@ -311,7 +343,7 @@
 
 type filePositions struct {
 	file   *token.File
-	needed []token.Pos // unordered list of needed positions
+	needed []uint64 // unordered list of needed file offsets
 }
 
 func (p *iexporter) trace(format string, args ...interface{}) {
@@ -337,8 +369,8 @@
 	return off
 }
 
-// fileIndex returns the index of the token.File.
-func (p *iexporter) fileIndex(file *token.File, pos token.Pos) uint64 {
+// fileIndex returns the index of the token.File and the byte offset of pos within it.
+func (p *iexporter) fileIndexAndOffset(file *token.File, pos token.Pos) (uint64, uint64) {
 	index, ok := p.fileInfo[file]
 	if !ok {
 		index = uint64(len(p.fileInfo))
@@ -348,10 +380,12 @@
 		}
 		p.fileInfo[file] = index
 	}
-	// Record each needed position.
+	// Record each needed offset.
 	info := p.fileInfos[index]
-	info.needed = append(info.needed, pos)
-	return index
+	offset := uint64(file.Offset(pos))
+	info.needed = append(info.needed, offset)
+
+	return index, offset
 }
 
 // pushDecl adds n to the declaration work queue, if not already present.
@@ -551,8 +585,9 @@
 		return
 	}
 	file := w.p.fset.File(pos) // fset must be non-nil
-	w.uint64(1 + w.p.fileIndex(file, pos))
-	w.uint64(uint64(file.Offset(pos)))
+	index, offset := w.p.fileIndexAndOffset(file, pos)
+	w.uint64(1 + index)
+	w.uint64(offset)
 }
 
 func (w *exportWriter) posV1(pos token.Pos) {
diff --git a/internal/gcimporter/iimport.go b/internal/gcimporter/iimport.go
index df381d3..448f903 100644
--- a/internal/gcimporter/iimport.go
+++ b/internal/gcimporter/iimport.go
@@ -373,26 +373,51 @@
 	file := p.fileCache[index]
 	if file == nil {
 		off := p.fileOffset[index]
-		rd := intReader{bytes.NewReader(p.fileData[off:]), p.ipath}
-		filename := p.stringAt(rd.uint64())
-		size := int(rd.uint64())
-		file = p.fake.fset.AddFile(filename, -1, size)
-
-		if n := int(rd.uint64()); n > 0 {
-			lines := make([]int, n) // initial element always implicitly zero
-			for i := 1; i < n; i++ {
-				lines[i] = lines[i-1] + int(rd.uint64())
-			}
-			if !file.SetLines(lines) {
-				errorf("SetLines failed") // can't happen
-			}
-		}
-
+		file = p.decodeFile(intReader{bytes.NewReader(p.fileData[off:]), p.ipath})
 		p.fileCache[index] = file
 	}
 	return file
 }
 
+func (p *iimporter) decodeFile(rd intReader) *token.File {
+	filename := p.stringAt(rd.uint64())
+	size := int(rd.uint64())
+	file := p.fake.fset.AddFile(filename, -1, size)
+
+	// SetLines requires a strictly increasing sequence of offsets.
+	// Because it is common for clients to derive the interval
+	// [start, start+len(name)] from a start position, and we
+	// want to ensure that the end offset falls on the same line,
+	// we fill the gaps in the sparse encoding with the largest
+	// values consistent with a strictly increasing sequence.
+	// This makes each needed line as long as possible, and avoids
+	// having to record the actual end offset of each needed line.
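+	//
+	// For example (hypothetical values): sparse points (1, 10) and
+	// (3, 25) in a 5-line file of size 40 expand to the line-start
+	// offsets {0, 10, 24, 25, 39}; lines 1 and 3 are exact, and the
+	// gaps are filled so that each needed line is as long as possible.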
+
+	lines := make([]int, int(rd.uint64()))
+	var index, offset int
+	for i, n := 0, int(rd.uint64()); i < n; i++ {
+		index += int(rd.uint64())
+		offset += int(rd.uint64())
+		lines[index] = offset
+
+		// Ensure monotonicity between points.
+		for j := index - 1; j > 0 && lines[j] == 0; j-- {
+			lines[j] = lines[j+1] - 1
+		}
+	}
+
+	// Ensure monotonicity after last point.
+	for j := len(lines) - 1; j > 0 && lines[j] == 0; j-- {
+		size--
+		lines[j] = size
+	}
+
+	if !file.SetLines(lines) {
+		errorf("SetLines failed: %d", lines) // can't happen
+	}
+	return file
+}
+
 func (p *iimporter) pkgAt(off uint64) *types.Package {
 	if pkg, ok := p.pkgCache[off]; ok {
 		return pkg
diff --git a/internal/gcimporter/shallow_test.go b/internal/gcimporter/shallow_test.go
index 443bb30..429c34b 100644
--- a/internal/gcimporter/shallow_test.go
+++ b/internal/gcimporter/shallow_test.go
@@ -171,23 +171,56 @@
 // corresponds to its source location: in other words,
 // export+import preserves high-fidelity positions.
 func postTypeCheck(t *testing.T, fset *token.FileSet, pkg *types.Package) {
-	if pkg.Path() == "fmt" {
-		obj := pkg.Scope().Lookup("Println")
-		posn := fset.Position(obj.Pos())
-		data, err := os.ReadFile(posn.Filename)
-		if err != nil {
-			t.Errorf("can't read source file declaring fmt.Println: %v", err)
-			return
-		}
-		// Check line and column.
-		line := strings.Split(string(data), "\n")[posn.Line-1]
+	// We hard-code a few interesting test-case objects.
+	var obj types.Object
+	switch pkg.Path() {
+	case "fmt":
+		// func fmt.Println
+		obj = pkg.Scope().Lookup("Println")
+	case "net/http":
+		// method (*http.Request).ParseForm
+		req := pkg.Scope().Lookup("Request")
+		obj, _, _ = types.LookupFieldOrMethod(req.Type(), true, pkg, "ParseForm")
+	default:
+		return
+	}
+	if obj == nil {
+		t.Errorf("object not found in package %s", pkg.Path())
+		return
+	}
 
-		if id := line[posn.Column-1 : posn.Column-1+len(obj.Name())]; id != "Println" {
-			t.Errorf("%+v: expected declaration of fmt.Println at this line, column; got %q", posn, line)
-		}
-		// Check offset.
-		if id := string(data[posn.Offset : posn.Offset+len(obj.Name())]); id != "Println" {
-			t.Errorf("%+v: expected declaration of fmt.Println at this offset; got %q", posn, id)
-		}
+	// Now check the source fidelity of the object's position.
+	posn := fset.Position(obj.Pos())
+	data, err := os.ReadFile(posn.Filename)
+	if err != nil {
+		t.Errorf("can't read source file declaring %v: %v", obj, err)
+		return
+	}
+
+	// Check that the line and column denote a source interval containing the object's identifier.
+	line := strings.Split(string(data), "\n")[posn.Line-1]
+
+	if id := line[posn.Column-1 : posn.Column-1+len(obj.Name())]; id != obj.Name() {
+		t.Errorf("%+v: expected declaration of %v at this line, column; got %q", posn, obj, line)
+	}
+
+	// Check offset.
+	if id := string(data[posn.Offset : posn.Offset+len(obj.Name())]); id != obj.Name() {
+		t.Errorf("%+v: expected declaration of %v at this offset; got %q", posn, obj, id)
+	}
+
+	// Check commutativity of Position() and start+len(name) operations:
+	// Position(startPos+len(name)) == Position(startPos) + len(name).
+	// This important property is a consequence of the way in which the
+	// decoder fills the gaps in the sparse line-start offset table.
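+	// (For example, if Println starts at column 6 of some line, its
+	// end must be reported at column 13 of the same line.)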
+	endPosn := fset.Position(obj.Pos() + token.Pos(len(obj.Name())))
+	wantEndPosn := token.Position{
+		Filename: posn.Filename,
+		Offset:   posn.Offset + len(obj.Name()),
+		Line:     posn.Line,
+		Column:   posn.Column + len(obj.Name()),
+	}
+	if endPosn != wantEndPosn {
+		t.Errorf("%+v: expected end Position of %v here; was at %+v", wantEndPosn, obj, endPosn)
 	}
 }