internal/gcimporter: discard position info for unneeded lines

This change reduces the size of the shallow export data: instead
of saving the entire line-start offset table of each file, it
saves a sparse representation capable of restoring accurate
position information for the entirety of each line that contains
an encoded token.Pos, but no others.
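
For illustration, the per-file encoding implemented below records
the file name, the file size, the line count, and a delta-encoded
list of sparse (line index, line-start offset) pairs. A sketch
with hypothetical values: a 40-byte file whose lines start at
offsets {0, 10, 25}, with a single needed offset of 12 (on the
second line), is encoded as

	<nameOff>  // string-table offset of the file name
	40         // file size
	3          // number of lines
	1          // number of sparse pairs
	1, 10      // pair (1, 10): line index 1 starts at offset 10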

This reduces the total size of export data for the standard library
to 665KB, down 31% from 957KB, and only 18% more than the original
baseline of 564KB.

The shallow_test asserts (for a couple of specific objects) that
the end positions can be derived from the start positions
equivalently before or after converting token.Pos to
token.Position.
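
For example, for fmt.Println, whose identifier starts at some
Position (line L, column C, offset O), the test requires that

	fset.Position(obj.Pos() + token.Pos(len("Println")))

denote exactly (line L, column C+7, offset O+7).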

Change-Id: I5c2a1648398d0bee979dc36ba6224072ea183bec
Reviewed-on: https://go-review.googlesource.com/c/tools/+/462096
gopls-CI: kokoro <noreply+kokoro@google.com>
Run-TryBot: Alan Donovan <adonovan@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Robert Findley <rfindley@google.com>
diff --git a/internal/gcimporter/iexport.go b/internal/gcimporter/iexport.go
index fa6fa3d..5e01184 100644
--- a/internal/gcimporter/iexport.go
+++ b/internal/gcimporter/iexport.go
@@ -147,24 +147,7 @@
 		fileOffset = make([]uint64, len(p.fileInfos))
 		for i, info := range p.fileInfos {
 			fileOffset[i] = uint64(files.Len())
-
-			files.uint64(p.stringOff(info.file.Name()))
-			files.uint64(uint64(info.file.Size()))
-
-			// Delta-encode the line offsets, omitting the initial zero.
-			// (An empty file has an empty lines array.)
-			//
-			// TODO(adonovan): opt: use a two-pass approach that
-			// first gathers the set of Pos values and then
-			// encodes only the information necessary for them.
-			// This would allow us to discard the lines after the
-			// last object of interest and to run-length encode the
-			// trivial lines between lines with needed positions.
-			lines := getLines(info.file)
-			files.uint64(uint64(len(lines)))
-			for i := 1; i < len(lines); i++ {
-				files.uint64(uint64(lines[i] - lines[i-1]))
-			}
+			p.encodeFile(&files, info.file, info.needed)
 		}
 	}
 
@@ -213,6 +196,55 @@
 	return nil
 }
 
+// encodeFile writes to w a representation of the file sufficient to
+// faithfully restore position information about all needed offsets.
+// It sorts, and thereby mutates, the needed slice.
+func (p *iexporter) encodeFile(w *intWriter, file *token.File, needed []uint64) {
+	_ = needed[0] // precondition: needed is non-empty
+
+	w.uint64(p.stringOff(file.Name()))
+
+	size := uint64(file.Size())
+	w.uint64(size)
+
+	// Sort the set of needed offsets. Duplicates are harmless.
+	sort.Slice(needed, func(i, j int) bool { return needed[i] < needed[j] })
+
+	lines := getLines(file) // byte offset of each line start
+	w.uint64(uint64(len(lines)))
+
+	// Rather than record the entire array of line start offsets,
+	// we save only a sparse list of (index, offset) pairs for
+	// the start of each line that contains a needed position.
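+	// For example (hypothetical values): for lines starting at
+	// offsets {0, 10, 25, 40} and needed offsets {12, 30}, the
+	// sparse list is {(1, 10), (2, 25)}, since only the second
+	// and third lines contain a needed offset.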
+	var sparse [][2]int // (index, offset) pairs
+outer:
+	for i, lineStart := range lines {
+		lineEnd := size
+		if i < len(lines)-1 {
+			lineEnd = uint64(lines[i+1])
+		}
+		// Does this line contain a needed offset?
+		if needed[0] < lineEnd {
+			sparse = append(sparse, [2]int{i, lineStart})
+			for needed[0] < lineEnd {
+				needed = needed[1:]
+				if len(needed) == 0 {
+					break outer
+				}
+			}
+		}
+	}
+
+	// Delta-encode each column (line index, line-start offset)
+	// of the sparse table.
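+	// Continuing the example above, the pairs {(1, 10), (2, 25)}
+	// are written as the count 2 followed by the deltas (1, 10)
+	// and (1, 15).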
+	w.uint64(uint64(len(sparse)))
+	var prev [2]int
+	for _, pair := range sparse {
+		w.uint64(uint64(pair[0] - prev[0]))
+		w.uint64(uint64(pair[1] - prev[1]))
+		prev = pair
+	}
+}
+
 // writeIndex writes out an object index. mainIndex indicates whether
 // we're writing out the main index, which is also read by
 // non-compiler tools and includes a complete package description
@@ -311,7 +343,7 @@
 
 type filePositions struct {
 	file   *token.File
-	needed []token.Pos // unordered list of needed positions
+	needed []uint64 // unordered list of needed file offsets
 }
 
 func (p *iexporter) trace(format string, args ...interface{}) {
@@ -337,8 +369,8 @@
 	return off
 }
 
-// fileIndex returns the index of the token.File.
-func (p *iexporter) fileIndex(file *token.File, pos token.Pos) uint64 {
+// fileIndex returns the index of the token.File and the byte offset of pos within it.
+func (p *iexporter) fileIndexAndOffset(file *token.File, pos token.Pos) (uint64, uint64) {
 	index, ok := p.fileInfo[file]
 	if !ok {
 		index = uint64(len(p.fileInfo))
@@ -348,10 +380,12 @@
 		}
 		p.fileInfo[file] = index
 	}
-	// Record each needed position.
+	// Record each needed offset.
 	info := p.fileInfos[index]
-	info.needed = append(info.needed, pos)
-	return index
+	offset := uint64(file.Offset(pos))
+	info.needed = append(info.needed, offset)
+
+	return index, offset
 }
 
 // pushDecl adds n to the declaration work queue, if not already present.
@@ -551,8 +585,9 @@
 		return
 	}
 	file := w.p.fset.File(pos) // fset must be non-nil
-	w.uint64(1 + w.p.fileIndex(file, pos))
-	w.uint64(uint64(file.Offset(pos)))
+	index, offset := w.p.fileIndexAndOffset(file, pos)
+	w.uint64(1 + index)
+	w.uint64(offset)
 }
 
 func (w *exportWriter) posV1(pos token.Pos) {
diff --git a/internal/gcimporter/iimport.go b/internal/gcimporter/iimport.go
index df381d3..448f903 100644
--- a/internal/gcimporter/iimport.go
+++ b/internal/gcimporter/iimport.go
@@ -373,26 +373,51 @@
 	file := p.fileCache[index]
 	if file == nil {
 		off := p.fileOffset[index]
-		rd := intReader{bytes.NewReader(p.fileData[off:]), p.ipath}
-		filename := p.stringAt(rd.uint64())
-		size := int(rd.uint64())
-		file = p.fake.fset.AddFile(filename, -1, size)
-
-		if n := int(rd.uint64()); n > 0 {
-			lines := make([]int, n) // initial element always implicitly zero
-			for i := 1; i < n; i++ {
-				lines[i] = lines[i-1] + int(rd.uint64())
-			}
-			if !file.SetLines(lines) {
-				errorf("SetLines failed") // can't happen
-			}
-		}
-
+		file = p.decodeFile(intReader{bytes.NewReader(p.fileData[off:]), p.ipath})
 		p.fileCache[index] = file
 	}
 	return file
 }
 
+func (p *iimporter) decodeFile(rd intReader) *token.File {
+	filename := p.stringAt(rd.uint64())
+	size := int(rd.uint64())
+	file := p.fake.fset.AddFile(filename, -1, size)
+
+	// SetLines requires a strictly increasing sequence of offsets.
+	// Because it is common for clients to derive the interval
+	// [start, start+len(name)] from a start position, and we
+	// want to ensure that the end offset falls on the same line,
+	// we fill the gaps in the sparse encoding with the largest
+	// values consistent with a strictly increasing sequence.
+	// This makes each needed line as long as possible, and avoids
+	// having to record the actual end offset of each needed line.
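+	//
+	// For example (hypothetical values): sparse points (1, 10) and
+	// (3, 25) in a 5-line file of size 40 expand to the line-start
+	// offsets {0, 10, 24, 25, 39}; lines 1 and 3 are exact, and the
+	// gaps are filled so that each needed line is as long as possible.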
+
+	lines := make([]int, int(rd.uint64()))
+	var index, offset int
+	for i, n := 0, int(rd.uint64()); i < n; i++ {
+		index += int(rd.uint64())
+		offset += int(rd.uint64())
+		lines[index] = offset
+
+		// Ensure monotonicity between points.
+		for j := index - 1; j > 0 && lines[j] == 0; j-- {
+			lines[j] = lines[j+1] - 1
+		}
+	}
+
+	// Ensure monotonicity after last point.
+	for j := len(lines) - 1; j > 0 && lines[j] == 0; j-- {
+		size--
+		lines[j] = size
+	}
+
+	if !file.SetLines(lines) {
+		errorf("SetLines failed: %d", lines) // can't happen
+	}
+	return file
+}
+
 func (p *iimporter) pkgAt(off uint64) *types.Package {
 	if pkg, ok := p.pkgCache[off]; ok {
 		return pkg
diff --git a/internal/gcimporter/shallow_test.go b/internal/gcimporter/shallow_test.go
index 443bb30..429c34b 100644
--- a/internal/gcimporter/shallow_test.go
+++ b/internal/gcimporter/shallow_test.go
@@ -171,23 +171,56 @@
 // corresponds to its source location: in other words,
 // export+import preserves high-fidelity positions.
 func postTypeCheck(t *testing.T, fset *token.FileSet, pkg *types.Package) {
-	if pkg.Path() == "fmt" {
-		obj := pkg.Scope().Lookup("Println")
-		posn := fset.Position(obj.Pos())
-		data, err := os.ReadFile(posn.Filename)
-		if err != nil {
-			t.Errorf("can't read source file declaring fmt.Println: %v", err)
-			return
-		}
-		// Check line and column.
-		line := strings.Split(string(data), "\n")[posn.Line-1]
+	// We hard-code a few interesting test-case objects.
+	var obj types.Object
+	switch pkg.Path() {
+	case "fmt":
+		// func fmt.Println
+		obj = pkg.Scope().Lookup("Println")
+	case "net/http":
+		// method (*http.Request).ParseForm
+		req := pkg.Scope().Lookup("Request")
+		obj, _, _ = types.LookupFieldOrMethod(req.Type(), true, pkg, "ParseForm")
+	default:
+		return
+	}
+	if obj == nil {
+		t.Errorf("object not found in package %s", pkg.Path())
+		return
+	}
 
-		if id := line[posn.Column-1 : posn.Column-1+len(obj.Name())]; id != "Println" {
-			t.Errorf("%+v: expected declaration of fmt.Println at this line, column; got %q", posn, line)
-		}
-		// Check offset.
-		if id := string(data[posn.Offset : posn.Offset+len(obj.Name())]); id != "Println" {
-			t.Errorf("%+v: expected declaration of fmt.Println at this offset; got %q", posn, id)
-		}
+	// Now check the source fidelity of the object's position.
+	posn := fset.Position(obj.Pos())
+	data, err := os.ReadFile(posn.Filename)
+	if err != nil {
+		t.Errorf("can't read source file declaring %v: %v", obj, err)
+		return
+	}
+
+	// Check that the line and column denote a source interval containing the object's identifier.
+	line := strings.Split(string(data), "\n")[posn.Line-1]
+
+	if id := line[posn.Column-1 : posn.Column-1+len(obj.Name())]; id != obj.Name() {
+		t.Errorf("%+v: expected declaration of %v at this line, column; got %q", posn, obj, line)
+	}
+
+	// Check offset.
+	if id := string(data[posn.Offset : posn.Offset+len(obj.Name())]); id != obj.Name() {
+		t.Errorf("%+v: expected declaration of %v at this offset; got %q", posn, obj, id)
+	}
+
+	// Check commutativity of Position() and start+len(name) operations:
+	// Position(startPos+len(name)) == Position(startPos) + len(name).
+	// This important property is a consequence of the way in which the
+	// decoder fills the gaps in the sparse line-start offset table.
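+	// (For example, if Println starts at column 6 of some line, its
+	// end must be reported at column 13 of the same line.)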
+	endPosn := fset.Position(obj.Pos() + token.Pos(len(obj.Name())))
+	wantEndPosn := token.Position{
+		Filename: posn.Filename,
+		Offset:   posn.Offset + len(obj.Name()),
+		Line:     posn.Line,
+		Column:   posn.Column + len(obj.Name()),
+	}
+	if endPosn != wantEndPosn {
+		t.Errorf("%+v: expected end Position of %v here; was at %+v", wantEndPosn, obj, endPosn)
 	}
 }