internal/diff: avoid unnecessary allocations Previously, a diff of non-ASCII strings would incur three complete copies of the buffers; this changes reduces it to one. Also: - add diff.Bytes function, to avoid unnecessary conversions. - remove unused diff.Lines and LineEdits functions from API. - remove TODO to use []bytes everywhere. We tried it in CL 439277 and didn't like it. - Document that the diff is textual, even when given []byte. Change-Id: I2da3257cc3d12c569218a2d7ce182452e8647a96 Reviewed-on: https://go-review.googlesource.com/c/tools/+/439835 Reviewed-by: Robert Findley <rfindley@google.com> Run-TryBot: Alan Donovan <adonovan@google.com> Auto-Submit: Alan Donovan <adonovan@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> gopls-CI: kokoro <noreply+kokoro@google.com>

commit: f2c4579afe5fead8846586e49d2cd2ad63518191 [log] [tgz]
author: Alan Donovan <adonovan@google.com> Thu Oct 06 16:11:42 2022 -0400
committer: Gopher Robot <gobot@golang.org> Fri Oct 07 14:48:35 2022 +0000
tree: d4daf8087df071957bfce562936874bcff7ac768
parent: 60ddccae8574e69e18ab4c19979ee15bfdf95af6 [diff] [blame]
diff --git a/internal/diff/ndiff.go b/internal/diff/ndiff.go
index e76d2db..722b50c 100644
--- a/internal/diff/ndiff.go
+++ b/internal/diff/ndiff.go

@@ -5,111 +5,104 @@
 package diff
 
 import (
-	"strings"
+	"bytes"
 	"unicode/utf8"
 
 	"golang.org/x/tools/internal/diff/lcs"
 )
 
+// Strings computes the differences between two strings.
+// The resulting edits respect rune boundaries.
+func Strings(before, after string) []Edit {
+	if before == after {
+		return nil // common case
+	}
+
+	if stringIsASCII(before) && stringIsASCII(after) {
+		return diffASCII([]byte(before), []byte(after))
+	}
+	return diffRunes([]rune(before), []rune(after))
+}
+
+// Bytes computes the differences between two byte slices.
+// The resulting edits respect rune boundaries.
+func Bytes(before, after []byte) []Edit {
+	if bytes.Equal(before, after) {
+		return nil // common case
+	}
+
+	if bytesIsASCII(before) && bytesIsASCII(after) {
+		return diffASCII(before, after)
+	}
+	return diffRunes(runes(before), runes(after))
+}
+
+func diffASCII(before, after []byte) []Edit {
+	diffs, _ := lcs.Compute(before, after, maxDiffs/2)
+
+	// Convert from LCS diffs.
+	res := make([]Edit, len(diffs))
+	for i, d := range diffs {
+		res[i] = Edit{d.Start, d.End, d.Text}
+	}
+	return res
+}
+
+func diffRunes(before, after []rune) []Edit {
+	diffs, _ := lcs.Compute(before, after, maxDiffs/2)
+
+	// The diffs returned by the lcs package use indexes
+	// into whatever slice was passed in.
+	// Convert rune offsets to byte offsets.
+	res := make([]Edit, len(diffs))
+	lastEnd := 0
+	utf8Len := 0
+	for i, d := range diffs {
+		utf8Len += runesLen(before[lastEnd:d.Start]) // text between edits
+		start := utf8Len
+		utf8Len += runesLen(before[d.Start:d.End]) // text deleted by this edit
+		res[i] = Edit{start, utf8Len, d.Text}
+		lastEnd = d.End
+	}
+	return res
+}
+
 // maxDiffs is a limit on how deeply the lcs algorithm should search
 // the value is just a guess
 const maxDiffs = 30
 
-// Strings computes the differences between two strings.
-// (Both it and the diff in the myers package have type ComputeEdits, which
-// is why the arguments are strings, not []bytes.)
-// TODO(adonovan): opt: consider switching everything to []bytes, if
-// that's the more common type in practice. Or provide both flavors?
-func Strings(before, after string) []Edit {
-	if before == after {
-		// very frequently true
-		return nil
+// runes is like []rune(string(bytes)) without the duplicate allocation.
+func runes(bytes []byte) []rune {
+	n := utf8.RuneCount(bytes)
+	runes := make([]rune, n)
+	for i := 0; i < n; i++ {
+		r, sz := utf8.DecodeRune(bytes)
+		bytes = bytes[sz:]
+		runes[i] = r
 	}
-	// The diffs returned by the lcs package use indexes into
-	// whatever slice was passed in. Edits use byte offsets, so
-	// rune or line offsets need to be converted.
-	// TODO(adonovan): opt: eliminate all the unnecessary allocations.
-	var diffs []lcs.Diff
-	if !isASCII(before) || !isASCII(after) {
-		diffs, _ = lcs.Compute([]rune(before), []rune(after), maxDiffs/2)
-		diffs = runeOffsets(diffs, []rune(before))
-	} else {
-		// Common case: pure ASCII. Avoid expansion to []rune slice.
-		diffs, _ = lcs.Compute([]byte(before), []byte(after), maxDiffs/2)
-	}
-	return convertDiffs(diffs)
+	return runes
 }
 
-// Lines computes the differences between two list of lines.
-// TODO(adonovan): unused except by its test. Do we actually need it?
-func Lines(before, after []string) []Edit {
-	diffs, _ := lcs.Compute(before, after, maxDiffs/2)
-	diffs = lineOffsets(diffs, before)
-	return convertDiffs(diffs)
-	// the code is not coping with possible missing \ns at the ends
-}
-
-func convertDiffs(diffs []lcs.Diff) []Edit {
-	ans := make([]Edit, len(diffs))
-	for i, d := range diffs {
-		ans[i] = Edit{d.Start, d.End, d.Text}
+// runesLen returns the length in bytes of the UTF-8 encoding of runes.
+func runesLen(runes []rune) (len int) {
+	for _, r := range runes {
+		len += utf8.RuneLen(r)
 	}
-	return ans
-}
-
-// convert diffs with rune offsets into diffs with byte offsets
-func runeOffsets(diffs []lcs.Diff, src []rune) []lcs.Diff {
-	var idx int
-	var tmp strings.Builder // string because []byte([]rune) is illegal
-	for i, d := range diffs {
-		tmp.WriteString(string(src[idx:d.Start]))
-		v := tmp.Len()
-		tmp.WriteString(string(src[d.Start:d.End]))
-		d.Start = v
-		idx = d.End
-		d.End = tmp.Len()
-		diffs[i] = d
-	}
-	return diffs
-}
-
-// convert diffs with line offsets into diffs with byte offsets
-func lineOffsets(diffs []lcs.Diff, src []string) []lcs.Diff {
-	var idx int
-	var tmp strings.Builder // bytes/
-	for i, d := range diffs {
-		tmp.WriteString(strJoin(src[idx:d.Start]))
-		v := tmp.Len()
-		tmp.WriteString(strJoin(src[d.Start:d.End]))
-		d.Start = v
-		idx = d.End
-		d.End = tmp.Len()
-		diffs[i] = d
-	}
-	return diffs
-}
-
-// join lines. (strings.Join doesn't add a trailing separator)
-func strJoin(elems []string) string {
-	if len(elems) == 0 {
-		return ""
-	}
-	n := 0
-	for i := 0; i < len(elems); i++ {
-		n += len(elems[i])
-	}
-
-	var b strings.Builder
-	b.Grow(n)
-	for _, s := range elems {
-		b.WriteString(s)
-		//b.WriteByte('\n')
-	}
-	return b.String()
+	return len
 }
 
 // isASCII reports whether s contains only ASCII.
-func isASCII(s string) bool {
+// TODO(adonovan): combine when x/tools allows generics.
+func stringIsASCII(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] >= utf8.RuneSelf {
+			return false
+		}
+	}
+	return true
+}
+
+func bytesIsASCII(s []byte) bool {
 	for i := 0; i < len(s); i++ {
 		if s[i] >= utf8.RuneSelf {
 			return false
commit	f2c4579afe5fead8846586e49d2cd2ad63518191	[log] [tgz]
author	Alan Donovan <adonovan@google.com>	Thu Oct 06 16:11:42 2022 -0400
committer	Gopher Robot <gobot@golang.org>	Fri Oct 07 14:48:35 2022 +0000
tree	d4daf8087df071957bfce562936874bcff7ac768
parent	60ddccae8574e69e18ab4c19979ee15bfdf95af6 [diff] [blame]