Blame - internal/diffp/diff.go - tools

blob: aa5ef81ac2e1b8a6b70a1c8d2c439651e8e9949c [file] [log] [blame]

Russ Cox	58fedf6	2023-05-03 01:09:37 -0400	[diff] [blame]	1	// Copyright 2022 The Go Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style
				3	// license that can be found in the LICENSE file.
				4
				5	// Package diffp implements a basic diff algorithm equivalent to patience diff.
				6	// It is a copy of internal/diff from the main Go repo, renamed to diffp to avoid
				7	// conflict with the existing golang.org/x/tools/internal/diff.
				8	package diffp
				9
				10	import (
				11	"bytes"
				12	"fmt"
				13	"sort"
				14	"strings"
				15	)
				16
				17	// A pair is a pair of values tracked for both the x and y side of a diff.
				18	// It is typically a pair of line indexes.
				19	type pair struct{ x, y int }
				20
				21	// Diff returns an anchored diff of the two texts old and new
				22	// in the “unified diff” format. If old and new are identical,
				23	// Diff returns a nil slice (no output).
				24	//
				25	// Unix diff implementations typically look for a diff with
				26	// the smallest number of lines inserted and removed,
				27	// which can in the worst case take time quadratic in the
				28	// number of lines in the texts. As a result, many implementations
				29	// either can be made to run for a long time or cut off the search
				30	// after a predetermined amount of work.
				31	//
				32	// In contrast, this implementation looks for a diff with the
				33	// smallest number of “unique” lines inserted and removed,
				34	// where unique means a line that appears just once in both old and new.
				35	// We call this an “anchored diff” because the unique lines anchor
				36	// the chosen matching regions. An anchored diff is usually clearer
				37	// than a standard diff, because the algorithm does not try to
				38	// reuse unrelated blank lines or closing braces.
				39	// The algorithm also guarantees to run in O(n log n) time
				40	// instead of the standard O(n²) time.
				41	//
				42	// Some systems call this approach a “patience diff,” named for
				43	// the “patience sorting” algorithm, itself named for a solitaire card game.
				44	// We avoid that name for two reasons. First, the name has been used
				45	// for a few different variants of the algorithm, so it is imprecise.
				46	// Second, the name is frequently interpreted as meaning that you have
				47	// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
				48	// when in fact the algorithm is faster than the standard one.
				49	func Diff(oldName string, old []byte, newName string, new []byte) []byte {
				50	if bytes.Equal(old, new) {
				51	return nil
				52	}
				53	x := lines(old)
				54	y := lines(new)
				55
				56	// Print diff header.
				57	var out bytes.Buffer
				58	fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
				59	fmt.Fprintf(&out, "--- %s\n", oldName)
				60	fmt.Fprintf(&out, "+++ %s\n", newName)
				61
				62	// Loop over matches to consider,
				63	// expanding each match to include surrounding lines,
				64	// and then printing diff chunks.
				65	// To avoid setup/teardown cases outside the loop,
				66	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
				67	// in the sequence of matches.
				68	var (
				69	done pair // printed up to x[:done.x] and y[:done.y]
				70	chunk pair // start lines of current chunk
				71	count pair // number of lines from each side in current chunk
				72	ctext []string // lines for current chunk
				73	)
				74	for _, m := range tgs(x, y) {
				75	if m.x < done.x {
				76	// Already handled scanning forward from earlier match.
				77	continue
				78	}
				79
				80	// Expand matching lines as far possible,
				81	// establishing that x[start.x:end.x] == y[start.y:end.y].
				82	// Note that on the first (or last) iteration we may (or definitey do)
				83	// have an empty match: start.x==end.x and start.y==end.y.
				84	start := m
				85	for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
				86	start.x--
				87	start.y--
				88	}
				89	end := m
				90	for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
				91	end.x++
				92	end.y++
				93	}
				94
				95	// Emit the mismatched lines before start into this chunk.
				96	// (No effect on first sentinel iteration, when start = {0,0}.)
				97	for _, s := range x[done.x:start.x] {
				98	ctext = append(ctext, "-"+s)
				99	count.x++
				100	}
				101	for _, s := range y[done.y:start.y] {
				102	ctext = append(ctext, "+"+s)
				103	count.y++
				104	}
				105
				106	// If we're not at EOF and have too few common lines,
				107	// the chunk includes all the common lines and continues.
				108	const C = 3 // number of context lines
				109	if (end.x < len(x) \|\| end.y < len(y)) &&
				110	(end.x-start.x < C \|\| (len(ctext) > 0 && end.x-start.x < 2*C)) {
				111	for _, s := range x[start.x:end.x] {
				112	ctext = append(ctext, " "+s)
				113	count.x++
				114	count.y++
				115	}
				116	done = end
				117	continue
				118	}
				119
				120	// End chunk with common lines for context.
				121	if len(ctext) > 0 {
				122	n := end.x - start.x
				123	if n > C {
				124	n = C
				125	}
				126	for _, s := range x[start.x : start.x+n] {
				127	ctext = append(ctext, " "+s)
				128	count.x++
				129	count.y++
				130	}
				131	done = pair{start.x + n, start.y + n}
				132
				133	// Format and emit chunk.
				134	// Convert line numbers to 1-indexed.
				135	// Special case: empty file shows up as 0,0 not 1,0.
				136	if count.x > 0 {
				137	chunk.x++
				138	}
				139	if count.y > 0 {
				140	chunk.y++
				141	}
				142	fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
				143	for _, s := range ctext {
				144	out.WriteString(s)
				145	}
				146	count.x = 0
				147	count.y = 0
				148	ctext = ctext[:0]
				149	}
				150
				151	// If we reached EOF, we're done.
				152	if end.x >= len(x) && end.y >= len(y) {
				153	break
				154	}
				155
				156	// Otherwise start a new chunk.
				157	chunk = pair{end.x - C, end.y - C}
				158	for _, s := range x[chunk.x:end.x] {
				159	ctext = append(ctext, " "+s)
				160	count.x++
				161	count.y++
				162	}
				163	done = end
				164	}
				165
				166	return out.Bytes()
				167	}
				168
				169	// lines returns the lines in the file x, including newlines.
				170	// If the file does not end in a newline, one is supplied
				171	// along with a warning about the missing newline.
				172	func lines(x []byte) []string {
				173	l := strings.SplitAfter(string(x), "\n")
				174	if l[len(l)-1] == "" {
				175	l = l[:len(l)-1]
				176	} else {
				177	// Treat last line as having a message about the missing newline attached,
				178	// using the same text as BSD/GNU diff (including the leading backslash).
				179	l[len(l)-1] += "\n\\ No newline at end of file\n"
				180	}
				181	return l
				182	}
				183
				184	// tgs returns the pairs of indexes of the longest common subsequence
				185	// of unique lines in x and y, where a unique line is one that appears
				186	// once in x and once in y.
				187	//
				188	// The longest common subsequence algorithm is as described in
				189	// Thomas G. Szymanski, “A Special Case of the Maximal Common
				190	// Subsequence Problem,” Princeton TR #170 (January 1975),
				191	// available at https://research.swtch.com/tgs170.pdf.
				192	func tgs(x, y []string) []pair {
				193	// Count the number of times each string appears in a and b.
				194	// We only care about 0, 1, many, counted as 0, -1, -2
				195	// for the x side and 0, -4, -8 for the y side.
				196	// Using negative numbers now lets us distinguish positive line numbers later.
				197	m := make(map[string]int)
				198	for _, s := range x {
				199	if c := m[s]; c > -2 {
				200	m[s] = c - 1
				201	}
				202	}
				203	for _, s := range y {
				204	if c := m[s]; c > -8 {
				205	m[s] = c - 4
				206	}
				207	}
				208
				209	// Now unique strings can be identified by m[s] = -1+-4.
				210	//
				211	// Gather the indexes of those strings in x and y, building:
				212	// xi[i] = increasing indexes of unique strings in x.
				213	// yi[i] = increasing indexes of unique strings in y.
				214	// inv[i] = index j such that x[xi[i]] = y[yi[j]].
				215	var xi, yi, inv []int
				216	for i, s := range y {
				217	if m[s] == -1+-4 {
				218	m[s] = len(yi)
				219	yi = append(yi, i)
				220	}
				221	}
				222	for i, s := range x {
				223	if j, ok := m[s]; ok && j >= 0 {
				224	xi = append(xi, i)
				225	inv = append(inv, j)
				226	}
				227	}
				228
				229	// Apply Algorithm A from Szymanski's paper.
				230	// In those terms, A = J = inv and B = [0, n).
				231	// We add sentinel pairs {0,0}, and {len(x),len(y)}
				232	// to the returned sequence, to help the processing loop.
				233	J := inv
				234	n := len(xi)
				235	T := make([]int, n)
				236	L := make([]int, n)
				237	for i := range T {
				238	T[i] = n + 1
				239	}
				240	for i := 0; i < n; i++ {
				241	k := sort.Search(n, func(k int) bool {
				242	return T[k] >= J[i]
				243	})
				244	T[k] = J[i]
				245	L[i] = k + 1
				246	}
				247	k := 0
				248	for _, v := range L {
				249	if k < v {
				250	k = v
				251	}
				252	}
				253	seq := make([]pair, 2+k)
				254	seq[1+k] = pair{len(x), len(y)} // sentinel at end
				255	lastj := n
				256	for i := n - 1; i >= 0; i-- {
				257	if L[i] == k && J[i] < lastj {
				258	seq[k] = pair{xi[i], yi[J[i]]}
				259	k--
				260	}
				261	}
				262	seq[0] = pair{0, 0} // sentinel at start
				263	return seq
				264	}