blob: aa5ef81ac2e1b8a6b70a1c8d2c439651e8e9949c [file] [log] [blame]
Russ Cox58fedf62023-05-03 01:09:37 -04001// Copyright 2022 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package diffp implements a basic diff algorithm equivalent to patience diff.
6// It is a copy of internal/diff from the main Go repo, renamed to diffp to avoid
7// conflict with the existing golang.org/x/tools/internal/diff.
8package diffp
9
10import (
11 "bytes"
12 "fmt"
13 "sort"
14 "strings"
15)
16
// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes.
type pair struct{ x, y int }
20
21// Diff returns an anchored diff of the two texts old and new
22// in the “unified diff” format. If old and new are identical,
23// Diff returns a nil slice (no output).
24//
25// Unix diff implementations typically look for a diff with
26// the smallest number of lines inserted and removed,
27// which can in the worst case take time quadratic in the
28// number of lines in the texts. As a result, many implementations
29// either can be made to run for a long time or cut off the search
30// after a predetermined amount of work.
31//
32// In contrast, this implementation looks for a diff with the
33// smallest number of “unique” lines inserted and removed,
34// where unique means a line that appears just once in both old and new.
35// We call this an “anchored diff” because the unique lines anchor
36// the chosen matching regions. An anchored diff is usually clearer
37// than a standard diff, because the algorithm does not try to
38// reuse unrelated blank lines or closing braces.
39// The algorithm also guarantees to run in O(n log n) time
40// instead of the standard O(n²) time.
41//
42// Some systems call this approach a “patience diff,” named for
43// the “patience sorting” algorithm, itself named for a solitaire card game.
44// We avoid that name for two reasons. First, the name has been used
45// for a few different variants of the algorithm, so it is imprecise.
46// Second, the name is frequently interpreted as meaning that you have
47// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
48// when in fact the algorithm is faster than the standard one.
49func Diff(oldName string, old []byte, newName string, new []byte) []byte {
50 if bytes.Equal(old, new) {
51 return nil
52 }
53 x := lines(old)
54 y := lines(new)
55
56 // Print diff header.
57 var out bytes.Buffer
58 fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
59 fmt.Fprintf(&out, "--- %s\n", oldName)
60 fmt.Fprintf(&out, "+++ %s\n", newName)
61
62 // Loop over matches to consider,
63 // expanding each match to include surrounding lines,
64 // and then printing diff chunks.
65 // To avoid setup/teardown cases outside the loop,
66 // tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
67 // in the sequence of matches.
68 var (
69 done pair // printed up to x[:done.x] and y[:done.y]
70 chunk pair // start lines of current chunk
71 count pair // number of lines from each side in current chunk
72 ctext []string // lines for current chunk
73 )
74 for _, m := range tgs(x, y) {
75 if m.x < done.x {
76 // Already handled scanning forward from earlier match.
77 continue
78 }
79
80 // Expand matching lines as far possible,
81 // establishing that x[start.x:end.x] == y[start.y:end.y].
82 // Note that on the first (or last) iteration we may (or definitey do)
83 // have an empty match: start.x==end.x and start.y==end.y.
84 start := m
85 for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
86 start.x--
87 start.y--
88 }
89 end := m
90 for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
91 end.x++
92 end.y++
93 }
94
95 // Emit the mismatched lines before start into this chunk.
96 // (No effect on first sentinel iteration, when start = {0,0}.)
97 for _, s := range x[done.x:start.x] {
98 ctext = append(ctext, "-"+s)
99 count.x++
100 }
101 for _, s := range y[done.y:start.y] {
102 ctext = append(ctext, "+"+s)
103 count.y++
104 }
105
106 // If we're not at EOF and have too few common lines,
107 // the chunk includes all the common lines and continues.
108 const C = 3 // number of context lines
109 if (end.x < len(x) || end.y < len(y)) &&
110 (end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
111 for _, s := range x[start.x:end.x] {
112 ctext = append(ctext, " "+s)
113 count.x++
114 count.y++
115 }
116 done = end
117 continue
118 }
119
120 // End chunk with common lines for context.
121 if len(ctext) > 0 {
122 n := end.x - start.x
123 if n > C {
124 n = C
125 }
126 for _, s := range x[start.x : start.x+n] {
127 ctext = append(ctext, " "+s)
128 count.x++
129 count.y++
130 }
131 done = pair{start.x + n, start.y + n}
132
133 // Format and emit chunk.
134 // Convert line numbers to 1-indexed.
135 // Special case: empty file shows up as 0,0 not 1,0.
136 if count.x > 0 {
137 chunk.x++
138 }
139 if count.y > 0 {
140 chunk.y++
141 }
142 fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
143 for _, s := range ctext {
144 out.WriteString(s)
145 }
146 count.x = 0
147 count.y = 0
148 ctext = ctext[:0]
149 }
150
151 // If we reached EOF, we're done.
152 if end.x >= len(x) && end.y >= len(y) {
153 break
154 }
155
156 // Otherwise start a new chunk.
157 chunk = pair{end.x - C, end.y - C}
158 for _, s := range x[chunk.x:end.x] {
159 ctext = append(ctext, " "+s)
160 count.x++
161 count.y++
162 }
163 done = end
164 }
165
166 return out.Bytes()
167}
168
// lines splits x into its component lines, each retaining its trailing
// newline. If x does not end in a newline, one is supplied for the final
// line along with the standard BSD/GNU diff warning about the missing
// newline (including the leading backslash), so output matches those tools.
func lines(x []byte) []string {
	split := strings.SplitAfter(string(x), "\n")
	last := len(split) - 1
	if split[last] != "" {
		// x did not end in a newline: attach one plus the warning text
		// to the final line.
		split[last] += "\n\\ No newline at end of file\n"
		return split
	}
	// x ended in a newline, so SplitAfter produced an empty trailing
	// element; drop it.
	return split[:last]
}
183
184// tgs returns the pairs of indexes of the longest common subsequence
185// of unique lines in x and y, where a unique line is one that appears
186// once in x and once in y.
187//
188// The longest common subsequence algorithm is as described in
189// Thomas G. Szymanski, “A Special Case of the Maximal Common
190// Subsequence Problem,” Princeton TR #170 (January 1975),
191// available at https://research.swtch.com/tgs170.pdf.
192func tgs(x, y []string) []pair {
193 // Count the number of times each string appears in a and b.
194 // We only care about 0, 1, many, counted as 0, -1, -2
195 // for the x side and 0, -4, -8 for the y side.
196 // Using negative numbers now lets us distinguish positive line numbers later.
197 m := make(map[string]int)
198 for _, s := range x {
199 if c := m[s]; c > -2 {
200 m[s] = c - 1
201 }
202 }
203 for _, s := range y {
204 if c := m[s]; c > -8 {
205 m[s] = c - 4
206 }
207 }
208
209 // Now unique strings can be identified by m[s] = -1+-4.
210 //
211 // Gather the indexes of those strings in x and y, building:
212 // xi[i] = increasing indexes of unique strings in x.
213 // yi[i] = increasing indexes of unique strings in y.
214 // inv[i] = index j such that x[xi[i]] = y[yi[j]].
215 var xi, yi, inv []int
216 for i, s := range y {
217 if m[s] == -1+-4 {
218 m[s] = len(yi)
219 yi = append(yi, i)
220 }
221 }
222 for i, s := range x {
223 if j, ok := m[s]; ok && j >= 0 {
224 xi = append(xi, i)
225 inv = append(inv, j)
226 }
227 }
228
229 // Apply Algorithm A from Szymanski's paper.
230 // In those terms, A = J = inv and B = [0, n).
231 // We add sentinel pairs {0,0}, and {len(x),len(y)}
232 // to the returned sequence, to help the processing loop.
233 J := inv
234 n := len(xi)
235 T := make([]int, n)
236 L := make([]int, n)
237 for i := range T {
238 T[i] = n + 1
239 }
240 for i := 0; i < n; i++ {
241 k := sort.Search(n, func(k int) bool {
242 return T[k] >= J[i]
243 })
244 T[k] = J[i]
245 L[i] = k + 1
246 }
247 k := 0
248 for _, v := range L {
249 if k < v {
250 k = v
251 }
252 }
253 seq := make([]pair, 2+k)
254 seq[1+k] = pair{len(x), len(y)} // sentinel at end
255 lastj := n
256 for i := n - 1; i >= 0; i-- {
257 if L[i] == k && J[i] < lastj {
258 seq[k] = pair{xi[i], yi[J[i]]}
259 k--
260 }
261 }
262 seq[0] = pair{0, 0} // sentinel at start
263 return seq
264}