internal/lsp/fuzzy: add a new fuzzy matcher optimized for Go symbols

For workspace symbol requests we use various workarounds to optimize
matches for Go symbol paths (specifically, to increase relevance toward
the right hand side of a symbol path). These workarounds have a
significant impact on performance.

The existing fuzzy matcher could also be optimized, but is hard to
modify safely. As an experiment, add a new simple-but-fast fuzzy matcher
to use as a point of reference.

Change-Id: Iacaf149dfaf75f25e13909145f9508c7eaedf1a2
Reviewed-on: https://go-review.googlesource.com/c/tools/+/338689
Trust: Robert Findley <rfindley@google.com>
Run-TryBot: Robert Findley <rfindley@google.com>
gopls-CI: kokoro <noreply+kokoro@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Peter Weinberger <pjw@google.com>
diff --git a/internal/lsp/fuzzy/symbol.go b/internal/lsp/fuzzy/symbol.go
new file mode 100644
index 0000000..062f491
--- /dev/null
+++ b/internal/lsp/fuzzy/symbol.go
@@ -0,0 +1,224 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fuzzy
+
+import (
+	"unicode"
+)
+
+// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
+// of the form:
+//  example.com/path/to/package.object.field
+//
+// Knowing that we are matching symbols like this allows us to make the
+// following optimizations:
+//  - We can incorporate right-to-left relevance directly into the score
+//    calculation.
+//  - We can match from right to left, discarding leading bytes if the input is
+//    too long.
+//  - We just take the right-most match without losing too much precision. This
+//    allows us to use an O(n) algorithm.
+//  - We can operate directly on chunked strings; in many cases we will
+//    be storing the package path and/or package name separately from the
+//    symbol or identifiers, so doing this avoids allocating strings.
+//  - We can return the index of the right-most match, allowing us to trim
+//    irrelevant qualification.
+//
+// This implementation is experimental, serving as a reference fast algorithm
+// to compare to the fuzzy algorithm implemented by Matcher.
+type SymbolMatcher struct {
+	// Using buffers of length 256 is both a reasonable size for most qualified
+	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
+	pattern     [256]rune
+	patternLen  uint8
+	inputBuffer [256]rune   // avoid allocating when considering chunks
+	roles       [256]uint32 // which roles does a rune play (word start, etc.)
+	segments    [256]uint8  // how many segments from the right is each rune
+}
+
+const (
+	segmentStart uint32 = 1 << iota
+	wordStart
+	separator
+)
+
+// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
+// search pattern.
+//
+// Currently this matcher only accepts case-insensitive fuzzy patterns.
+//
+// TODO(rfindley):
+//  - implement smart-casing
+//  - implement space-separated groups
+//  - implement ', ^, and $ modifiers
+//
+// An empty pattern matches no input.
+func NewSymbolMatcher(pattern string) *SymbolMatcher {
+	m := &SymbolMatcher{}
+	for _, p := range pattern {
+		m.pattern[m.patternLen] = unicode.ToLower(p)
+		m.patternLen++
+		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
+			// break at 255 so that we can represent patternLen with a uint8.
+			break
+		}
+	}
+	return m
+}
+
+// Match looks for the right-most match of the search pattern within the symbol
+// represented by concatenating the given chunks, returning its offset and
+// score.
+//
+// If a match is found, the first return value will hold the absolute byte
+// offset within all chunks for the start of the symbol. In other words, the
+// index of the match within strings.Join(chunks, ""). If no match is found,
+// the first return value will be -1.
+//
+// The second return value will be the score of the match, which is always
+// between 0 and 1, inclusive. A score of 0 indicates no match.
+func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
+	// Explicit behavior for an empty pattern.
+	//
+	// As a minor optimization, this also avoids nilness checks later on, since
+	// the compiler can prove that m != nil.
+	if m.patternLen == 0 {
+		return -1, 0
+	}
+
+	// First phase: populate the input buffer with lower-cased runes.
+	//
+	// We could also check for a forward match here, but since we'd have to write
+	// the entire input anyway this has negligible impact on performance.
+
+	var (
+		inputLen  = uint8(0)
+		modifiers = wordStart | segmentStart
+	)
+
+input:
+	for _, chunk := range chunks {
+		for _, r := range chunk {
+			if r == '.' || r == '/' {
+				modifiers |= separator
+			}
+			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
+			l := r
+			if r <= unicode.MaxASCII {
+				if 'A' <= r && r <= 'Z' {
+					l = r + 'a' - 'A'
+				}
+			} else {
+				l = unicode.ToLower(r)
+			}
+			if l != r {
+				modifiers |= wordStart
+			}
+			m.inputBuffer[inputLen] = l
+			m.roles[inputLen] = modifiers
+			inputLen++
+			if m.roles[inputLen-1]&separator != 0 {
+				modifiers = wordStart | segmentStart
+			} else {
+				modifiers = 0
+			}
+			// TODO: we should prefer the right-most input if it overflows, rather
+			//       than the left-most as we're doing here.
+			if inputLen == 255 {
+				break input
+			}
+		}
+	}
+
+	// Second phase: find the right-most match, and count segments from the
+	// right.
+
+	var (
+		pi    = uint8(m.patternLen - 1) // pattern index
+		p     = m.pattern[pi]           // pattern rune
+		start = -1                      // start offset of match
+		rseg  = uint8(0)
+	)
+	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
+
+	for ii := inputLen - 1; ; ii-- {
+		r := m.inputBuffer[ii]
+		if rseg < maxSeg && m.roles[ii]&separator != 0 {
+			rseg++
+		}
+		m.segments[ii] = rseg
+		if p == r {
+			if pi == 0 {
+				start = int(ii)
+				break
+			}
+			pi--
+			p = m.pattern[pi]
+		}
+		// Don't check ii >= 0 in the loop condition: ii is a uint8.
+		if ii == 0 {
+			break
+		}
+	}
+
+	if start < 0 {
+		// no match: skip scoring
+		return -1, 0
+	}
+
+	// Third phase: find the shortest match, and compute the score.
+
+	// Score is the average score for each character.
+	//
+	// A character score is the multiple of:
+	//   1. 1.0 if the character starts a segment, .8 if the character start a
+	//      mid-segment word, otherwise 0.6. This carries over to immediately
+	//      following characters.
+	//   2. 1.0 if the character is part of the last segment, otherwise
+	//      1.0-.2*<segments from the right>, with a max segment count of 3.
+	//
+	// This is a very naive algorithm, but it is fast. There's lots of prior art
+	// here, and we should leverage it. For example, we could explicitly consider
+	// character distance, and exact matches of words or segments.
+	//
+	// Also note that this might not actually find the highest scoring match, as
+	// doing so could require a non-linear algorithm, depending on how the score
+	// is calculated.
+
+	pi = 0
+	p = m.pattern[pi]
+
+	const (
+		segStreak  = 1.0
+		wordStreak = 0.8
+		noStreak   = 0.6
+		perSegment = 0.2 // we count at most 3 segments above
+	)
+
+	streakBonus := noStreak
+	totScore := 0.0
+	for ii := uint8(start); ii < inputLen; ii++ {
+		r := m.inputBuffer[ii]
+		if r == p {
+			pi++
+			p = m.pattern[pi]
+			// Note: this could be optimized with some bit operations.
+			switch {
+			case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
+				streakBonus = segStreak
+			case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
+				streakBonus = wordStreak
+			}
+			totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
+			if pi >= m.patternLen {
+				break
+			}
+		} else {
+			streakBonus = noStreak
+		}
+	}
+
+	return start, totScore / float64(m.patternLen)
+}
diff --git a/internal/lsp/fuzzy/symbol_test.go b/internal/lsp/fuzzy/symbol_test.go
new file mode 100644
index 0000000..9dc710e
--- /dev/null
+++ b/internal/lsp/fuzzy/symbol_test.go
@@ -0,0 +1,78 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fuzzy_test
+
+import (
+	"testing"
+
+	. "golang.org/x/tools/internal/lsp/fuzzy"
+)
+
+func TestSymbolMatchIndex(t *testing.T) {
+	tests := []struct {
+		pattern, input string
+		want           int
+	}{
+		{"test", "foo.TestFoo", 4},
+		{"test", "test", 0},
+		{"test", "Test", 0},
+		{"test", "est", -1},
+		{"t", "shortest", 7},
+		{"", "foo", -1},
+		{"", string([]rune{0}), -1}, // verify that we don't default to an empty pattern.
+		{"anything", "", -1},
+	}
+
+	for _, test := range tests {
+		matcher := NewSymbolMatcher(test.pattern)
+		if got, _ := matcher.Match([]string{test.input}); got != test.want {
+			t.Errorf("NewSymbolMatcher(%q).Match(%q) = %v, _, want %v, _", test.pattern, test.input, got, test.want)
+		}
+	}
+}
+
+func TestSymbolRanking(t *testing.T) {
+	matcher := NewSymbolMatcher("test")
+
+	// symbols to match, in ascending order of ranking.
+	symbols := []string{
+		"this.is.better.than.most",
+		"test.foo.bar",
+		"atest",
+		"thebest",
+		"test.foo",
+		"tTest",
+		"test.foo",
+		"foo.test",
+		"test",
+	}
+	prev := 0.0
+	for _, sym := range symbols {
+		_, score := matcher.Match([]string{sym})
+		t.Logf("Match(%q) = %v", sym, score)
+		if score < prev {
+			t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
+		}
+		prev = score
+	}
+}
+
+func TestChunkedMatch(t *testing.T) {
+	matcher := NewSymbolMatcher("test")
+
+	chunked := [][]string{
+		{"test"},
+		{"", "test"},
+		{"test", ""},
+		{"te", "st"},
+	}
+
+	for _, chunks := range chunked {
+		offset, score := matcher.Match(chunks)
+		if offset != 0 || score != 1.0 {
+			t.Errorf("Match(%v) = %v, %v, want 0, 1.0", chunks, offset, score)
+		}
+	}
+}