internal/fuzzy: improvements to the symbol scoring algorithm
Based on feedback in golang/go#60027, tweak the fuzzy symbol scoring
algorithm to much more strongly prefer sequential and exact matches.
Fixes golang/go#60027
Change-Id: I1c6d019065c4dff4adf2db9e94397a635e13d50f
Reviewed-on: https://go-review.googlesource.com/c/tools/+/493623
gopls-CI: kokoro <noreply+kokoro@google.com>
Run-TryBot: Robert Findley <rfindley@google.com>
Reviewed-by: Paul Jolly <paul@myitcv.org.uk>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Alan Donovan <adonovan@google.com>
diff --git a/gopls/internal/lsp/cmd/usage/workspace_symbol.hlp b/gopls/internal/lsp/cmd/usage/workspace_symbol.hlp
index a61b47b..ed22e98 100644
--- a/gopls/internal/lsp/cmd/usage/workspace_symbol.hlp
+++ b/gopls/internal/lsp/cmd/usage/workspace_symbol.hlp
@@ -9,5 +9,5 @@
workspace_symbol-flags:
-matcher=string
- specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.
- The default is caseInsensitive.
+ specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.
+ The default is caseinsensitive.
diff --git a/gopls/internal/lsp/cmd/workspace_symbol.go b/gopls/internal/lsp/cmd/workspace_symbol.go
index ed28df0..520d6bc 100644
--- a/gopls/internal/lsp/cmd/workspace_symbol.go
+++ b/gopls/internal/lsp/cmd/workspace_symbol.go
@@ -8,6 +8,7 @@
"context"
"flag"
"fmt"
+ "strings"
"golang.org/x/tools/gopls/internal/lsp/protocol"
"golang.org/x/tools/gopls/internal/lsp/source"
@@ -16,7 +17,7 @@
// workspaceSymbol implements the workspace_symbol verb for gopls.
type workspaceSymbol struct {
- Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, caseSensitive, or caseInsensitive.\nThe default is caseInsensitive."`
+ Matcher string `flag:"matcher" help:"specifies the type of matcher: fuzzy, fastfuzzy, casesensitive, or caseinsensitive.\nThe default is caseinsensitive."`
app *Application
}
@@ -46,10 +47,10 @@
if opts != nil {
opts(o)
}
- switch r.Matcher {
+ switch strings.ToLower(r.Matcher) {
case "fuzzy":
o.SymbolMatcher = source.SymbolFuzzy
- case "caseSensitive":
+ case "casesensitive":
o.SymbolMatcher = source.SymbolCaseSensitive
case "fastfuzzy":
o.SymbolMatcher = source.SymbolFastFuzzy
diff --git a/gopls/internal/lsp/source/workspace_symbol.go b/gopls/internal/lsp/source/workspace_symbol.go
index 4a65d52..bf92c77 100644
--- a/gopls/internal/lsp/source/workspace_symbol.go
+++ b/gopls/internal/lsp/source/workspace_symbol.go
@@ -484,7 +484,10 @@
// every field or method nesting level to access the field decreases
// the score by a factor of 1.0 - depth*depthFactor, up to a depth of
// 3.
- depthFactor = 0.2
+ //
+ // Use a small constant here, as this exists mostly to break ties
+ // (e.g. given a type Foo and a field x.Foo, prefer Foo).
+ depthFactor = 0.01
)
startWord := true
diff --git a/internal/fuzzy/symbol.go b/internal/fuzzy/symbol.go
index 073a4cd..bf93041 100644
--- a/internal/fuzzy/symbol.go
+++ b/internal/fuzzy/symbol.go
@@ -26,9 +26,6 @@
// symbol or identifiers, so doing this avoids allocating strings.
// - We can return the index of the right-most match, allowing us to trim
// irrelevant qualification.
-//
-// This implementation is experimental, serving as a reference fast algorithm
-// to compare to the fuzzy algorithm implemented by Matcher.
type SymbolMatcher struct {
// Using buffers of length 256 is both a reasonable size for most qualified
// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
@@ -169,19 +166,29 @@
// Score is the average score for each character.
//
// A character score is the multiple of:
- // 1. 1.0 if the character starts a segment, .8 if the character start a
- // mid-segment word, otherwise 0.6. This carries over to immediately
- // following characters.
- // 2. For the final character match, the multiplier from (1) is reduced to
- // .8 if the next character in the input is a mid-segment word, or 0.6 if
- // the next character in the input is not a word or segment start. This
- // ensures that we favor whole-word or whole-segment matches over prefix
- // matches.
- // 3. 1.0 if the character is part of the last segment, otherwise
- // 1.0-.2*<segments from the right>, with a max segment count of 3.
+ // 1. 1.0 if the character starts a segment or is preceded by a matching
+ // character, 0.9 if the character starts a mid-segment word, else 0.6.
//
- // This is a very naive algorithm, but it is fast. There's lots of prior art
- // here, and we should leverage it. For example, we could explicitly consider
+ // Note that characters preceded by a matching character get the max
+ // score of 1.0 so that sequential or exact matches are preferred, even
+ // if they don't start/end at a segment or word boundary. For example, a
+ // match for "func" in intfuncs should have a higher score than in
+ // ifunmatched.
+ //
+ // For the final character match, the multiplier from (1) is reduced to
+ // 0.9 if the next character in the input is a mid-segment word, or 0.6
+ // if the next character in the input is not a word or segment start.
+ // This ensures that we favor whole-word or whole-segment matches over
+ // prefix matches.
+ //
+ // 2. 1.0 if the character is part of the last segment, otherwise
+ // 1.0-0.1*<segments from the right>, with a max segment count of 3.
+ // Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
+ // early in a qualified symbol name) still scores higher than _f_o_o_
+ // (a completely split match).
+ //
+ // This is a naive algorithm, but it is fast. There's lots of prior art here
+ // that could be leveraged. For example, we could explicitly consider
// character distance, and exact matches of words or segments.
//
// Also note that this might not actually find the highest scoring match, as
@@ -192,10 +199,10 @@
p = m.pattern[pi]
const (
- segStreak = 1.0
- wordStreak = 0.8
+ segStreak = 1.0 // start of segment or sequential match
+ wordStreak = 0.9 // start of word match
noStreak = 0.6
- perSegment = 0.2 // we count at most 3 segments above
+ perSegment = 0.1 // we count at most 3 segments above
)
streakBonus := noStreak
@@ -228,6 +235,7 @@
if finalChar {
break
}
+ streakBonus = segStreak // see above: sequential characters get the max score
} else {
streakBonus = noStreak
}
diff --git a/internal/fuzzy/symbol_test.go b/internal/fuzzy/symbol_test.go
index df74bbe..2a9d9b6 100644
--- a/internal/fuzzy/symbol_test.go
+++ b/internal/fuzzy/symbol_test.go
@@ -40,12 +40,12 @@
symbols := []string{
"this.is.better.than.most",
"test.foo.bar",
- "atest",
"thebest",
"test.foo",
"test.foo",
- "tTest",
+ "atest",
"testage",
+ "tTest",
"foo.test",
"test",
}
@@ -60,6 +60,33 @@
}
}
+// Test that we strongly prefer exact matches.
+//
+// In golang/go#60027, we preferred "Runner" for the query "rune" over several
+// results containing the word "rune" exactly. Following this observation,
+// scoring was tweaked to more strongly emphasize sequential characters and
+// exact matches.
+func TestSymbolRanking_Issue60027(t *testing.T) {
+ matcher := NewSymbolMatcher("rune")
+
+ // symbols to match, in ascending order of ranking.
+ symbols := []string{
+ "Runner",
+ "singleRuneParam",
+ "Config.ifsRune",
+ "Parser.rune",
+ }
+ prev := 0.0
+ for _, sym := range symbols {
+ _, score := matcher.Match([]string{sym})
+ t.Logf("Match(%q) = %v", sym, score)
+ if score < prev {
+ t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
+ }
+ prev = score
+ }
+}
+
func TestChunkedMatch(t *testing.T) {
matcher := NewSymbolMatcher("test")