blob: c0efd30dd9a32fd30fb2dfaaa4d028a724a5b50e [file] [log] [blame]
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package fuzzy implements a fuzzy matching algorithm.
package fuzzy
import (
"bytes"
"fmt"
)
const (
// MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs
// will be truncated to this size.
MaxInputSize = 127
// MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer
// inputs are truncated to this size.
MaxPatternSize = 63
)
type scoreVal int
func (s scoreVal) val() int {
return int(s) >> 1
}
func (s scoreVal) prevK() int {
return int(s) & 1
}
func score(val int, prevK int /*0 or 1*/) scoreVal {
return scoreVal(val<<1 + prevK)
}
// Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern.
// The matcher does not support parallel usage.
type Matcher struct {
pattern string
patternLower []byte // lower-case version of the pattern
patternShort []byte // first characters of the pattern
caseSensitive bool // set if the pattern is mix-cased
patternRoles []RuneRole // the role of each character in the pattern
roles []RuneRole // the role of each character in the tested string
scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal
scoreScale float32
lastCandidateLen int // in bytes
lastCandidateMatched bool
// Reusable buffers to avoid allocating for every candidate.
// - inputBuf stores the concatenated input chunks
// - lowerBuf stores the last candidate in lower-case
// - rolesBuf stores the calculated roles for each rune in the last
// candidate.
inputBuf [MaxInputSize]byte
lowerBuf [MaxInputSize]byte
rolesBuf [MaxInputSize]RuneRole
}
func (m *Matcher) bestK(i, j int) int {
if m.scores[i][j][0].val() < m.scores[i][j][1].val() {
return 1
}
return 0
}
// NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern.
func NewMatcher(pattern string) *Matcher {
if len(pattern) > MaxPatternSize {
pattern = pattern[:MaxPatternSize]
}
m := &Matcher{
pattern: pattern,
patternLower: toLower([]byte(pattern), nil),
}
for i, c := range m.patternLower {
if pattern[i] != c {
m.caseSensitive = true
break
}
}
if len(pattern) > 3 {
m.patternShort = m.patternLower[:3]
} else {
m.patternShort = m.patternLower
}
m.patternRoles = RuneRoles([]byte(pattern), nil)
if len(pattern) > 0 {
maxCharScore := 4
m.scoreScale = 1 / float32(maxCharScore*len(pattern))
}
return m
}
// Score returns the score returned by matching the candidate to the pattern.
// This is not designed for parallel use. Multiple candidates must be scored sequentially.
// Returns a score between 0 and 1 (0 - no match, 1 - perfect match).
func (m *Matcher) Score(candidate string) float32 {
return m.ScoreChunks([]string{candidate})
}
func (m *Matcher) ScoreChunks(chunks []string) float32 {
candidate := fromChunks(chunks, m.inputBuf[:])
if len(candidate) > MaxInputSize {
candidate = candidate[:MaxInputSize]
}
lower := toLower(candidate, m.lowerBuf[:])
m.lastCandidateLen = len(candidate)
if len(m.pattern) == 0 {
// Empty patterns perfectly match candidates.
return 1
}
if m.match(candidate, lower) {
sc := m.computeScore(candidate, lower)
if sc > minScore/2 && !m.poorMatch() {
m.lastCandidateMatched = true
if len(m.pattern) == len(candidate) {
// Perfect match.
return 1
}
if sc < 0 {
sc = 0
}
normalizedScore := float32(sc) * m.scoreScale
if normalizedScore > 1 {
normalizedScore = 1
}
return normalizedScore
}
}
m.lastCandidateMatched = false
return 0
}
const minScore = -10000
// MatchedRanges returns matches ranges for the last scored string as a flattened array of
// [begin, end) byte offset pairs.
func (m *Matcher) MatchedRanges() []int {
if len(m.pattern) == 0 || !m.lastCandidateMatched {
return nil
}
i, j := m.lastCandidateLen, len(m.pattern)
if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 {
return nil
}
var ret []int
k := m.bestK(i, j)
for i > 0 {
take := (k == 1)
k = m.scores[i][j][k].prevK()
if take {
if len(ret) == 0 || ret[len(ret)-1] != i {
ret = append(ret, i)
ret = append(ret, i-1)
} else {
ret[len(ret)-1] = i - 1
}
j--
}
i--
}
// Reverse slice.
for i := 0; i < len(ret)/2; i++ {
ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i]
}
return ret
}
func (m *Matcher) match(candidate []byte, candidateLower []byte) bool {
i, j := 0, 0
for ; i < len(candidateLower) && j < len(m.patternLower); i++ {
if candidateLower[i] == m.patternLower[j] {
j++
}
}
if j != len(m.patternLower) {
return false
}
// The input passes the simple test against pattern, so it is time to classify its characters.
// Character roles are used below to find the last segment.
m.roles = RuneRoles(candidate, m.rolesBuf[:])
return true
}
func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int {
pattLen, candLen := len(m.pattern), len(candidate)
for j := 0; j <= len(m.pattern); j++ {
m.scores[0][j][0] = minScore << 1
m.scores[0][j][1] = minScore << 1
}
m.scores[0][0][0] = score(0, 0) // Start with 0.
segmentsLeft, lastSegStart := 1, 0
for i := 0; i < candLen; i++ {
if m.roles[i] == RSep {
segmentsLeft++
lastSegStart = i + 1
}
}
// A per-character bonus for a consecutive match.
consecutiveBonus := 2
wordIdx := 0 // Word count within segment.
for i := 1; i <= candLen; i++ {
role := m.roles[i-1]
isHead := role == RHead
if isHead {
wordIdx++
} else if role == RSep && segmentsLeft > 1 {
wordIdx = 0
segmentsLeft--
}
var skipPenalty int
if i == 1 || (i-1) == lastSegStart {
// Skipping the start of first or last segment.
skipPenalty++
}
for j := 0; j <= pattLen; j++ {
// By default, we don't have a match. Fill in the skip data.
m.scores[i][j][1] = minScore << 1
// Compute the skip score.
k := 0
if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() {
k = 1
}
skipScore := m.scores[i-1][j][k].val()
// Do not penalize missing characters after the last matched segment.
if j != pattLen {
skipScore -= skipPenalty
}
m.scores[i][j][0] = score(skipScore, k)
if j == 0 || candidateLower[i-1] != m.patternLower[j-1] {
// Not a match.
continue
}
pRole := m.patternRoles[j-1]
if role == RTail && pRole == RHead {
if j > 1 {
// Not a match: a head in the pattern matches a tail character in the candidate.
continue
}
// Special treatment for the first character of the pattern. We allow
// matches in the middle of a word if they are long enough, at least
// min(3, pattern.length) characters.
if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) {
continue
}
}
// Compute the char score.
var charScore int
// Bonus 1: the char is in the candidate's last segment.
if segmentsLeft <= 1 {
charScore++
}
// Bonus 2: Case match or a Head in the pattern aligns with one in the word.
// Single-case patterns lack segmentation signals and we assume any character
// can be a head of a segment.
if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) {
charScore++
}
// Penalty 1: pattern char is Head, candidate char is Tail.
if role == RTail && pRole == RHead {
charScore--
}
// Penalty 2: first pattern character matched in the middle of a word.
if j == 1 && role == RTail {
charScore -= 4
}
// Third dimension encodes whether there is a gap between the previous match and the current
// one.
for k := 0; k < 2; k++ {
sc := m.scores[i-1][j-1][k].val() + charScore
isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart
if isConsecutive {
// Bonus 3: a consecutive match. First character match also gets a bonus to
// ensure prefix final match score normalizes to 1.0.
// Logically, this is a part of charScore, but we have to compute it here because it
// only applies for consecutive matches (k == 1).
sc += consecutiveBonus
}
if k == 0 {
// Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack
// of alignment.
if role == RTail || role == RUCTail {
sc -= 3
}
}
if sc > m.scores[i][j][1].val() {
m.scores[i][j][1] = score(sc, k)
}
}
}
}
result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val()
return result
}
// ScoreTable returns the score table computed for the provided candidate. Used only for debugging.
func (m *Matcher) ScoreTable(candidate string) string {
var buf bytes.Buffer
var line1, line2, separator bytes.Buffer
line1.WriteString("\t")
line2.WriteString("\t")
for j := 0; j < len(m.pattern); j++ {
line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j]))
separator.WriteString("----------------")
}
buf.WriteString(line1.String())
buf.WriteString("\n")
buf.WriteString(separator.String())
buf.WriteString("\n")
for i := 1; i <= len(candidate); i++ {
line1.Reset()
line2.Reset()
line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1]))
line2.WriteString("\t")
for j := 1; j <= len(m.pattern); j++ {
line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK())))
line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK())))
}
buf.WriteString(line1.String())
buf.WriteString("\n")
buf.WriteString(line2.String())
buf.WriteString("\n")
buf.WriteString(separator.String())
buf.WriteString("\n")
}
return buf.String()
}
func dir(prevK int) rune {
if prevK == 0 {
return 'M'
}
return 'H'
}
func (m *Matcher) poorMatch() bool {
if len(m.pattern) < 2 {
return false
}
i, j := m.lastCandidateLen, len(m.pattern)
k := m.bestK(i, j)
var counter, len int
for i > 0 {
take := (k == 1)
k = m.scores[i][j][k].prevK()
if take {
len++
if k == 0 && len < 3 && m.roles[i-1] == RTail {
// Short match in the middle of a word
counter++
if counter > 1 {
return true
}
}
j--
} else {
len = 0
}
i--
}
return false
}
// BestMatch returns the name most similar to the
// pattern, using fuzzy matching, or the empty string.
func BestMatch(pattern string, names []string) string {
fuzz := NewMatcher(pattern)
best := ""
highScore := float32(0) // minimum score is 0 (no match)
for _, name := range names {
// TODO: Improve scoring algorithm.
score := fuzz.Score(name)
if score > highScore {
highScore = score
best = name
} else if score == 0 {
// Order matters in the fuzzy matching algorithm. If we find no match
// when matching the target to the identifier, try matching the identifier
// to the target.
revFuzz := NewMatcher(name)
revScore := revFuzz.Score(pattern)
if revScore > highScore {
highScore = revScore
best = name
}
}
}
return best
}