|  | // Copyright 2019 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // Package fuzzy implements a fuzzy matching algorithm. | 
|  | package fuzzy | 
|  |  | 
|  | import ( | 
|  | "bytes" | 
|  | "fmt" | 
|  | ) | 
|  |  | 
|  | const ( | 
|  | // MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs | 
|  | // will be truncated to this size. | 
|  | MaxInputSize = 127 | 
|  | // MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer | 
|  | // inputs are truncated to this size. | 
|  | MaxPatternSize = 63 | 
|  | ) | 
|  |  | 
|  | type scoreVal int | 
|  |  | 
|  | func (s scoreVal) val() int { | 
|  | return int(s) >> 1 | 
|  | } | 
|  |  | 
|  | func (s scoreVal) prevK() int { | 
|  | return int(s) & 1 | 
|  | } | 
|  |  | 
|  | func score(val int, prevK int /*0 or 1*/) scoreVal { | 
|  | return scoreVal(val<<1 + prevK) | 
|  | } | 
|  |  | 
|  | // Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern. | 
|  | // The matcher does not support parallel usage. | 
|  | type Matcher struct { | 
|  | pattern       string | 
|  | patternLower  []byte // lower-case version of the pattern | 
|  | patternShort  []byte // first characters of the pattern | 
|  | caseSensitive bool   // set if the pattern is mix-cased | 
|  |  | 
|  | patternRoles []RuneRole // the role of each character in the pattern | 
|  | roles        []RuneRole // the role of each character in the tested string | 
|  |  | 
|  | scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal | 
|  |  | 
|  | scoreScale float32 | 
|  |  | 
|  | lastCandidateLen     int // in bytes | 
|  | lastCandidateMatched bool | 
|  |  | 
|  | // Reusable buffers to avoid allocating for every candidate. | 
|  | //  - inputBuf stores the concatenated input chunks | 
|  | //  - lowerBuf stores the last candidate in lower-case | 
|  | //  - rolesBuf stores the calculated roles for each rune in the last | 
|  | //    candidate. | 
|  | inputBuf [MaxInputSize]byte | 
|  | lowerBuf [MaxInputSize]byte | 
|  | rolesBuf [MaxInputSize]RuneRole | 
|  | } | 
|  |  | 
|  | func (m *Matcher) String() string { return m.pattern } | 
|  |  | 
|  | func (m *Matcher) bestK(i, j int) int { | 
|  | if m.scores[i][j][0].val() < m.scores[i][j][1].val() { | 
|  | return 1 | 
|  | } | 
|  | return 0 | 
|  | } | 
|  |  | 
|  | // NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern. | 
|  | func NewMatcher(pattern string) *Matcher { | 
|  | if len(pattern) > MaxPatternSize { | 
|  | pattern = pattern[:MaxPatternSize] | 
|  | } | 
|  |  | 
|  | m := &Matcher{ | 
|  | pattern:      pattern, | 
|  | patternLower: toLower([]byte(pattern), nil), | 
|  | } | 
|  |  | 
|  | for i, c := range m.patternLower { | 
|  | if pattern[i] != c { | 
|  | m.caseSensitive = true | 
|  | break | 
|  | } | 
|  | } | 
|  |  | 
|  | if len(pattern) > 3 { | 
|  | m.patternShort = m.patternLower[:3] | 
|  | } else { | 
|  | m.patternShort = m.patternLower | 
|  | } | 
|  |  | 
|  | m.patternRoles = RuneRoles([]byte(pattern), nil) | 
|  |  | 
|  | if len(pattern) > 0 { | 
|  | maxCharScore := 4 | 
|  | m.scoreScale = 1 / float32(maxCharScore*len(pattern)) | 
|  | } | 
|  |  | 
|  | return m | 
|  | } | 
|  |  | 
|  | // Score returns the score returned by matching the candidate to the pattern. | 
|  | // This is not designed for parallel use. Multiple candidates must be scored sequentially. | 
|  | // Returns a score between 0 and 1 (0 - no match, 1 - perfect match). | 
|  | func (m *Matcher) Score(candidate string) float32 { | 
|  | return m.ScoreChunks([]string{candidate}) | 
|  | } | 
|  |  | 
|  | func (m *Matcher) ScoreChunks(chunks []string) float32 { | 
|  | candidate := fromChunks(chunks, m.inputBuf[:]) | 
|  | if len(candidate) > MaxInputSize { | 
|  | candidate = candidate[:MaxInputSize] | 
|  | } | 
|  | lower := toLower(candidate, m.lowerBuf[:]) | 
|  | m.lastCandidateLen = len(candidate) | 
|  |  | 
|  | if len(m.pattern) == 0 { | 
|  | // Empty patterns perfectly match candidates. | 
|  | return 1 | 
|  | } | 
|  |  | 
|  | if m.match(candidate, lower) { | 
|  | sc := m.computeScore(candidate, lower) | 
|  | if sc > minScore/2 && !m.poorMatch() { | 
|  | m.lastCandidateMatched = true | 
|  | if len(m.pattern) == len(candidate) { | 
|  | // Perfect match. | 
|  | return 1 | 
|  | } | 
|  |  | 
|  | if sc < 0 { | 
|  | sc = 0 | 
|  | } | 
|  | normalizedScore := min(float32(sc)*m.scoreScale, 1) | 
|  |  | 
|  | return normalizedScore | 
|  | } | 
|  | } | 
|  |  | 
|  | m.lastCandidateMatched = false | 
|  | return 0 | 
|  | } | 
|  |  | 
|  | const minScore = -10000 | 
|  |  | 
|  | // MatchedRanges returns matches ranges for the last scored string as a flattened array of | 
|  | // [begin, end) byte offset pairs. | 
|  | func (m *Matcher) MatchedRanges() []int { | 
|  | if len(m.pattern) == 0 || !m.lastCandidateMatched { | 
|  | return nil | 
|  | } | 
|  | i, j := m.lastCandidateLen, len(m.pattern) | 
|  | if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 { | 
|  | return nil | 
|  | } | 
|  |  | 
|  | var ret []int | 
|  | k := m.bestK(i, j) | 
|  | for i > 0 { | 
|  | take := (k == 1) | 
|  | k = m.scores[i][j][k].prevK() | 
|  | if take { | 
|  | if len(ret) == 0 || ret[len(ret)-1] != i { | 
|  | ret = append(ret, i) | 
|  | ret = append(ret, i-1) | 
|  | } else { | 
|  | ret[len(ret)-1] = i - 1 | 
|  | } | 
|  | j-- | 
|  | } | 
|  | i-- | 
|  | } | 
|  | // Reverse slice. | 
|  | for i := range len(ret) / 2 { | 
|  | ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i] | 
|  | } | 
|  | return ret | 
|  | } | 
|  |  | 
|  | func (m *Matcher) match(candidate []byte, candidateLower []byte) bool { | 
|  | i, j := 0, 0 | 
|  | for ; i < len(candidateLower) && j < len(m.patternLower); i++ { | 
|  | if candidateLower[i] == m.patternLower[j] { | 
|  | j++ | 
|  | } | 
|  | } | 
|  | if j != len(m.patternLower) { | 
|  | return false | 
|  | } | 
|  |  | 
|  | // The input passes the simple test against pattern, so it is time to classify its characters. | 
|  | // Character roles are used below to find the last segment. | 
|  | m.roles = RuneRoles(candidate, m.rolesBuf[:]) | 
|  |  | 
|  | return true | 
|  | } | 
|  |  | 
|  | func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int { | 
|  | pattLen, candLen := len(m.pattern), len(candidate) | 
|  |  | 
|  | for j := 0; j <= len(m.pattern); j++ { | 
|  | m.scores[0][j][0] = minScore << 1 | 
|  | m.scores[0][j][1] = minScore << 1 | 
|  | } | 
|  | m.scores[0][0][0] = score(0, 0) // Start with 0. | 
|  |  | 
|  | segmentsLeft, lastSegStart := 1, 0 | 
|  | for i := range candLen { | 
|  | if m.roles[i] == RSep { | 
|  | segmentsLeft++ | 
|  | lastSegStart = i + 1 | 
|  | } | 
|  | } | 
|  |  | 
|  | // A per-character bonus for a consecutive match. | 
|  | consecutiveBonus := 2 | 
|  | wordIdx := 0 // Word count within segment. | 
|  | for i := 1; i <= candLen; i++ { | 
|  |  | 
|  | role := m.roles[i-1] | 
|  | isHead := role == RHead | 
|  |  | 
|  | if isHead { | 
|  | wordIdx++ | 
|  | } else if role == RSep && segmentsLeft > 1 { | 
|  | wordIdx = 0 | 
|  | segmentsLeft-- | 
|  | } | 
|  |  | 
|  | var skipPenalty int | 
|  | if i == 1 || (i-1) == lastSegStart { | 
|  | // Skipping the start of first or last segment. | 
|  | skipPenalty++ | 
|  | } | 
|  |  | 
|  | for j := 0; j <= pattLen; j++ { | 
|  | // By default, we don't have a match. Fill in the skip data. | 
|  | m.scores[i][j][1] = minScore << 1 | 
|  |  | 
|  | // Compute the skip score. | 
|  | k := 0 | 
|  | if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() { | 
|  | k = 1 | 
|  | } | 
|  |  | 
|  | skipScore := m.scores[i-1][j][k].val() | 
|  | // Do not penalize missing characters after the last matched segment. | 
|  | if j != pattLen { | 
|  | skipScore -= skipPenalty | 
|  | } | 
|  | m.scores[i][j][0] = score(skipScore, k) | 
|  |  | 
|  | if j == 0 || candidateLower[i-1] != m.patternLower[j-1] { | 
|  | // Not a match. | 
|  | continue | 
|  | } | 
|  | pRole := m.patternRoles[j-1] | 
|  |  | 
|  | if role == RTail && pRole == RHead { | 
|  | if j > 1 { | 
|  | // Not a match: a head in the pattern matches a tail character in the candidate. | 
|  | continue | 
|  | } | 
|  | // Special treatment for the first character of the pattern. We allow | 
|  | // matches in the middle of a word if they are long enough, at least | 
|  | // min(3, pattern.length) characters. | 
|  | if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) { | 
|  | continue | 
|  | } | 
|  | } | 
|  |  | 
|  | // Compute the char score. | 
|  | var charScore int | 
|  | // Bonus: the char is in the candidate's last segment. | 
|  | if segmentsLeft <= 1 { | 
|  | charScore++ | 
|  | } | 
|  |  | 
|  | // Bonus: exact case match between pattern and candidate. | 
|  | if candidate[i-1] == m.pattern[j-1] || | 
|  | // Bonus: candidate char is a head and pattern is all | 
|  | // lowercase. There is no segmentation in an all lowercase | 
|  | // pattern, so assume any char in pattern can be a head. Note | 
|  | // that we are intentionally _not_ giving a bonus to a case | 
|  | // insensitive match when the pattern is case sensitive. | 
|  | role == RHead && !m.caseSensitive { | 
|  | charScore++ | 
|  | } | 
|  |  | 
|  | // Penalty: pattern char is Head, candidate char is Tail. | 
|  | if role == RTail && pRole == RHead { | 
|  | charScore-- | 
|  | } | 
|  | // Penalty: first pattern character matched in the middle of a word. | 
|  | if j == 1 && role == RTail { | 
|  | charScore -= 4 | 
|  | } | 
|  |  | 
|  | // Third dimension encodes whether there is a gap between the previous match and the current | 
|  | // one. | 
|  | for k := range 2 { | 
|  | sc := m.scores[i-1][j-1][k].val() + charScore | 
|  |  | 
|  | isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart | 
|  | if isConsecutive { | 
|  | // Bonus: a consecutive match. First character match also gets a bonus to | 
|  | // ensure prefix final match score normalizes to 1.0. | 
|  | // Logically, this is a part of charScore, but we have to compute it here because it | 
|  | // only applies for consecutive matches (k == 1). | 
|  | sc += consecutiveBonus | 
|  | } | 
|  | if k == 0 { | 
|  | // Penalty: Matching inside a segment (and previous char wasn't matched). Penalize for the lack | 
|  | // of alignment. | 
|  | if role == RTail || role == RUCTail { | 
|  | sc -= 3 | 
|  | } | 
|  | } | 
|  |  | 
|  | if sc > m.scores[i][j][1].val() { | 
|  | m.scores[i][j][1] = score(sc, k) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val() | 
|  |  | 
|  | return result | 
|  | } | 
|  |  | 
|  | // ScoreTable returns the score table computed for the provided candidate. Used only for debugging. | 
|  | func (m *Matcher) ScoreTable(candidate string) string { | 
|  | var buf bytes.Buffer | 
|  |  | 
|  | var line1, line2, separator bytes.Buffer | 
|  | line1.WriteString("\t") | 
|  | line2.WriteString("\t") | 
|  | for j := range len(m.pattern) { | 
|  | line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j])) | 
|  | separator.WriteString("----------------") | 
|  | } | 
|  |  | 
|  | buf.WriteString(line1.String()) | 
|  | buf.WriteString("\n") | 
|  | buf.WriteString(separator.String()) | 
|  | buf.WriteString("\n") | 
|  |  | 
|  | for i := 1; i <= len(candidate); i++ { | 
|  | line1.Reset() | 
|  | line2.Reset() | 
|  |  | 
|  | line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1])) | 
|  | line2.WriteString("\t") | 
|  |  | 
|  | for j := 1; j <= len(m.pattern); j++ { | 
|  | line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK()))) | 
|  | line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK()))) | 
|  | } | 
|  | buf.WriteString(line1.String()) | 
|  | buf.WriteString("\n") | 
|  | buf.WriteString(line2.String()) | 
|  | buf.WriteString("\n") | 
|  | buf.WriteString(separator.String()) | 
|  | buf.WriteString("\n") | 
|  | } | 
|  |  | 
|  | return buf.String() | 
|  | } | 
|  |  | 
|  | func dir(prevK int) rune { | 
|  | if prevK == 0 { | 
|  | return 'M' | 
|  | } | 
|  | return 'H' | 
|  | } | 
|  |  | 
|  | func (m *Matcher) poorMatch() bool { | 
|  | if len(m.pattern) < 2 { | 
|  | return false | 
|  | } | 
|  |  | 
|  | i, j := m.lastCandidateLen, len(m.pattern) | 
|  | k := m.bestK(i, j) | 
|  |  | 
|  | var counter, len int | 
|  | for i > 0 { | 
|  | take := (k == 1) | 
|  | k = m.scores[i][j][k].prevK() | 
|  | if take { | 
|  | len++ | 
|  | if k == 0 && len < 3 && m.roles[i-1] == RTail { | 
|  | // Short match in the middle of a word | 
|  | counter++ | 
|  | if counter > 1 { | 
|  | return true | 
|  | } | 
|  | } | 
|  | j-- | 
|  | } else { | 
|  | len = 0 | 
|  | } | 
|  | i-- | 
|  | } | 
|  | return false | 
|  | } | 
|  |  | 
|  | // BestMatch returns the name most similar to the | 
|  | // pattern, using fuzzy matching, or the empty string. | 
|  | func BestMatch(pattern string, names []string) string { | 
|  | fuzz := NewMatcher(pattern) | 
|  | best := "" | 
|  | highScore := float32(0) // minimum score is 0 (no match) | 
|  | for _, name := range names { | 
|  | // TODO: Improve scoring algorithm. | 
|  | score := fuzz.Score(name) | 
|  | if score > highScore { | 
|  | highScore = score | 
|  | best = name | 
|  | } else if score == 0 { | 
|  | // Order matters in the fuzzy matching algorithm. If we find no match | 
|  | // when matching the target to the identifier, try matching the identifier | 
|  | // to the target. | 
|  | revFuzz := NewMatcher(name) | 
|  | revScore := revFuzz.Score(pattern) | 
|  | if revScore > highScore { | 
|  | highScore = revScore | 
|  | best = name | 
|  | } | 
|  | } | 
|  | } | 
|  | return best | 
|  | } |