blob: c1038163f1a929cd19efd25915e522bad03d0964 [file] [log] [blame]
Rebecca Stambler22149862019-07-01 17:08:29 -04001// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package fuzzy
6
7import (
8 "unicode"
9)
10
Rebecca Stambler22149862019-07-01 17:08:29 -040011// RuneRole specifies the role of a rune in the context of an input.
12type RuneRole byte
13
14const (
15 // RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII).
16 RNone RuneRole = iota
17 // RSep specifies a rune with the role of segment separator.
18 RSep
19 // RTail specifies a rune which is a lower-case tail in a word in the input.
20 RTail
21 // RUCTail specifies a rune which is an upper-case tail in a word in the input.
22 RUCTail
23 // RHead specifies a rune which is the first character in a word in the input.
24 RHead
25)
26
27// RuneRoles detects the roles of each byte rune in an input string and stores it in the output
28// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
29// or when it filled the output. If output is nil, then it gets created.
Rob Findley15eebf72021-07-30 15:11:53 -040030func RuneRoles(candidate []byte, reuse []RuneRole) []RuneRole {
Rebecca Stambler22149862019-07-01 17:08:29 -040031 var output []RuneRole
Rob Findley15eebf72021-07-30 15:11:53 -040032 if cap(reuse) < len(candidate) {
33 output = make([]RuneRole, 0, len(candidate))
Rebecca Stambler22149862019-07-01 17:08:29 -040034 } else {
35 output = reuse[:0]
36 }
37
38 prev, prev2 := rtNone, rtNone
Rob Findley15eebf72021-07-30 15:11:53 -040039 for i := 0; i < len(candidate); i++ {
40 r := rune(candidate[i])
Rebecca Stambler22149862019-07-01 17:08:29 -040041
42 role := RNone
43
44 curr := rtLower
Rob Findley15eebf72021-07-30 15:11:53 -040045 if candidate[i] <= unicode.MaxASCII {
46 curr = runeType(rt[candidate[i]] - '0')
Rebecca Stambler22149862019-07-01 17:08:29 -040047 }
48
49 if curr == rtLower {
50 if prev == rtNone || prev == rtPunct {
51 role = RHead
52 } else {
53 role = RTail
54 }
55 } else if curr == rtUpper {
56 role = RHead
57
58 if prev == rtUpper {
59 // This and previous characters are both upper case.
60
Rob Findley15eebf72021-07-30 15:11:53 -040061 if i+1 == len(candidate) {
Rebecca Stambler22149862019-07-01 17:08:29 -040062 // This is last character, previous was also uppercase -> this is UCTail
63 // i.e., (current char is C): aBC / BC / ABC
64 role = RUCTail
65 }
66 }
67 } else if curr == rtPunct {
Muir Mandersa12cc762019-10-21 21:07:21 -070068 switch r {
69 case '.', ':':
Rebecca Stambler22149862019-07-01 17:08:29 -040070 role = RSep
71 }
72 }
73 if curr != rtLower {
74 if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
75 // The previous two characters were uppercase. The current one is not a lower case, so the
76 // previous one can't be a HEAD. Make it a UCTail.
77 // i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
78 output[i-1] = RUCTail
79 }
80 }
81
82 output = append(output, role)
83 prev2 = prev
84 prev = curr
85 }
86 return output
87}
88
89type runeType byte
90
91const (
92 rtNone runeType = iota
93 rtPunct
94 rtLower
95 rtUpper
96)
97
98const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
99
100// LastSegment returns the substring representing the last segment from the input, where each
101// byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
102// or Filename type.
103func LastSegment(input string, roles []RuneRole) string {
104 // Exclude ending separators.
105 end := len(input) - 1
106 for end >= 0 && roles[end] == RSep {
107 end--
108 }
109 if end < 0 {
110 return ""
111 }
112
113 start := end - 1
114 for start >= 0 && roles[start] != RSep {
115 start--
116 }
117
118 return input[start+1 : end+1]
119}
120
Rob Findley15eebf72021-07-30 15:11:53 -0400121// fromChunks copies string chunks into the given buffer.
122func fromChunks(chunks []string, buffer []byte) []byte {
123 ii := 0
124 for _, chunk := range chunks {
125 for i := 0; i < len(chunk); i++ {
126 if ii >= cap(buffer) {
127 break
128 }
129 buffer[ii] = chunk[i]
130 ii++
131 }
132 }
133 return buffer[:ii]
134}
135
136// toLower transforms the input string to lower case, which is stored in the output byte slice.
Rebecca Stambler22149862019-07-01 17:08:29 -0400137// The lower casing considers only ASCII values - non ASCII values are left unmodified.
138// Stops when parsed all input or when it filled the output slice. If output is nil, then it gets
139// created.
Rob Findley15eebf72021-07-30 15:11:53 -0400140func toLower(input []byte, reuse []byte) []byte {
Rebecca Stambler22149862019-07-01 17:08:29 -0400141 output := reuse
142 if cap(reuse) < len(input) {
143 output = make([]byte, len(input))
144 }
145
146 for i := 0; i < len(input); i++ {
147 r := rune(input[i])
Rob Findley15eebf72021-07-30 15:11:53 -0400148 if input[i] <= unicode.MaxASCII {
Rebecca Stambler22149862019-07-01 17:08:29 -0400149 if 'A' <= r && r <= 'Z' {
150 r += 'a' - 'A'
151 }
152 }
153 output[i] = byte(r)
154 }
155 return output[:len(input)]
156}
157
158// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
159// (start is inclusive, end is exclusive).
160type WordConsumer func(start, end int)
161
162// Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset
163// delimiters for each word are fed to the provided consumer function.
164func Words(roles []RuneRole, consume WordConsumer) {
165 var wordStart int
166 for i, r := range roles {
167 switch r {
168 case RUCTail, RTail:
169 case RHead, RNone, RSep:
170 if i != wordStart {
171 consume(wordStart, i)
172 }
173 wordStart = i
174 if r != RHead {
175 // Skip this character.
176 wordStart = i + 1
177 }
178 }
179 }
180 if wordStart != len(roles) {
181 consume(wordStart, len(roles))
182 }
183}