| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package strings |
| |
| // stringFinder efficiently finds strings in a source text. It's implemented |
| // using the Boyer-Moore string search algorithm: |
| // http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm |
| // http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged |
| // document uses 1-based indexing) |
| type stringFinder struct { |
| // pattern is the string that we are searching for in the text. |
| pattern string |
| |
| // badCharSkip[b] contains the distance between the last byte of pattern |
| // and the rightmost occurrence of b in pattern. If b is not in pattern, |
| // badCharSkip[b] is len(pattern). |
| // |
| // Whenever a mismatch is found with byte b in the text, we can safely |
| // shift the matching frame at least badCharSkip[b] until the next time |
| // the matching char could be in alignment. |
| badCharSkip [256]int |
| |
| // goodSuffixSkip[i] defines how far we can shift the matching frame given |
| // that the suffix pattern[i+1:] matches, but the byte pattern[i] does |
| // not. There are two cases to consider: |
| // |
| // 1. The matched suffix occurs elsewhere in pattern (with a different |
| // byte preceding it that we might possibly match). In this case, we can |
| // shift the matching frame to align with the next suffix chunk. For |
| // example, the pattern "mississi" has the suffix "issi" next occurring |
| // (in right-to-left order) at index 1, so goodSuffixSkip[3] == |
| // shift+len(suffix) == 3+4 == 7. |
| // |
| // 2. If the matched suffix does not occur elsewhere in pattern, then the |
| // matching frame may share part of its prefix with the end of the |
| // matching suffix. In this case, goodSuffixSkip[i] will contain how far |
| // to shift the frame to align this portion of the prefix to the |
| // suffix. For example, in the pattern "abcxxxabc", when the first |
| // mismatch from the back is found to be in position 3, the matching |
| // suffix "xxabc" is not found elsewhere in the pattern. However, its |
| // rightmost "abc" (at position 6) is a prefix of the whole pattern, so |
| // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. |
| goodSuffixSkip []int |
| } |
| |
| func makeStringFinder(pattern string) *stringFinder { |
| f := &stringFinder{ |
| pattern: pattern, |
| goodSuffixSkip: make([]int, len(pattern)), |
| } |
| // last is the index of the last character in the pattern. |
| last := len(pattern) - 1 |
| |
| // Build bad character table. |
| // Bytes not in the pattern can skip one pattern's length. |
| for i := range f.badCharSkip { |
| f.badCharSkip[i] = len(pattern) |
| } |
| // The loop condition is < instead of <= so that the last byte does not |
| // have a zero distance to itself. Finding this byte out of place implies |
| // that it is not in the last position. |
| for i := 0; i < last; i++ { |
| f.badCharSkip[pattern[i]] = last - i |
| } |
| |
| // Build good suffix table. |
| // First pass: set each value to the next index which starts a prefix of |
| // pattern. |
| lastPrefix := last |
| for i := last; i >= 0; i-- { |
| if HasPrefix(pattern, pattern[i+1:]) { |
| lastPrefix = i + 1 |
| } |
| // lastPrefix is the shift, and (last-i) is len(suffix). |
| f.goodSuffixSkip[i] = lastPrefix + last - i |
| } |
| // Second pass: find repeats of pattern's suffix starting from the front. |
| for i := 0; i < last; i++ { |
| lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) |
| if pattern[i-lenSuffix] != pattern[last-lenSuffix] { |
| // (last-i) is the shift, and lenSuffix is len(suffix). |
| f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i |
| } |
| } |
| |
| return f |
| } |
| |
| func longestCommonSuffix(a, b string) (i int) { |
| for ; i < len(a) && i < len(b); i++ { |
| if a[len(a)-1-i] != b[len(b)-1-i] { |
| break |
| } |
| } |
| return |
| } |
| |
| // next returns the index in text of the first occurrence of the pattern. If |
| // the pattern is not found, it returns -1. |
| func (f *stringFinder) next(text string) int { |
| i := len(f.pattern) - 1 |
| for i < len(text) { |
| // Compare backwards from the end until the first unmatching character. |
| j := len(f.pattern) - 1 |
| for j >= 0 && text[i] == f.pattern[j] { |
| i-- |
| j-- |
| } |
| if j < 0 { |
| return i + 1 // match |
| } |
| i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) |
| } |
| return -1 |
| } |
| |
| func max(a, b int) int { |
| if a > b { |
| return a |
| } |
| return b |
| } |