| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package colltab |
| |
| import "unicode/utf8" |
| |
// ContractTrieSet holds the entries of a set of contraction tries in a
// single flat slice. For a description of ContractTrieSet, see
// text/collate/build/contract.go.
//
// As used by the scanners in this file, an entry is interpreted as follows:
// L is the byte to match (or the lower bound of a byte range), H is either
// the upper bound of the range (when N is final) or the offset, counted from
// the end of the current state, to the next state (otherwise), N is the
// number of entries in the next state or final, and I is the result index
// or noIndex.
type ContractTrieSet []struct {
	L uint8
	H uint8
	N uint8
	I uint8
}
| |
| // ctScanner is used to match a trie to an input sequence. |
| // A contraction may match a non-contiguous sequence of bytes in an input string. |
| // For example, if there is a contraction for <a, combining_ring>, it should match |
| // the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does |
| // not block combining_ring. |
| // ctScanner does not automatically skip over non-blocking non-starters, but rather |
| // retains the state of the last match and leaves it up to the user to continue |
| // the match at the appropriate points. |
| type ctScanner struct { |
| states ContractTrieSet |
| s []byte |
| n int |
| index int |
| pindex int |
| done bool |
| } |
| |
| type ctScannerString struct { |
| states ContractTrieSet |
| s string |
| n int |
| index int |
| pindex int |
| done bool |
| } |
| |
| func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner { |
| return ctScanner{s: b, states: t[index:], n: n} |
| } |
| |
| func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString { |
| return ctScannerString{s: str, states: t[index:], n: n} |
| } |
| |
| // result returns the offset i and bytes consumed p so far. If no suffix |
| // matched, i and p will be 0. |
| func (s *ctScanner) result() (i, p int) { |
| return s.index, s.pindex |
| } |
| |
| func (s *ctScannerString) result() (i, p int) { |
| return s.index, s.pindex |
| } |
| |
// final is the value of an entry's N field when no further state follows
// the entry, so that matching it completes the scan.
const final = 0

// noIndex is the value of an entry's I field when matching the entry does
// not by itself record a result.
const noIndex = 0xFF
| |
| // scan matches the longest suffix at the current location in the input |
| // and returns the number of bytes consumed. |
| func (s *ctScanner) scan(p int) int { |
| pr := p // the p at the rune start |
| str := s.s |
| states, n := s.states, s.n |
| for i := 0; i < n && p < len(str); { |
| e := states[i] |
| c := str[p] |
| // TODO: a significant number of contractions are of a form that |
| // cannot match discontiguous UTF-8 in a normalized string. We could let |
| // a negative value of e.n mean that we can set s.done = true and avoid |
| // the need for additional matches. |
| if c >= e.L { |
| if e.L == c { |
| p++ |
| if e.I != noIndex { |
| s.index = int(e.I) |
| s.pindex = p |
| } |
| if e.N != final { |
| i, states, n = 0, states[int(e.H)+n:], int(e.N) |
| if p >= len(str) || utf8.RuneStart(str[p]) { |
| s.states, s.n, pr = states, n, p |
| } |
| } else { |
| s.done = true |
| return p |
| } |
| continue |
| } else if e.N == final && c <= e.H { |
| p++ |
| s.done = true |
| s.index = int(c-e.L) + int(e.I) |
| s.pindex = p |
| return p |
| } |
| } |
| i++ |
| } |
| return pr |
| } |
| |
| // scan is a verbatim copy of ctScanner.scan. |
| func (s *ctScannerString) scan(p int) int { |
| pr := p // the p at the rune start |
| str := s.s |
| states, n := s.states, s.n |
| for i := 0; i < n && p < len(str); { |
| e := states[i] |
| c := str[p] |
| // TODO: a significant number of contractions are of a form that |
| // cannot match discontiguous UTF-8 in a normalized string. We could let |
| // a negative value of e.n mean that we can set s.done = true and avoid |
| // the need for additional matches. |
| if c >= e.L { |
| if e.L == c { |
| p++ |
| if e.I != noIndex { |
| s.index = int(e.I) |
| s.pindex = p |
| } |
| if e.N != final { |
| i, states, n = 0, states[int(e.H)+n:], int(e.N) |
| if p >= len(str) || utf8.RuneStart(str[p]) { |
| s.states, s.n, pr = states, n, p |
| } |
| } else { |
| s.done = true |
| return p |
| } |
| continue |
| } else if e.N == final && c <= e.H { |
| p++ |
| s.done = true |
| s.index = int(c-e.L) + int(e.I) |
| s.pindex = p |
| return p |
| } |
| } |
| i++ |
| } |
| return pr |
| } |