| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package norm |
| |
| import ( |
| "fmt" |
| "unicode/utf8" |
| ) |
| |
| // MaxSegmentSize is the maximum size of a byte buffer needed to consider any |
| // sequence of starter and non-starter runes for the purpose of normalization. |
| const MaxSegmentSize = maxByteBufferSize |
| |
| // An Iter iterates over a string or byte slice, while normalizing it |
| // to a given Form. |
| type Iter struct { |
| rb reorderBuffer |
| buf [maxByteBufferSize]byte |
| info Properties // first character saved from previous iteration |
| next iterFunc // implementation of next depends on form |
| asciiF iterFunc |
| |
| p int // current position in input source |
| multiSeg []byte // remainder of multi-segment decomposition |
| } |
| |
| type iterFunc func(*Iter) []byte |
| |
| // Init initializes i to iterate over src after normalizing it to Form f. |
| func (i *Iter) Init(f Form, src []byte) { |
| i.p = 0 |
| if len(src) == 0 { |
| i.setDone() |
| i.rb.nsrc = 0 |
| return |
| } |
| i.multiSeg = nil |
| i.rb.init(f, src) |
| i.next = i.rb.f.nextMain |
| i.asciiF = nextASCIIBytes |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.rb.ss.first(i.info) |
| } |
| |
| // InitString initializes i to iterate over src after normalizing it to Form f. |
| func (i *Iter) InitString(f Form, src string) { |
| i.p = 0 |
| if len(src) == 0 { |
| i.setDone() |
| i.rb.nsrc = 0 |
| return |
| } |
| i.multiSeg = nil |
| i.rb.initString(f, src) |
| i.next = i.rb.f.nextMain |
| i.asciiF = nextASCIIString |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.rb.ss.first(i.info) |
| } |
| |
| // Seek sets the segment to be returned by the next call to Next to start |
| // at position p. It is the responsibility of the caller to set p to the |
| // start of a segment. |
| func (i *Iter) Seek(offset int64, whence int) (int64, error) { |
| var abs int64 |
| switch whence { |
| case 0: |
| abs = offset |
| case 1: |
| abs = int64(i.p) + offset |
| case 2: |
| abs = int64(i.rb.nsrc) + offset |
| default: |
| return 0, fmt.Errorf("norm: invalid whence") |
| } |
| if abs < 0 { |
| return 0, fmt.Errorf("norm: negative position") |
| } |
| if int(abs) >= i.rb.nsrc { |
| i.setDone() |
| return int64(i.p), nil |
| } |
| i.p = int(abs) |
| i.multiSeg = nil |
| i.next = i.rb.f.nextMain |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.rb.ss.first(i.info) |
| return abs, nil |
| } |
| |
| // returnSlice returns a slice of the underlying input type as a byte slice. |
| // If the underlying is of type []byte, it will simply return a slice. |
| // If the underlying is of type string, it will copy the slice to the buffer |
| // and return that. |
| func (i *Iter) returnSlice(a, b int) []byte { |
| if i.rb.src.bytes == nil { |
| return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] |
| } |
| return i.rb.src.bytes[a:b] |
| } |
| |
| // Pos returns the byte position at which the next call to Next will commence processing. |
| func (i *Iter) Pos() int { |
| return i.p |
| } |
| |
| func (i *Iter) setDone() { |
| i.next = nextDone |
| i.p = i.rb.nsrc |
| } |
| |
| // Done returns true if there is no more input to process. |
| func (i *Iter) Done() bool { |
| return i.p >= i.rb.nsrc |
| } |
| |
| // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. |
| // For any input a and b for which f(a) == f(b), subsequent calls |
| // to Next will return the same segments. |
| // Modifying runes are grouped together with the preceding starter, if such a starter exists. |
| // Although not guaranteed, n will typically be the smallest possible n. |
| func (i *Iter) Next() []byte { |
| return i.next(i) |
| } |
| |
| func nextASCIIBytes(i *Iter) []byte { |
| p := i.p + 1 |
| if p >= i.rb.nsrc { |
| p0 := i.p |
| i.setDone() |
| return i.rb.src.bytes[p0:p] |
| } |
| if i.rb.src.bytes[p] < utf8.RuneSelf { |
| p0 := i.p |
| i.p = p |
| return i.rb.src.bytes[p0:p] |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.next = i.rb.f.nextMain |
| return i.next(i) |
| } |
| |
| func nextASCIIString(i *Iter) []byte { |
| p := i.p + 1 |
| if p >= i.rb.nsrc { |
| i.buf[0] = i.rb.src.str[i.p] |
| i.setDone() |
| return i.buf[:1] |
| } |
| if i.rb.src.str[p] < utf8.RuneSelf { |
| i.buf[0] = i.rb.src.str[i.p] |
| i.p = p |
| return i.buf[:1] |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.next = i.rb.f.nextMain |
| return i.next(i) |
| } |
| |
| func nextHangul(i *Iter) []byte { |
| p := i.p |
| next := p + hangulUTF8Size |
| if next >= i.rb.nsrc { |
| i.setDone() |
| } else if i.rb.src.hangul(next) == 0 { |
| i.rb.ss.next(i.info) |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.next = i.rb.f.nextMain |
| return i.next(i) |
| } |
| i.p = next |
| return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] |
| } |
| |
| func nextDone(i *Iter) []byte { |
| return nil |
| } |
| |
| // nextMulti is used for iterating over multi-segment decompositions |
| // for decomposing normal forms. |
| func nextMulti(i *Iter) []byte { |
| j := 0 |
| d := i.multiSeg |
| // skip first rune |
| for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { |
| } |
| for j < len(d) { |
| info := i.rb.f.info(input{bytes: d}, j) |
| if info.BoundaryBefore() { |
| i.multiSeg = d[j:] |
| return d[:j] |
| } |
| j += int(info.size) |
| } |
| // treat last segment as normal decomposition |
| i.next = i.rb.f.nextMain |
| return i.next(i) |
| } |
| |
| // nextMultiNorm is used for iterating over multi-segment decompositions |
| // for composing normal forms. |
| func nextMultiNorm(i *Iter) []byte { |
| j := 0 |
| d := i.multiSeg |
| for j < len(d) { |
| info := i.rb.f.info(input{bytes: d}, j) |
| if info.BoundaryBefore() { |
| i.rb.compose() |
| seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
| i.rb.insertUnsafe(input{bytes: d}, j, info) |
| i.multiSeg = d[j+int(info.size):] |
| return seg |
| } |
| i.rb.insertUnsafe(input{bytes: d}, j, info) |
| j += int(info.size) |
| } |
| i.multiSeg = nil |
| i.next = nextComposed |
| return doNormComposed(i) |
| } |
| |
| // nextDecomposed is the implementation of Next for forms NFD and NFKD. |
| func nextDecomposed(i *Iter) (next []byte) { |
| outp := 0 |
| inCopyStart, outCopyStart := i.p, 0 |
| for { |
| if sz := int(i.info.size); sz <= 1 { |
| i.rb.ss = 0 |
| p := i.p |
| i.p++ // ASCII or illegal byte. Either way, advance by 1. |
| if i.p >= i.rb.nsrc { |
| i.setDone() |
| return i.returnSlice(p, i.p) |
| } else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
| i.next = i.asciiF |
| return i.returnSlice(p, i.p) |
| } |
| outp++ |
| } else if d := i.info.Decomposition(); d != nil { |
| // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. |
| // Case 1: there is a leftover to copy. In this case the decomposition |
| // must begin with a modifier and should always be appended. |
| // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. |
| p := outp + len(d) |
| if outp > 0 { |
| i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
| // TODO: this condition should not be possible, but we leave it |
| // in for defensive purposes. |
| if p > len(i.buf) { |
| return i.buf[:outp] |
| } |
| } else if i.info.multiSegment() { |
| // outp must be 0 as multi-segment decompositions always |
| // start a new segment. |
| if i.multiSeg == nil { |
| i.multiSeg = d |
| i.next = nextMulti |
| return nextMulti(i) |
| } |
| // We are in the last segment. Treat as normal decomposition. |
| d = i.multiSeg |
| i.multiSeg = nil |
| p = len(d) |
| } |
| prevCC := i.info.tccc |
| if i.p += sz; i.p >= i.rb.nsrc { |
| i.setDone() |
| i.info = Properties{} // Force BoundaryBefore to succeed. |
| } else { |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| } |
| switch i.rb.ss.next(i.info) { |
| case ssOverflow: |
| i.next = nextCGJDecompose |
| fallthrough |
| case ssStarter: |
| if outp > 0 { |
| copy(i.buf[outp:], d) |
| return i.buf[:p] |
| } |
| return d |
| } |
| copy(i.buf[outp:], d) |
| outp = p |
| inCopyStart, outCopyStart = i.p, outp |
| if i.info.ccc < prevCC { |
| goto doNorm |
| } |
| continue |
| } else if r := i.rb.src.hangul(i.p); r != 0 { |
| outp = decomposeHangul(i.buf[:], r) |
| i.p += hangulUTF8Size |
| inCopyStart, outCopyStart = i.p, outp |
| if i.p >= i.rb.nsrc { |
| i.setDone() |
| break |
| } else if i.rb.src.hangul(i.p) != 0 { |
| i.next = nextHangul |
| return i.buf[:outp] |
| } |
| } else { |
| p := outp + sz |
| if p > len(i.buf) { |
| break |
| } |
| outp = p |
| i.p += sz |
| } |
| if i.p >= i.rb.nsrc { |
| i.setDone() |
| break |
| } |
| prevCC := i.info.tccc |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if v := i.rb.ss.next(i.info); v == ssStarter { |
| break |
| } else if v == ssOverflow { |
| i.next = nextCGJDecompose |
| break |
| } |
| if i.info.ccc < prevCC { |
| goto doNorm |
| } |
| } |
| if outCopyStart == 0 { |
| return i.returnSlice(inCopyStart, i.p) |
| } else if inCopyStart < i.p { |
| i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
| } |
| return i.buf[:outp] |
| doNorm: |
| // Insert what we have decomposed so far in the reorderBuffer. |
| // As we will only reorder, there will always be enough room. |
| i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
| i.rb.insertDecomposed(i.buf[0:outp]) |
| return doNormDecomposed(i) |
| } |
| |
| func doNormDecomposed(i *Iter) []byte { |
| for { |
| i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
| i.setDone() |
| break |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if i.info.ccc == 0 { |
| break |
| } |
| if s := i.rb.ss.next(i.info); s == ssOverflow { |
| i.next = nextCGJDecompose |
| break |
| } |
| } |
| // new segment or too many combining characters: exit normalization |
| return i.buf[:i.rb.flushCopy(i.buf[:])] |
| } |
| |
| func nextCGJDecompose(i *Iter) []byte { |
| i.rb.ss = 0 |
| i.rb.insertCGJ() |
| i.next = nextDecomposed |
| i.rb.ss.first(i.info) |
| buf := doNormDecomposed(i) |
| return buf |
| } |
| |
| // nextComposed is the implementation of Next for forms NFC and NFKC. |
| func nextComposed(i *Iter) []byte { |
| outp, startp := 0, i.p |
| var prevCC uint8 |
| for { |
| if !i.info.isYesC() { |
| goto doNorm |
| } |
| prevCC = i.info.tccc |
| sz := int(i.info.size) |
| if sz == 0 { |
| sz = 1 // illegal rune: copy byte-by-byte |
| } |
| p := outp + sz |
| if p > len(i.buf) { |
| break |
| } |
| outp = p |
| i.p += sz |
| if i.p >= i.rb.nsrc { |
| i.setDone() |
| break |
| } else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
| i.rb.ss = 0 |
| i.next = i.asciiF |
| break |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if v := i.rb.ss.next(i.info); v == ssStarter { |
| break |
| } else if v == ssOverflow { |
| i.next = nextCGJCompose |
| break |
| } |
| if i.info.ccc < prevCC { |
| goto doNorm |
| } |
| } |
| return i.returnSlice(startp, i.p) |
| doNorm: |
| // reset to start position |
| i.p = startp |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| i.rb.ss.first(i.info) |
| if i.info.multiSegment() { |
| d := i.info.Decomposition() |
| info := i.rb.f.info(input{bytes: d}, 0) |
| i.rb.insertUnsafe(input{bytes: d}, 0, info) |
| i.multiSeg = d[int(info.size):] |
| i.next = nextMultiNorm |
| return nextMultiNorm(i) |
| } |
| i.rb.ss.first(i.info) |
| i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| return doNormComposed(i) |
| } |
| |
| func doNormComposed(i *Iter) []byte { |
| // First rune should already be inserted. |
| for { |
| if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
| i.setDone() |
| break |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if s := i.rb.ss.next(i.info); s == ssStarter { |
| break |
| } else if s == ssOverflow { |
| i.next = nextCGJCompose |
| break |
| } |
| i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| } |
| i.rb.compose() |
| seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
| return seg |
| } |
| |
| func nextCGJCompose(i *Iter) []byte { |
| i.rb.ss = 0 // instead of first |
| i.rb.insertCGJ() |
| i.next = nextComposed |
| // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, |
| // even if they are not. This is particularly dubious for U+FF9E and UFF9A. |
| // If we ever change that, insert a check here. |
| i.rb.ss.first(i.info) |
| i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
| return doNormComposed(i) |
| } |