| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package norm |
| |
| const MaxSegmentSize = maxByteBufferSize |
| |
| // An Iter iterates over a string or byte slice, while normalizing it |
| // to a given Form. |
| type Iter struct { |
| rb reorderBuffer |
| info Properties // first character saved from previous iteration |
| next iterFunc // implementation of next depends on form |
| |
| p int // current position in input source |
| outStart int // start of current segment in output buffer |
| inStart int // start of current segment in input source |
| maxp int // position in output buffer after which not to start a new segment |
| maxseg int // for tracking an excess of combining characters |
| |
| tccc uint8 |
| done bool |
| } |
| |
| type iterFunc func(*Iter, []byte) int |
| |
| // SetInput initializes i to iterate over src after normalizing it to Form f. |
| func (i *Iter) SetInput(f Form, src []byte) { |
| i.rb.init(f, src) |
| if i.rb.f.composing { |
| i.next = nextComposed |
| } else { |
| i.next = nextDecomposed |
| } |
| i.p = 0 |
| if i.done = len(src) == 0; !i.done { |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| } |
| } |
| |
| // SetInputString initializes i to iterate over src after normalizing it to Form f. |
| func (i *Iter) SetInputString(f Form, src string) { |
| i.rb.initString(f, src) |
| if i.rb.f.composing { |
| i.next = nextComposed |
| } else { |
| i.next = nextDecomposed |
| } |
| i.p = 0 |
| if i.done = len(src) == 0; !i.done { |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| } |
| } |
| |
| // Pos returns the byte position at which the next call to Next will commence processing. |
| func (i *Iter) Pos() int { |
| return i.p |
| } |
| |
| // Done returns true if there is no more input to process. |
| func (i *Iter) Done() bool { |
| return i.done |
| } |
| |
| // Next writes f(i.input[i.Pos():n]...) to buffer buf, where n is the |
| // largest boundary of i.input such that the result fits in buf. |
| // It returns the number of bytes written to buf. |
| // len(buf) should be at least MaxSegmentSize. |
| // Done must be false before calling Next. |
| func (i *Iter) Next(buf []byte) int { |
| return i.next(i, buf) |
| } |
| |
| func (i *Iter) initNext(outn, inStart int) { |
| i.outStart = 0 |
| i.inStart = inStart |
| i.maxp = outn - MaxSegmentSize |
| i.maxseg = MaxSegmentSize |
| } |
| |
| // setStart resets the start of the new segment to the given position. |
| // It returns true if there is not enough room for the new segment. |
| func (i *Iter) setStart(outp, inp int) bool { |
| if outp > i.maxp { |
| return true |
| } |
| i.outStart = outp |
| i.inStart = inp |
| i.maxseg = outp + MaxSegmentSize |
| return false |
| } |
| |
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
| |
| // nextDecomposed is the implementation of Next for forms NFD and NFKD. |
| func nextDecomposed(i *Iter, out []byte) int { |
| var outp int |
| i.initNext(len(out), i.p) |
| doFast: |
| inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart |
| for { |
| if sz := int(i.info.size); sz <= 1 { |
| // ASCII or illegal byte. Either way, advance by 1. |
| i.p++ |
| outp++ |
| max := min(i.rb.nsrc, len(out)-outp+i.p) |
| if np := i.rb.src.skipASCII(i.p, max); np > i.p { |
| outp += np - i.p |
| i.p = np |
| if i.p >= i.rb.nsrc { |
| break |
| } |
| // ASCII may combine with consecutive runes. |
| if i.setStart(outp-1, i.p-1) { |
| i.p-- |
| outp-- |
| i.info.size = 1 |
| break |
| } |
| } |
| } else if d := i.info.Decomposition(); d != nil { |
| i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) |
| p := outp + len(d) |
| if p > i.maxseg && i.setStart(outp, i.p) { |
| return outp |
| } |
| copy(out[outp:], d) |
| outp = p |
| i.p += sz |
| inCopyStart, outCopyStart = i.p, outp |
| } else if r := i.rb.src.hangul(i.p); r != 0 { |
| i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) |
| for { |
| outp += decomposeHangul(out[outp:], r) |
| i.p += hangulUTF8Size |
| if r = i.rb.src.hangul(i.p); r == 0 { |
| break |
| } |
| if i.setStart(outp, i.p) { |
| return outp |
| } |
| } |
| inCopyStart, outCopyStart = i.p, outp |
| } else { |
| p := outp + sz |
| if p > i.maxseg && i.setStart(outp, i.p) { |
| break |
| } |
| outp = p |
| i.p += sz |
| } |
| if i.p >= i.rb.nsrc { |
| break |
| } |
| prevCC := i.info.tccc |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if cc := i.info.ccc; cc == 0 { |
| if i.setStart(outp, i.p) { |
| break |
| } |
| } else if cc < prevCC { |
| goto doNorm |
| } |
| } |
| if inCopyStart != i.p { |
| i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) |
| } |
| i.done = i.p >= i.rb.nsrc |
| return outp |
| doNorm: |
| // Insert what we have decomposed so far in the reorderBuffer. |
| // As we will only reorder, there will always be enough room. |
| i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) |
| if !i.rb.insertDecomposed(out[i.outStart:outp]) { |
| // Start over to prevent decompositions from crossing segment boundaries. |
| // This is a rare occurance. |
| i.p = i.inStart |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| } |
| outp = i.outStart |
| for { |
| if !i.rb.insert(i.rb.src, i.p, i.info) { |
| break |
| } |
| if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
| outp += i.rb.flushCopy(out[outp:]) |
| i.done = true |
| return outp |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if i.info.ccc == 0 { |
| break |
| } |
| } |
| // new segment or too many combining characters: exit normalization |
| if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) { |
| return outp |
| } |
| goto doFast |
| } |
| |
| // nextComposed is the implementation of Next for forms NFC and NFKC. |
| func nextComposed(i *Iter, out []byte) int { |
| var outp int |
| i.initNext(len(out), i.p) |
| doFast: |
| inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart |
| var prevCC uint8 |
| for { |
| if !i.info.isYesC() { |
| goto doNorm |
| } |
| if cc := i.info.ccc; cc == 0 { |
| if i.setStart(outp, i.p) { |
| break |
| } |
| } else if cc < prevCC { |
| goto doNorm |
| } |
| prevCC = i.info.tccc |
| sz := int(i.info.size) |
| if sz == 0 { |
| sz = 1 // illegal rune: copy byte-by-byte |
| } |
| p := outp + sz |
| if p > i.maxseg && i.setStart(outp, i.p) { |
| break |
| } |
| outp = p |
| i.p += sz |
| max := min(i.rb.nsrc, len(out)-outp+i.p) |
| if np := i.rb.src.skipASCII(i.p, max); np > i.p { |
| outp += np - i.p |
| i.p = np |
| if i.p >= i.rb.nsrc { |
| break |
| } |
| // ASCII may combine with consecutive runes. |
| if i.setStart(outp-1, i.p-1) { |
| i.p-- |
| outp-- |
| i.info = Properties{size: 1} |
| break |
| } |
| } |
| if i.p >= i.rb.nsrc { |
| break |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| } |
| if inCopyStart != i.p { |
| i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p) |
| } |
| i.done = i.p >= i.rb.nsrc |
| return outp |
| doNorm: |
| i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.inStart) |
| outp, i.p = i.outStart, i.inStart |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| for { |
| if !i.rb.insert(i.rb.src, i.p, i.info) { |
| break |
| } |
| if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
| i.rb.compose() |
| outp += i.rb.flushCopy(out[outp:]) |
| i.done = true |
| return outp |
| } |
| i.info = i.rb.f.info(i.rb.src, i.p) |
| if i.info.BoundaryBefore() { |
| break |
| } |
| } |
| i.rb.compose() |
| if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) { |
| return outp |
| } |
| goto doFast |
| } |