| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package collate contains types for comparing and sorting Unicode strings |
| // according to a given collation order. Package locale provides a high-level |
| // interface to collation. Users should typically use that package instead. |
| package collate // import "golang.org/x/text/collate" |
| |
| //go:generate go run maketables.go -output tables.go |
| |
| import ( |
| "bytes" |
| "strings" |
| |
| "golang.org/x/text/collate/colltab" |
| "golang.org/x/text/language" |
| ) |
| |
| // Collator provides functionality for comparing strings for a given |
| // collation order. |
| type Collator struct { |
| options |
| |
| sorter sorter |
| |
| _iter [2]iter |
| } |
| |
| func (c *Collator) iter(i int) *iter { |
| // TODO: evaluate performance for making the second iterator optional. |
| return &c._iter[i] |
| } |
| |
| // Supported returns the list of languages for which collating differs from its parent. |
| func Supported() []language.Tag { |
| ids := strings.Split(availableLocales, ",") |
| tags := make([]language.Tag, len(ids)) |
| for i, s := range ids { |
| tags[i] = language.Make(s) |
| } |
| return tags |
| } |
| |
| var matcher = language.NewMatcher(Supported()) |
| |
| // New returns a new Collator initialized for the given locale. |
| func New(t language.Tag, o ...Option) *Collator { |
| tt, index, _ := matcher.Match(t) |
| c := newCollator(colltab.Init(locales[index])) |
| |
| // Set the default options for the retrieved locale. |
| c.setFromTag(tt) |
| |
| // Set options from the user-supplied tag. |
| c.setFromTag(t) |
| |
| // Set the user-supplied options. |
| c.setOptions(o) |
| |
| return c |
| } |
| |
| // NewFromTable returns a new Collator for the given Weigher. |
| func NewFromTable(w colltab.Weigher, o ...Option) *Collator { |
| c := newCollator(w) |
| c.setOptions(o) |
| return c |
| } |
| |
| // Buffer holds keys generated by Key and KeyString. |
| type Buffer struct { |
| buf [4096]byte |
| key []byte |
| } |
| |
| func (b *Buffer) init() { |
| if b.key == nil { |
| b.key = b.buf[:0] |
| } |
| } |
| |
| // Reset clears the buffer from previous results generated by Key and KeyString. |
| func (b *Buffer) Reset() { |
| b.key = b.key[:0] |
| } |
| |
| // Compare returns an integer comparing the two byte slices. |
| // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. |
| func (c *Collator) Compare(a, b []byte) int { |
| // TODO: skip identical prefixes once we have a fast way to detect if a rune is |
| // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. |
| c.iter(0).setInput(a) |
| c.iter(1).setInput(b) |
| if res := c.compare(); res != 0 { |
| return res |
| } |
| if !c.ignore[colltab.Identity] { |
| return bytes.Compare(a, b) |
| } |
| return 0 |
| } |
| |
| // CompareString returns an integer comparing the two strings. |
| // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. |
| func (c *Collator) CompareString(a, b string) int { |
| // TODO: skip identical prefixes once we have a fast way to detect if a rune is |
| // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. |
| c.iter(0).setInputString(a) |
| c.iter(1).setInputString(b) |
| if res := c.compare(); res != 0 { |
| return res |
| } |
| if !c.ignore[colltab.Identity] { |
| if a < b { |
| return -1 |
| } else if a > b { |
| return 1 |
| } |
| } |
| return 0 |
| } |
| |
| func compareLevel(f func(i *iter) int, a, b *iter) int { |
| a.pce = 0 |
| b.pce = 0 |
| for { |
| va := f(a) |
| vb := f(b) |
| if va != vb { |
| if va < vb { |
| return -1 |
| } |
| return 1 |
| } else if va == 0 { |
| break |
| } |
| } |
| return 0 |
| } |
| |
| func (c *Collator) compare() int { |
| ia, ib := c.iter(0), c.iter(1) |
| // Process primary level |
| if c.alternate != altShifted { |
| // TODO: implement script reordering |
| if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { |
| return res |
| } |
| } else { |
| // TODO: handle shifted |
| } |
| if !c.ignore[colltab.Secondary] { |
| f := (*iter).nextSecondary |
| if c.backwards { |
| f = (*iter).prevSecondary |
| } |
| if res := compareLevel(f, ia, ib); res != 0 { |
| return res |
| } |
| } |
| // TODO: special case handling (Danish?) |
| if !c.ignore[colltab.Tertiary] || c.caseLevel { |
| if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { |
| return res |
| } |
| if !c.ignore[colltab.Quaternary] { |
| if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { |
| return res |
| } |
| } |
| } |
| return 0 |
| } |
| |
| // Key returns the collation key for str. |
| // Passing the buffer buf may avoid memory allocations. |
| // The returned slice will point to an allocation in Buffer and will remain |
| // valid until the next call to buf.Reset(). |
| func (c *Collator) Key(buf *Buffer, str []byte) []byte { |
| // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. |
| buf.init() |
| return c.key(buf, c.getColElems(str)) |
| } |
| |
| // KeyFromString returns the collation key for str. |
| // Passing the buffer buf may avoid memory allocations. |
| // The returned slice will point to an allocation in Buffer and will retain |
| // valid until the next call to buf.ResetKeys(). |
| func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { |
| // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. |
| buf.init() |
| return c.key(buf, c.getColElemsString(str)) |
| } |
| |
| func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte { |
| processWeights(c.alternate, c.t.Top(), w) |
| kn := len(buf.key) |
| c.keyFromElems(buf, w) |
| return buf.key[kn:] |
| } |
| |
| func (c *Collator) getColElems(str []byte) []colltab.Elem { |
| i := c.iter(0) |
| i.setInput(str) |
| for i.next() { |
| } |
| return i.ce |
| } |
| |
| func (c *Collator) getColElemsString(str string) []colltab.Elem { |
| i := c.iter(0) |
| i.setInputString(str) |
| for i.next() { |
| } |
| return i.ce |
| } |
| |
| type iter struct { |
| bytes []byte |
| str string |
| |
| wa [512]colltab.Elem |
| ce []colltab.Elem |
| pce int |
| nce int // nce <= len(nce) |
| |
| prevCCC uint8 |
| pStarter int |
| |
| t colltab.Weigher |
| } |
| |
| func (i *iter) init(c *Collator) { |
| i.t = c.t |
| i.ce = i.wa[:0] |
| } |
| |
| func (i *iter) reset() { |
| i.ce = i.ce[:0] |
| i.nce = 0 |
| i.prevCCC = 0 |
| i.pStarter = 0 |
| } |
| |
| func (i *iter) setInput(s []byte) *iter { |
| i.bytes = s |
| i.str = "" |
| i.reset() |
| return i |
| } |
| |
| func (i *iter) setInputString(s string) *iter { |
| i.str = s |
| i.bytes = nil |
| i.reset() |
| return i |
| } |
| |
| func (i *iter) done() bool { |
| return len(i.str) == 0 && len(i.bytes) == 0 |
| } |
| |
| func (i *iter) tail(n int) { |
| if i.bytes == nil { |
| i.str = i.str[n:] |
| } else { |
| i.bytes = i.bytes[n:] |
| } |
| } |
| |
| func (i *iter) appendNext() int { |
| var sz int |
| if i.bytes == nil { |
| i.ce, sz = i.t.AppendNextString(i.ce, i.str) |
| } else { |
| i.ce, sz = i.t.AppendNext(i.ce, i.bytes) |
| } |
| return sz |
| } |
| |
| // next appends Elems to the internal array until it adds an element with CCC=0. |
| // In the majority of cases, a Elem with a primary value > 0 will have |
| // a CCC of 0. The CCC values of colation elements are also used to detect if the |
| // input string was not normalized and to adjust the result accordingly. |
| func (i *iter) next() bool { |
| for !i.done() { |
| p0 := len(i.ce) |
| sz := i.appendNext() |
| i.tail(sz) |
| last := len(i.ce) - 1 |
| if ccc := i.ce[last].CCC(); ccc == 0 { |
| i.nce = len(i.ce) |
| i.pStarter = last |
| i.prevCCC = 0 |
| return true |
| } else if p0 < last && i.ce[p0].CCC() == 0 { |
| // set i.nce to only cover part of i.ce for which ccc == 0 and |
| // use rest the next call to next. |
| for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ { |
| } |
| i.nce = p0 |
| i.pStarter = p0 - 1 |
| i.prevCCC = ccc |
| return true |
| } else if ccc < i.prevCCC { |
| i.doNorm(p0, ccc) // should be rare for most common cases |
| } else { |
| i.prevCCC = ccc |
| } |
| } |
| if len(i.ce) != i.nce { |
| i.nce = len(i.ce) |
| return true |
| } |
| return false |
| } |
| |
| // nextPlain is the same as next, but does not "normalize" the collation |
| // elements. |
| // TODO: remove this function. Using this instead of next does not seem |
| // to improve performance in any significant way. We retain this until |
| // later for evaluation purposes. |
| func (i *iter) nextPlain() bool { |
| if i.done() { |
| return false |
| } |
| sz := i.appendNext() |
| i.tail(sz) |
| i.nce = len(i.ce) |
| return true |
| } |
| |
| const maxCombiningCharacters = 30 |
| |
| // doNorm reorders the collation elements in i.ce. |
| // It assumes that blocks of collation elements added with appendNext |
| // either start and end with the same CCC or start with CCC == 0. |
| // This allows for a single insertion point for the entire block. |
| // The correctness of this assumption is verified in builder.go. |
| func (i *iter) doNorm(p int, ccc uint8) { |
| if p-i.pStarter > maxCombiningCharacters { |
| i.prevCCC = i.ce[len(i.ce)-1].CCC() |
| i.pStarter = len(i.ce) - 1 |
| return |
| } |
| n := len(i.ce) |
| k := p |
| for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- { |
| } |
| i.ce = append(i.ce, i.ce[p:k]...) |
| copy(i.ce[p:], i.ce[k:]) |
| i.ce = i.ce[:n] |
| } |
| |
| func (i *iter) nextPrimary() int { |
| for { |
| for ; i.pce < i.nce; i.pce++ { |
| if v := i.ce[i.pce].Primary(); v != 0 { |
| i.pce++ |
| return v |
| } |
| } |
| if !i.next() { |
| return 0 |
| } |
| } |
| panic("should not reach here") |
| } |
| |
| func (i *iter) nextSecondary() int { |
| for ; i.pce < len(i.ce); i.pce++ { |
| if v := i.ce[i.pce].Secondary(); v != 0 { |
| i.pce++ |
| return v |
| } |
| } |
| return 0 |
| } |
| |
| func (i *iter) prevSecondary() int { |
| for ; i.pce < len(i.ce); i.pce++ { |
| if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 { |
| i.pce++ |
| return v |
| } |
| } |
| return 0 |
| } |
| |
| func (i *iter) nextTertiary() int { |
| for ; i.pce < len(i.ce); i.pce++ { |
| if v := i.ce[i.pce].Tertiary(); v != 0 { |
| i.pce++ |
| return int(v) |
| } |
| } |
| return 0 |
| } |
| |
| func (i *iter) nextQuaternary() int { |
| for ; i.pce < len(i.ce); i.pce++ { |
| if v := i.ce[i.pce].Quaternary(); v != 0 { |
| i.pce++ |
| return v |
| } |
| } |
| return 0 |
| } |
| |
| func appendPrimary(key []byte, p int) []byte { |
| // Convert to variable length encoding; supports up to 23 bits. |
| if p <= 0x7FFF { |
| key = append(key, uint8(p>>8), uint8(p)) |
| } else { |
| key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p)) |
| } |
| return key |
| } |
| |
| // keyFromElems converts the weights ws to a compact sequence of bytes. |
| // The result will be appended to the byte buffer in buf. |
| func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) { |
| for _, v := range ws { |
| if w := v.Primary(); w > 0 { |
| buf.key = appendPrimary(buf.key, w) |
| } |
| } |
| if !c.ignore[colltab.Secondary] { |
| buf.key = append(buf.key, 0, 0) |
| // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. |
| if !c.backwards { |
| for _, v := range ws { |
| if w := v.Secondary(); w > 0 { |
| buf.key = append(buf.key, uint8(w>>8), uint8(w)) |
| } |
| } |
| } else { |
| for i := len(ws) - 1; i >= 0; i-- { |
| if w := ws[i].Secondary(); w > 0 { |
| buf.key = append(buf.key, uint8(w>>8), uint8(w)) |
| } |
| } |
| } |
| } else if c.caseLevel { |
| buf.key = append(buf.key, 0, 0) |
| } |
| if !c.ignore[colltab.Tertiary] || c.caseLevel { |
| buf.key = append(buf.key, 0, 0) |
| for _, v := range ws { |
| if w := v.Tertiary(); w > 0 { |
| buf.key = append(buf.key, uint8(w)) |
| } |
| } |
| // Derive the quaternary weights from the options and other levels. |
| // Note that we represent MaxQuaternary as 0xFF. The first byte of the |
| // representation of a primary weight is always smaller than 0xFF, |
| // so using this single byte value will compare correctly. |
| if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted { |
| if c.alternate == altShiftTrimmed { |
| lastNonFFFF := len(buf.key) |
| buf.key = append(buf.key, 0) |
| for _, v := range ws { |
| if w := v.Quaternary(); w == colltab.MaxQuaternary { |
| buf.key = append(buf.key, 0xFF) |
| } else if w > 0 { |
| buf.key = appendPrimary(buf.key, w) |
| lastNonFFFF = len(buf.key) |
| } |
| } |
| buf.key = buf.key[:lastNonFFFF] |
| } else { |
| buf.key = append(buf.key, 0) |
| for _, v := range ws { |
| if w := v.Quaternary(); w == colltab.MaxQuaternary { |
| buf.key = append(buf.key, 0xFF) |
| } else if w > 0 { |
| buf.key = appendPrimary(buf.key, w) |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) { |
| ignore := false |
| vtop := int(top) |
| switch vw { |
| case altShifted, altShiftTrimmed: |
| for i := range wa { |
| if p := wa[i].Primary(); p <= vtop && p != 0 { |
| wa[i] = colltab.MakeQuaternary(p) |
| ignore = true |
| } else if p == 0 { |
| if ignore { |
| wa[i] = colltab.Ignore |
| } |
| } else { |
| ignore = false |
| } |
| } |
| case altBlanked: |
| for i := range wa { |
| if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) { |
| wa[i] = colltab.Ignore |
| ignore = true |
| } else { |
| ignore = false |
| } |
| } |
| } |
| } |