| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package colltab |
| |
| import ( |
| "unicode" |
| "unicode/utf8" |
| ) |
| |
| // NewNumericWeighter wraps w to replace individual digits to sort based on their |
| // numeric value. |
| // |
| // Weighter w must have a free primary weight after the primary weight for 9. |
| // If this is not the case, numeric value will sort at the same primary level |
| // as the first primary sorting after 9. |
| func NewNumericWeighter(w Weighter) Weighter { |
| getElem := func(s string) Elem { |
| elems, _ := w.AppendNextString(nil, s) |
| return elems[0] |
| } |
| nine := getElem("9") |
| |
| // Numbers should order before zero, but the DUCET has no room for this. |
| // TODO: move before zero once we use fractional collation elements. |
| ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0) |
| |
| return &numericWeighter{ |
| Weighter: w, |
| |
| // We assume that w sorts digits of different kinds in order of numeric |
| // value and that the tertiary weight order is preserved. |
| // |
| // TODO: evaluate whether it is worth basing the ranges on the Elem |
| // encoding itself once the move to fractional weights is complete. |
| zero: getElem("0"), |
| zeroSpecialLo: getElem("0"), // U+FF10 FULLWIDTH DIGIT ZERO |
| zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO |
| nine: nine, |
| nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE |
| numberStart: ns, |
| } |
| } |
| |
| // A numericWeighter translates a stream of digits into a stream of weights |
| // representing the numeric value. |
| type numericWeighter struct { |
| Weighter |
| |
| // The Elems below all demarcate boundaries of specific ranges. With the |
| // current element encoding digits are in two ranges: normal (default |
| // tertiary value) and special. For most languages, digits have collation |
| // elements in the normal range. |
| // |
| // Note: the range tests are very specific for the element encoding used by |
| // this implementation. The tests in collate_test.go are designed to fail |
| // if this code is not updated when an encoding has changed. |
| |
| zero Elem // normal digit zero |
| zeroSpecialLo Elem // special digit zero, low tertiary value |
| zeroSpecialHi Elem // special digit zero, high tertiary value |
| nine Elem // normal digit nine |
| nineSpecialHi Elem // special digit nine |
| numberStart Elem |
| } |
| |
| // AppendNext calls the namesake of the underlying weigher, but replaces single |
| // digits with weights representing their value. |
| func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) { |
| ce, n = nw.Weighter.AppendNext(buf, s) |
| nc := numberConverter{ |
| elems: buf, |
| w: nw, |
| b: s, |
| } |
| isZero, ok := nc.checkNextDigit(ce) |
| if !ok { |
| return ce, n |
| } |
| // ce might have been grown already, so take it instead of buf. |
| nc.init(ce, len(buf), isZero) |
| for n < len(s) { |
| ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:]) |
| nc.b = s |
| n += sz |
| if !nc.update(ce) { |
| break |
| } |
| } |
| return nc.result(), n |
| } |
| |
| // AppendNextString calls the namesake of the underlying weigher, but replaces |
| // single digits with weights representing their value. |
| func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) { |
| ce, n = nw.Weighter.AppendNextString(buf, s) |
| nc := numberConverter{ |
| elems: buf, |
| w: nw, |
| s: s, |
| } |
| isZero, ok := nc.checkNextDigit(ce) |
| if !ok { |
| return ce, n |
| } |
| nc.init(ce, len(buf), isZero) |
| for n < len(s) { |
| ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:]) |
| nc.s = s |
| n += sz |
| if !nc.update(ce) { |
| break |
| } |
| } |
| return nc.result(), n |
| } |
| |
| type numberConverter struct { |
| w *numericWeighter |
| |
| elems []Elem |
| nDigits int |
| lenIndex int |
| |
| s string // set if the input was of type string |
| b []byte // set if the input was of type []byte |
| } |
| |
| // init completes initialization of a numberConverter and prepares it for adding |
| // more digits. elems is assumed to have a digit starting at oldLen. |
| func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) { |
| // Insert a marker indicating the start of a number and and a placeholder |
| // for the number of digits. |
| if isZero { |
| elems = append(elems[:oldLen], nc.w.numberStart, 0) |
| } else { |
| elems = append(elems, 0, 0) |
| copy(elems[oldLen+2:], elems[oldLen:]) |
| elems[oldLen] = nc.w.numberStart |
| elems[oldLen+1] = 0 |
| |
| nc.nDigits = 1 |
| } |
| nc.elems = elems |
| nc.lenIndex = oldLen + 1 |
| } |
| |
| // checkNextDigit reports whether bufNew adds a single digit relative to the old |
| // buffer. If it does, it also reports whether this digit is zero. |
| func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) { |
| if len(nc.elems) >= len(bufNew) { |
| return false, false |
| } |
| e := bufNew[len(nc.elems)] |
| if e < nc.w.zeroSpecialLo || nc.w.nine < e { |
| // Not a number. |
| return false, false |
| } |
| if e < nc.w.zero { |
| if e > nc.w.nineSpecialHi { |
| // Not a number. |
| return false, false |
| } |
| if !nc.isDigit() { |
| return false, false |
| } |
| isZero = e <= nc.w.zeroSpecialHi |
| } else { |
| // This is the common case if we encounter a digit. |
| isZero = e == nc.w.zero |
| } |
| // Test the remaining added collation elements have a zero primary value. |
| if n := len(bufNew) - len(nc.elems); n > 1 { |
| for i := len(nc.elems) + 1; i < len(bufNew); i++ { |
| if bufNew[i].Primary() != 0 { |
| return false, false |
| } |
| } |
| // In some rare cases, collation elements will encode runes in |
| // unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371) |
| // are not in Nd. Also some digits that clearly belong in unicode.No, |
| // like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have |
| // collation elements indistinguishable from normal digits. |
| // Unfortunately, this means we need to make this check for nearly all |
| // non-Latin digits. |
| // |
| // TODO: check the performance impact and find something better if it is |
| // an issue. |
| if !nc.isDigit() { |
| return false, false |
| } |
| } |
| return isZero, true |
| } |
| |
| func (nc *numberConverter) isDigit() bool { |
| if nc.b != nil { |
| r, _ := utf8.DecodeRune(nc.b) |
| return unicode.In(r, unicode.Nd) |
| } |
| r, _ := utf8.DecodeRuneInString(nc.s) |
| return unicode.In(r, unicode.Nd) |
| } |
| |
| // We currently support a maximum of about 2M digits (the number of primary |
| // values). Such numbers will compare correctly against small numbers, but their |
| // comparison against other large numbers is undefined. |
| // |
| // TODO: define a proper fallback, such as comparing large numbers textually or |
| // actually allowing numbers of unlimited length. |
| // |
| // TODO: cap this to a lower number (like 100) and maybe allow a larger number |
| // in an option? |
| const maxDigits = 1<<maxPrimaryBits - 1 |
| |
| func (nc *numberConverter) update(elems []Elem) bool { |
| isZero, ok := nc.checkNextDigit(elems) |
| if nc.nDigits == 0 && isZero { |
| return true |
| } |
| nc.elems = elems |
| if !ok { |
| return false |
| } |
| nc.nDigits++ |
| return nc.nDigits < maxDigits |
| } |
| |
| // result fills in the length element for the digit sequence and returns the |
| // completed collation elements. |
| func (nc *numberConverter) result() []Elem { |
| e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0) |
| nc.elems[nc.lenIndex] = e |
| return nc.elems |
| } |