| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package colltab |
| |
| import ( |
| "fmt" |
| "unicode" |
| ) |
| |
| // Level identifies the collation comparison level. |
| // The primary level corresponds to the basic sorting of text. |
| // The secondary level corresponds to accents and related linguistic elements. |
| // The tertiary level corresponds to casing and related concepts. |
| // The quaternary level is derived from the other levels by the |
| // various algorithms for handling variable elements. |
| type Level int |
| |
| const ( |
| Primary Level = iota |
| Secondary |
| Tertiary |
| Quaternary |
| Identity |
| |
| NumLevels |
| ) |
| |
| const ( |
| defaultSecondary = 0x20 |
| defaultTertiary = 0x2 |
| maxTertiary = 0x1F |
| MaxQuaternary = 0x1FFFFF // 21 bits. |
| ) |
| |
| // Elem is a representation of a collation element. This API provides ways to encode |
| // and decode Elems. Implementations of collation tables may use values greater |
| // or equal to PrivateUse for their own purposes. However, these should never be |
| // returned by AppendNext. |
| type Elem uint32 |
| |
| const ( |
| maxCE Elem = 0xAFFFFFFF |
| PrivateUse = minContract |
| minContract = 0xC0000000 |
| maxContract = 0xDFFFFFFF |
| minExpand = 0xE0000000 |
| maxExpand = 0xEFFFFFFF |
| minDecomp = 0xF0000000 |
| ) |
| |
| type ceType int |
| |
| const ( |
| ceNormal ceType = iota // ceNormal includes implicits (ce == 0) |
| ceContractionIndex // rune can be a start of a contraction |
| ceExpansionIndex // rune expands into a sequence of collation elements |
| ceDecompose // rune expands using NFKC decomposition |
| ) |
| |
| func (ce Elem) ctype() ceType { |
| if ce <= maxCE { |
| return ceNormal |
| } |
| if ce <= maxContract { |
| return ceContractionIndex |
| } else { |
| if ce <= maxExpand { |
| return ceExpansionIndex |
| } |
| return ceDecompose |
| } |
| panic("should not reach here") |
| return ceType(-1) |
| } |
| |
| // For normal collation elements, we assume that a collation element either has |
| // a primary or non-default secondary value, not both. |
| // Collation elements with a primary value are of the form |
| // 01pppppp pppppppp ppppppp0 ssssssss |
| // - p* is primary collation value |
| // - s* is the secondary collation value |
| // 00pppppp pppppppp ppppppps sssttttt, where |
| // - p* is primary collation value |
| // - s* offset of secondary from default value. |
| // - t* is the tertiary collation value |
| // 100ttttt cccccccc pppppppp pppppppp |
| // - t* is the tertiar collation value |
| // - c* is the canonical combining class |
| // - p* is the primary collation value |
| // Collation elements with a secondary value are of the form |
| // 1010cccc ccccssss ssssssss tttttttt, where |
| // - c* is the canonical combining class |
| // - s* is the secondary collation value |
| // - t* is the tertiary collation value |
| // 11qqqqqq qqqqqqqq qqqqqqq0 00000000 |
| // - q* quaternary value |
| const ( |
| ceTypeMask = 0xC0000000 |
| ceTypeMaskExt = 0xE0000000 |
| ceIgnoreMask = 0xF00FFFFF |
| ceType1 = 0x40000000 |
| ceType2 = 0x00000000 |
| ceType3or4 = 0x80000000 |
| ceType4 = 0xA0000000 |
| ceTypeQ = 0xC0000000 |
| Ignore = ceType4 |
| firstNonPrimary = 0x80000000 |
| lastSpecialPrimary = 0xA0000000 |
| secondaryMask = 0x80000000 |
| hasTertiaryMask = 0x40000000 |
| primaryValueMask = 0x3FFFFE00 |
| maxPrimaryBits = 21 |
| compactPrimaryBits = 16 |
| maxSecondaryBits = 12 |
| maxTertiaryBits = 8 |
| maxCCCBits = 8 |
| maxSecondaryCompactBits = 8 |
| maxSecondaryDiffBits = 4 |
| maxTertiaryCompactBits = 5 |
| primaryShift = 9 |
| compactSecondaryShift = 5 |
| minCompactSecondary = defaultSecondary - 4 |
| ) |
| |
| func makeImplicitCE(primary int) Elem { |
| return ceType1 | Elem(primary<<primaryShift) | defaultSecondary |
| } |
| |
| // MakeElem returns an Elem for the given values. It will return an error |
| // if the given combination of values is invalid. |
| func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) { |
| if w := primary; w >= 1<<maxPrimaryBits || w < 0 { |
| return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits) |
| } |
| if w := secondary; w >= 1<<maxSecondaryBits || w < 0 { |
| return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits) |
| } |
| if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 { |
| return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits) |
| } |
| ce := Elem(0) |
| if primary != 0 { |
| if ccc != 0 { |
| if primary >= 1<<compactPrimaryBits { |
| return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits) |
| } |
| if secondary != defaultSecondary { |
| return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc) |
| } |
| ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits)) |
| ce |= Elem(ccc) << compactPrimaryBits |
| ce |= Elem(primary) |
| ce |= ceType3or4 |
| } else if tertiary == defaultTertiary { |
| if secondary >= 1<<maxSecondaryCompactBits { |
| return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits) |
| } |
| ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary) |
| ce |= ceType1 |
| } else { |
| d := secondary - defaultSecondary + maxSecondaryDiffBits |
| if d >= 1<<maxSecondaryDiffBits || d < 0 { |
| return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits) |
| } |
| if tertiary >= 1<<maxTertiaryCompactBits { |
| return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits) |
| } |
| ce = Elem(primary<<maxSecondaryDiffBits + d) |
| ce = ce<<maxTertiaryCompactBits + Elem(tertiary) |
| } |
| } else { |
| ce = Elem(secondary<<maxTertiaryBits + tertiary) |
| ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits) |
| ce |= ceType4 |
| } |
| return ce, nil |
| } |
| |
| // MakeQuaternary returns an Elem with the given quaternary value. |
| func MakeQuaternary(v int) Elem { |
| return ceTypeQ | Elem(v<<primaryShift) |
| } |
| |
| // Mask sets weights for any level smaller than l to 0. |
| // The resulting Elem can be used to test for equality with |
| // other Elems to which the same mask has been applied. |
| func (ce Elem) Mask(l Level) uint32 { |
| return 0 |
| } |
| |
| // CCC returns the canonical combining class associated with the underlying character, |
| // if applicable, or 0 otherwise. |
| func (ce Elem) CCC() uint8 { |
| if ce&ceType3or4 != 0 { |
| if ce&ceType4 == ceType3or4 { |
| return uint8(ce >> 16) |
| } |
| return uint8(ce >> 20) |
| } |
| return 0 |
| } |
| |
| // Primary returns the primary collation weight for ce. |
| func (ce Elem) Primary() int { |
| if ce >= firstNonPrimary { |
| if ce > lastSpecialPrimary { |
| return 0 |
| } |
| return int(uint16(ce)) |
| } |
| return int(ce&primaryValueMask) >> primaryShift |
| } |
| |
| // Secondary returns the secondary collation weight for ce. |
| func (ce Elem) Secondary() int { |
| switch ce & ceTypeMask { |
| case ceType1: |
| return int(uint8(ce)) |
| case ceType2: |
| return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF) |
| case ceType3or4: |
| if ce < ceType4 { |
| return defaultSecondary |
| } |
| return int(ce>>8) & 0xFFF |
| case ceTypeQ: |
| return 0 |
| } |
| panic("should not reach here") |
| } |
| |
| // Tertiary returns the tertiary collation weight for ce. |
| func (ce Elem) Tertiary() uint8 { |
| if ce&hasTertiaryMask == 0 { |
| if ce&ceType3or4 == 0 { |
| return uint8(ce & 0x1F) |
| } |
| if ce&ceType4 == ceType4 { |
| return uint8(ce) |
| } |
| return uint8(ce>>24) & 0x1F // type 2 |
| } else if ce&ceTypeMask == ceType1 { |
| return defaultTertiary |
| } |
| // ce is a quaternary value. |
| return 0 |
| } |
| |
| func (ce Elem) updateTertiary(t uint8) Elem { |
| if ce&ceTypeMask == ceType1 { |
| // convert to type 4 |
| nce := ce & primaryValueMask |
| nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift |
| ce = nce |
| } else if ce&ceTypeMaskExt == ceType3or4 { |
| ce &= ^Elem(maxTertiary << 24) |
| return ce | (Elem(t) << 24) |
| } else { |
| // type 2 or 4 |
| ce &= ^Elem(maxTertiary) |
| } |
| return ce | Elem(t) |
| } |
| |
| // Quaternary returns the quaternary value if explicitly specified, |
| // 0 if ce == Ignore, or MaxQuaternary otherwise. |
| // Quaternary values are used only for shifted variants. |
| func (ce Elem) Quaternary() int { |
| if ce&ceTypeMask == ceTypeQ { |
| return int(ce&primaryValueMask) >> primaryShift |
| } else if ce&ceIgnoreMask == Ignore { |
| return 0 |
| } |
| return MaxQuaternary |
| } |
| |
| // Weight returns the collation weight for the given level. |
| func (ce Elem) Weight(l Level) int { |
| switch l { |
| case Primary: |
| return ce.Primary() |
| case Secondary: |
| return ce.Secondary() |
| case Tertiary: |
| return int(ce.Tertiary()) |
| case Quaternary: |
| return ce.Quaternary() |
| } |
| return 0 // return 0 (ignore) for undefined levels. |
| } |
| |
| // For contractions, collation elements are of the form |
| // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where |
| // - n* is the size of the first node in the contraction trie. |
| // - i* is the index of the first node in the contraction trie. |
| // - b* is the offset into the contraction collation element table. |
| // See contract.go for details on the contraction trie. |
| const ( |
| maxNBits = 4 |
| maxTrieIndexBits = 12 |
| maxContractOffsetBits = 13 |
| ) |
| |
| func splitContractIndex(ce Elem) (index, n, offset int) { |
| n = int(ce & (1<<maxNBits - 1)) |
| ce >>= maxNBits |
| index = int(ce & (1<<maxTrieIndexBits - 1)) |
| ce >>= maxTrieIndexBits |
| offset = int(ce & (1<<maxContractOffsetBits - 1)) |
| return |
| } |
| |
| // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb, |
| // where b* is the index into the expansion sequence table. |
| const maxExpandIndexBits = 16 |
| |
| func splitExpandIndex(ce Elem) (index int) { |
| return int(uint16(ce)) |
| } |
| |
| // Some runes can be expanded using NFKD decomposition. Instead of storing the full |
| // sequence of collation elements, we decompose the rune and lookup the collation |
| // elements for each rune in the decomposition and modify the tertiary weights. |
| // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where |
| // - v* is the replacement tertiary weight for the first rune, |
| // - w* is the replacement tertiary weight for the second rune, |
| // Tertiary weights of subsequent runes should be replaced with maxTertiary. |
| // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details. |
| func splitDecompose(ce Elem) (t1, t2 uint8) { |
| return uint8(ce), uint8(ce >> 8) |
| } |
| |
| const ( |
| // These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf. |
| minUnified rune = 0x4E00 |
| maxUnified = 0x9FFF |
| minCompatibility = 0xF900 |
| maxCompatibility = 0xFAFF |
| minRare = 0x3400 |
| maxRare = 0x4DBF |
| ) |
| const ( |
| commonUnifiedOffset = 0x10000 |
| rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF |
| otherOffset = 0x50000 // largest rune in rare is U+2FA1D |
| illegalOffset = otherOffset + int(unicode.MaxRune) |
| maxPrimary = illegalOffset + 1 |
| ) |
| |
| // implicitPrimary returns the primary weight for the a rune |
| // for which there is no entry for the rune in the collation table. |
| // We take a different approach from the one specified in |
| // https://unicode.org/reports/tr10/#Implicit_Weights, |
| // but preserve the resulting relative ordering of the runes. |
| func implicitPrimary(r rune) int { |
| if unicode.Is(unicode.Ideographic, r) { |
| if r >= minUnified && r <= maxUnified { |
| // The most common case for CJK. |
| return int(r) + commonUnifiedOffset |
| } |
| if r >= minCompatibility && r <= maxCompatibility { |
| // This will typically not hit. The DUCET explicitly specifies mappings |
| // for all characters that do not decompose. |
| return int(r) + commonUnifiedOffset |
| } |
| return int(r) + rareUnifiedOffset |
| } |
| return int(r) + otherOffset |
| } |