collate/build/colelem.go - text - Git at Google

 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package build

 import (
 	"fmt"
 	"unicode"

 	"golang.org/x/text/internal/colltab"
 )

 const (
 	defaultSecondary = 0x20
 	defaultTertiary  = 0x2
 	maxTertiary      = 0x1F
 )

 type rawCE struct {
 	w   []int
 	ccc uint8
 }

 func makeRawCE(w []int, ccc uint8) rawCE {
 	ce := rawCE{w: make([]int, 4), ccc: ccc}
 	copy(ce.w, w)
 	return ce
 }

 // A collation element is represented as an uint32.
 // In the typical case, a rune maps to a single collation element. If a rune
 // can be the start of a contraction or expands into multiple collation elements,
 // then the collation element that is associated with a rune will have a special
 // form to represent such m to n mappings.  Such special collation elements
 // have a value >= 0x80000000.

 const (
 	maxPrimaryBits   = 21
 	maxSecondaryBits = 12
 	maxTertiaryBits  = 8
 )

 func makeCE(ce rawCE) (uint32, error) {
 	v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
 	return uint32(v), e
 }

 // For contractions, collation elements are of the form
 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
 //   - n* is the size of the first node in the contraction trie.
 //   - i* is the index of the first node in the contraction trie.
 //   - b* is the offset into the contraction collation element table.
 // See contract.go for details on the contraction trie.
 const (
 	contractID            = 0xC0000000
 	maxNBits              = 4
 	maxTrieIndexBits      = 12
 	maxContractOffsetBits = 13
 )

 func makeContractIndex(h ctHandle, offset int) (uint32, error) {
 	if h.n >= 1<<maxNBits {
 		return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
 	}
 	if h.index >= 1<<maxTrieIndexBits {
 		return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
 	}
 	if offset >= 1<<maxContractOffsetBits {
 		return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
 	}
 	ce := uint32(contractID)
 	ce += uint32(offset << (maxNBits + maxTrieIndexBits))
 	ce += uint32(h.index << maxNBits)
 	ce += uint32(h.n)
 	return ce, nil
 }

 // For expansions, collation elements are of the form
 // 11100000 00000000 bbbbbbbb bbbbbbbb,
 // where b* is the index into the expansion sequence table.
 const (
 	expandID           = 0xE0000000
 	maxExpandIndexBits = 16
 )

 func makeExpandIndex(index int) (uint32, error) {
 	if index >= 1<<maxExpandIndexBits {
 		return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
 	}
 	return expandID + uint32(index), nil
 }

 // Each list of collation elements corresponding to an expansion starts with
 // a header indicating the length of the sequence.
 func makeExpansionHeader(n int) (uint32, error) {
 	return uint32(n), nil
 }

 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
 // sequence of collation elements, we decompose the rune and lookup the collation
 // elements for each rune in the decomposition and modify the tertiary weights.
 // The collation element, in this case, is of the form
 // 11110000 00000000 wwwwwwww vvvvvvvv, where
 //   - v* is the replacement tertiary weight for the first rune,
 //   - w* is the replacement tertiary weight for the second rune,
 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
 // See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
 const (
 	decompID = 0xF0000000
 )

 func makeDecompose(t1, t2 int) (uint32, error) {
 	if t1 >= 256 || t1 < 0 {
 		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
 	}
 	if t2 >= 256 || t2 < 0 {
 		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
 	}
 	return uint32(t2<<8+t1) + decompID, nil
 }

 const (
 	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
 	minUnified       rune = 0x4E00
 	maxUnified            = 0x9FFF
 	minCompatibility      = 0xF900
 	maxCompatibility      = 0xFAFF
 	minRare               = 0x3400
 	maxRare               = 0x4DBF
 )
 const (
 	commonUnifiedOffset = 0x10000
 	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
 	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
 	illegalOffset       = otherOffset + int(unicode.MaxRune)
 	maxPrimary          = illegalOffset + 1
 )

 // implicitPrimary returns the primary weight for the a rune
 // for which there is no entry for the rune in the collation table.
 // We take a different approach from the one specified in
 // https://unicode.org/reports/tr10/#Implicit_Weights,
 // but preserve the resulting relative ordering of the runes.
 func implicitPrimary(r rune) int {
 	if unicode.Is(unicode.Ideographic, r) {
 		if r >= minUnified && r <= maxUnified {
 			// The most common case for CJK.
 			return int(r) + commonUnifiedOffset
 		}
 		if r >= minCompatibility && r <= maxCompatibility {
 			// This will typically not hit. The DUCET explicitly specifies mappings
 			// for all characters that do not decompose.
 			return int(r) + commonUnifiedOffset
 		}
 		return int(r) + rareUnifiedOffset
 	}
 	return int(r) + otherOffset
 }

 // convertLargeWeights converts collation elements with large
 // primaries (either double primaries or for illegal runes)
 // to our own representation.
 // A CJK character C is represented in the DUCET as
 //   [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
 // We will rewrite these characters to a single CE.
 // We assume the CJK values start at 0x8000.
 // See https://unicode.org/reports/tr10/#Implicit_Weights
 func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
 	const (
 		cjkPrimaryStart   = 0xFB40
 		rarePrimaryStart  = 0xFB80
 		otherPrimaryStart = 0xFBC0
 		illegalPrimary    = 0xFFFE
 		highBitsMask      = 0x3F
 		lowBitsMask       = 0x7FFF
 		lowBitsFlag       = 0x8000
 		shiftBits         = 15
 	)
 	for i := 0; i < len(elems); i++ {
 		ce := elems[i].w
 		p := ce[0]
 		if p < cjkPrimaryStart {
 			continue
 		}
 		if p > 0xFFFF {
 			return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
 		}
 		if p >= illegalPrimary {
 			ce[0] = illegalOffset + p - illegalPrimary
 		} else {
 			if i+1 >= len(elems) {
 				return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
 			}
 			if elems[i+1].w[0]&lowBitsFlag == 0 {
 				return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
 			}
 			np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
 			switch {
 			case p < rarePrimaryStart:
 				np += commonUnifiedOffset
 			case p < otherPrimaryStart:
 				np += rareUnifiedOffset
 			default:
 				p += otherOffset
 			}
 			ce[0] = np
 			for j := i + 1; j+1 < len(elems); j++ {
 				elems[j] = elems[j+1]
 			}
 			elems = elems[:len(elems)-1]
 		}
 	}
 	return elems, nil
 }

 // nextWeight computes the first possible collation weights following elems
 // for the given level.
 func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
 	if level == colltab.Identity {
 		next := make([]rawCE, len(elems))
 		copy(next, elems)
 		return next
 	}
 	next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
 	next[0].w[level]++
 	if level < colltab.Secondary {
 		next[0].w[colltab.Secondary] = defaultSecondary
 	}
 	if level < colltab.Tertiary {
 		next[0].w[colltab.Tertiary] = defaultTertiary
 	}
 	// Filter entries that cannot influence ordering.
 	for _, ce := range elems[1:] {
 		skip := true
 		for i := colltab.Primary; i < level; i++ {
 			skip = skip && ce.w[i] == 0
 		}
 		if !skip {
 			next = append(next, ce)
 		}
 	}
 	return next
 }

 func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
 	for ; i < len(elems) && elems[i].w[level] == 0; i++ {
 	}
 	if i < len(elems) {
 		return i, elems[i].w[level]
 	}
 	return i, 0
 }

 // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
 // It also returns the collation level at which the difference is found.
 func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
 	for level := colltab.Primary; level < colltab.Identity; level++ {
 		var va, vb int
 		for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
 			ia, va = nextVal(a, ia, level)
 			ib, vb = nextVal(b, ib, level)
 			if va != vb {
 				if va < vb {
 					return -1, level
 				} else {
 					return 1, level
 				}
 			}
 		}
 	}
 	return 0, colltab.Identity
 }

 func equalCE(a, b rawCE) bool {
 	for i := 0; i < 3; i++ {
 		if b.w[i] != a.w[i] {
 			return false
 		}
 	}
 	return true
 }

 func equalCEArrays(a, b []rawCE) bool {
 	if len(a) != len(b) {
 		return false
 	}
 	for i := range a {
 		if !equalCE(a[i], b[i]) {
 			return false
 		}
 	}
 	return true
 }
	// Copyright 2012 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package build

	import (
	"fmt"
	"unicode"

	"golang.org/x/text/internal/colltab"
	)

	const (
	defaultSecondary = 0x20
	defaultTertiary = 0x2
	maxTertiary = 0x1F
	)

	type rawCE struct {
	w []int
	ccc uint8
	}

	func makeRawCE(w []int, ccc uint8) rawCE {
	ce := rawCE{w: make([]int, 4), ccc: ccc}
	copy(ce.w, w)
	return ce
	}

	// A collation element is represented as an uint32.
	// In the typical case, a rune maps to a single collation element. If a rune
	// can be the start of a contraction or expands into multiple collation elements,
	// then the collation element that is associated with a rune will have a special
	// form to represent such m to n mappings. Such special collation elements
	// have a value >= 0x80000000.

	const (
	maxPrimaryBits = 21
	maxSecondaryBits = 12
	maxTertiaryBits = 8
	)

	func makeCE(ce rawCE) (uint32, error) {
	v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
	return uint32(v), e
	}

	// For contractions, collation elements are of the form
	// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
	// - n* is the size of the first node in the contraction trie.
	// - i* is the index of the first node in the contraction trie.
	// - b* is the offset into the contraction collation element table.
	// See contract.go for details on the contraction trie.
	const (
	contractID = 0xC0000000
	maxNBits = 4
	maxTrieIndexBits = 12
	maxContractOffsetBits = 13
	)

	func makeContractIndex(h ctHandle, offset int) (uint32, error) {
	if h.n >= 1<<maxNBits {
	return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
	}
	if h.index >= 1<<maxTrieIndexBits {
	return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
	}
	if offset >= 1<<maxContractOffsetBits {
	return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
	}
	ce := uint32(contractID)
	ce += uint32(offset << (maxNBits + maxTrieIndexBits))
	ce += uint32(h.index << maxNBits)
	ce += uint32(h.n)
	return ce, nil
	}

	// For expansions, collation elements are of the form
	// 11100000 00000000 bbbbbbbb bbbbbbbb,
	// where b* is the index into the expansion sequence table.
	const (
	expandID = 0xE0000000
	maxExpandIndexBits = 16
	)

	func makeExpandIndex(index int) (uint32, error) {
	if index >= 1<<maxExpandIndexBits {
	return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
	}
	return expandID + uint32(index), nil
	}

	// Each list of collation elements corresponding to an expansion starts with
	// a header indicating the length of the sequence.
	func makeExpansionHeader(n int) (uint32, error) {
	return uint32(n), nil
	}

	// Some runes can be expanded using NFKD decomposition. Instead of storing the full
	// sequence of collation elements, we decompose the rune and lookup the collation
	// elements for each rune in the decomposition and modify the tertiary weights.
	// The collation element, in this case, is of the form
	// 11110000 00000000 wwwwwwww vvvvvvvv, where
	// - v* is the replacement tertiary weight for the first rune,
	// - w* is the replacement tertiary weight for the second rune,
	// Tertiary weights of subsequent runes should be replaced with maxTertiary.
	// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
	const (
	decompID = 0xF0000000
	)

	func makeDecompose(t1, t2 int) (uint32, error) {
	if t1 >= 256 \|\| t1 < 0 {
	return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
	}
	if t2 >= 256 \|\| t2 < 0 {
	return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
	}
	return uint32(t2<<8+t1) + decompID, nil
	}

	const (
	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
	minUnified rune = 0x4E00
	maxUnified = 0x9FFF
	minCompatibility = 0xF900
	maxCompatibility = 0xFAFF
	minRare = 0x3400
	maxRare = 0x4DBF
	)
	const (
	commonUnifiedOffset = 0x10000
	rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
	otherOffset = 0x50000 // largest rune in rare is U+2FA1D
	illegalOffset = otherOffset + int(unicode.MaxRune)
	maxPrimary = illegalOffset + 1
	)

	// implicitPrimary returns the primary weight for the a rune
	// for which there is no entry for the rune in the collation table.
	// We take a different approach from the one specified in
	// https://unicode.org/reports/tr10/#Implicit_Weights,
	// but preserve the resulting relative ordering of the runes.
	func implicitPrimary(r rune) int {
	if unicode.Is(unicode.Ideographic, r) {
	if r >= minUnified && r <= maxUnified {
	// The most common case for CJK.
	return int(r) + commonUnifiedOffset
	}
	if r >= minCompatibility && r <= maxCompatibility {
	// This will typically not hit. The DUCET explicitly specifies mappings
	// for all characters that do not decompose.
	return int(r) + commonUnifiedOffset
	}
	return int(r) + rareUnifiedOffset
	}
	return int(r) + otherOffset
	}

	// convertLargeWeights converts collation elements with large
	// primaries (either double primaries or for illegal runes)
	// to our own representation.
	// A CJK character C is represented in the DUCET as
	// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
	// We will rewrite these characters to a single CE.
	// We assume the CJK values start at 0x8000.
	// See https://unicode.org/reports/tr10/#Implicit_Weights
	func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
	const (
	cjkPrimaryStart = 0xFB40
	rarePrimaryStart = 0xFB80
	otherPrimaryStart = 0xFBC0
	illegalPrimary = 0xFFFE
	highBitsMask = 0x3F
	lowBitsMask = 0x7FFF
	lowBitsFlag = 0x8000
	shiftBits = 15
	)
	for i := 0; i < len(elems); i++ {
	ce := elems[i].w
	p := ce[0]
	if p < cjkPrimaryStart {
	continue
	}
	if p > 0xFFFF {
	return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
	}
	if p >= illegalPrimary {
	ce[0] = illegalOffset + p - illegalPrimary
	} else {
	if i+1 >= len(elems) {
	return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
	}
	if elems[i+1].w[0]&lowBitsFlag == 0 {
	return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
	}
	np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
	switch {
	case p < rarePrimaryStart:
	np += commonUnifiedOffset
	case p < otherPrimaryStart:
	np += rareUnifiedOffset
	default:
	p += otherOffset
	}
	ce[0] = np
	for j := i + 1; j+1 < len(elems); j++ {
	elems[j] = elems[j+1]
	}
	elems = elems[:len(elems)-1]
	}
	}
	return elems, nil
	}

	// nextWeight computes the first possible collation weights following elems
	// for the given level.
	func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
	if level == colltab.Identity {
	next := make([]rawCE, len(elems))
	copy(next, elems)
	return next
	}
	next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
	next[0].w[level]++
	if level < colltab.Secondary {
	next[0].w[colltab.Secondary] = defaultSecondary
	}
	if level < colltab.Tertiary {
	next[0].w[colltab.Tertiary] = defaultTertiary
	}
	// Filter entries that cannot influence ordering.
	for _, ce := range elems[1:] {
	skip := true
	for i := colltab.Primary; i < level; i++ {
	skip = skip && ce.w[i] == 0
	}
	if !skip {
	next = append(next, ce)
	}
	}
	return next
	}

	func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
	for ; i < len(elems) && elems[i].w[level] == 0; i++ {
	}
	if i < len(elems) {
	return i, elems[i].w[level]
	}
	return i, 0
	}

	// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
	// It also returns the collation level at which the difference is found.
	func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
	for level := colltab.Primary; level < colltab.Identity; level++ {
	var va, vb int
	for ia, ib := 0, 0; ia < len(a) \|\| ib < len(b); ia, ib = ia+1, ib+1 {
	ia, va = nextVal(a, ia, level)
	ib, vb = nextVal(b, ib, level)
	if va != vb {
	if va < vb {
	return -1, level
	} else {
	return 1, level
	}
	}
	}
	}
	return 0, colltab.Identity
	}

	func equalCE(a, b rawCE) bool {
	for i := 0; i < 3; i++ {
	if b.w[i] != a.w[i] {
	return false
	}
	}
	return true
	}

	func equalCEArrays(a, b []rawCE) bool {
	if len(a) != len(b) {
	return false
	}
	for i := range a {
	if !equalCE(a[i], b[i]) {
	return false
	}
	}
	return true
	}