collate/build/order.go - text - Git at Google

 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package build

 import (
 	"fmt"
 	"log"
 	"sort"
 	"strings"
 	"unicode"

 	"golang.org/x/text/internal/colltab"
 	"golang.org/x/text/unicode/norm"
 )

 // logicalAnchor tells whether an entry is a logical reset position and, if
 // so, on which side it sorts relative to entries with equal weights (see
 // entryLess).
 type logicalAnchor int

 // Every constant carries the logicalAnchor type explicitly. In a const group
 // an explicit value does not inherit the type of the preceding line, so
 // leaving the type off would silently produce untyped integer constants.
 const (
 	firstAnchor logicalAnchor = -1 // anchor that sorts before entries with equal weights
 	noAnchor    logicalAnchor = 0  // regular entry, not an anchor
 	lastAnchor  logicalAnchor = 1  // anchor that sorts after entries with equal weights
 )

 // entry tracks one entry of the collation element table while the table is
 // being built. The Default Unicode Collation Element Table contains examples
 // of such entries; see https://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
 type entry struct {
 	str    string  // identical to string(runes)
 	runes  []rune  // more than one rune makes the entry a contraction
 	elems  []rawCE // the collation elements; cleared by remove
 	extend string  // weights of extend are to be appended to elems
 	before bool    // weights are relative to the next entry instead of the previous
 	lock   bool    // entry is used in an extension and can no longer be moved

 	// The links and level below are used to keep track of tailorings.
 	prev       *entry        // preceding entry in the sorted chain
 	next       *entry        // following entry in the sorted chain
 	level      colltab.Level // next differs from this entry at this level
 	skipRemove bool          // remove leaves the links untouched and resets this flag

 	decompose bool          // elems can be generated using NFKD decomposition
 	exclude   bool          // entry is not included in the table
 	implicit  bool          // derived; not included in the list
 	modified  bool          // entry was modified in tailoring
 	logical   logicalAnchor // noAnchor unless the entry is a logical anchor

 	expansionIndex    int      // index into the expansion table
 	contractionHandle ctHandle // a non-zero n marks a contraction starter
 	contractionIndex  int      // index into the contraction elements
 }

 // String renders the runes, string form, collation elements and the
 // contraction/expansion bookkeeping of e for diagnostic output.
 func (e *entry) String() string {
 	const format = "%X (%q) -> %X (ch:%x; ci:%d, ei:%d)"
 	return fmt.Sprintf(format, e.runes, e.str, e.elems,
 		e.contractionHandle, e.contractionIndex, e.expansionIndex)
 }

 // skip reports whether e must not be encoded as a collation element of its
 // own (encode aborts for such entries). This holds exactly for contractions,
 // i.e. entries made up of more than one rune.
 func (e *entry) skip() bool {
 	return len(e.runes) > 1
 }

 // expansion reports whether e has more than one collation element and is not
 // flagged as decomposable.
 func (e *entry) expansion() bool {
 	if e.decompose {
 		return false
 	}
 	return len(e.elems) > 1
 }

 // contraction reports whether e consists of at least two runes.
 func (e *entry) contraction() bool {
 	return len(e.runes) >= 2
 }

 // contractionStarter reports whether the contraction handle of e has a
 // non-zero n.
 func (e *entry) contractionStarter() bool {
 	handle := e.contractionHandle
 	return handle.n != 0
 }

 // nextIndexed walks the chain after e and returns the first entry that has to
 // be stored in the table, i.e. one that is not excluded and still has
 // collation elements. The returned level is the smallest level found on e and
 // on every entry that was passed over. The returned entry is nil when the
 // chain ends first.
 // Entries that can be derived explicitly and logical reset positions are
 // examples of entries that are not indexed.
 func (e *entry) nextIndexed() (*entry, colltab.Level) {
 	minLevel := e.level
 	cur := e.next
 	for cur != nil {
 		if !cur.exclude && len(cur.elems) != 0 {
 			break
 		}
 		if cur.level < minLevel {
 			minLevel = cur.level
 		}
 		cur = cur.next
 	}
 	return cur, minLevel
 }

 // remove clears the collation elements of e and, unless skipRemove is set,
 // takes e out of the sorted chain by connecting its neighbours to each other.
 // skipRemove is always reset. Logical anchors may not be removed; as the
 // front and the end of the list are always logical anchors, e is never at
 // either end of the list.
 func (e *entry) remove() {
 	if e.logical != noAnchor {
 		log.Fatalf("may not remove anchor %q", e.str)
 	}
 	// TODO: need to set e.prev.level to e.level if e.level is smaller?
 	e.elems = nil

 	unlink := !e.skipRemove
 	e.skipRemove = false
 	if !unlink {
 		return
 	}
 	if p := e.prev; p != nil {
 		p.next = e.next
 	}
 	if n := e.next; n != nil {
 		n.prev = e.prev
 	}
 }

 // insertAfter removes n from its current position and links it in directly
 // behind e. It panics when e is nil or when e and n are the same entry.
 func (e *entry) insertAfter(n *entry) {
 	switch {
 	case e == n:
 		panic("e == anchor")
 	case e == nil:
 		panic("unexpected nil anchor")
 	}
 	n.remove()
 	n.decompose = false // redo decomposition test

 	// Read the successor only now: removing n may have changed e.next.
 	succ := e.next
 	n.prev, n.next = e, succ
 	if succ != nil {
 		succ.prev = n
 	}
 	e.next = n
 }

 // insertBefore removes n from its current position and links it in directly
 // in front of e. It panics when e is nil or when e and n are the same entry.
 func (e *entry) insertBefore(n *entry) {
 	switch {
 	case e == n:
 		panic("e == anchor")
 	case e == nil:
 		panic("unexpected nil anchor")
 	}
 	n.remove()
 	n.decompose = false // redo decomposition test

 	// Read the predecessor only now: removing n may have changed e.prev.
 	pred := e.prev
 	n.prev, n.next = pred, e
 	if pred != nil {
 		pred.next = n
 	}
 	e.prev = n
 }

 // encodeBase encodes e either as an index into the expansion table, when e is
 // an expansion, or as a collation element built from its first element.
 // Entries flagged for decomposition are not handled here and abort the
 // program.
 func (e *entry) encodeBase() (ce uint32, err error) {
 	if e.expansion() {
 		return makeExpandIndex(e.expansionIndex)
 	}
 	if e.decompose {
 		log.Fatal("decompose should be handled elsewhere")
 	}
 	return makeCE(e.elems[0])
 }

 // encode returns the encoded collation element for e. Decomposable entries
 // are encoded from the tertiary weights (w[2]) of their first two elements,
 // contraction starters as an index into the contraction data, and everything
 // else through encodeBase. Entries for which skip reports true abort the
 // program.
 func (e *entry) encode() (ce uint32, err error) {
 	if e.skip() {
 		log.Fatal("cannot build colElem for entry that should be skipped")
 	}
 	if e.decompose {
 		first := e.elems[0].w[2]
 		second := 0 // stays zero when there is no second element
 		if len(e.elems) > 1 {
 			second = e.elems[1].w[2]
 		}
 		return makeDecompose(first, second)
 	}
 	if e.contractionStarter() {
 		return makeContractIndex(e.contractionHandle, e.contractionIndex)
 	}
 	if len(e.runes) > 1 {
 		log.Fatal("colElem: contractions are handled in contraction trie")
 	}
 	return e.encodeBase()
 }

 // entryLess reports whether a sorts before b. The collation weights decide
 // first. For equal weights a first anchor sorts in front and a last anchor
 // sorts behind, with a being inspected before b. Otherwise the string forms
 // are compared.
 func entryLess(a, b *entry) bool {
 	res, _ := compareWeights(a.elems, b.elems)
 	switch {
 	case res != 0:
 		return res == -1
 	case a.logical != noAnchor:
 		return a.logical == firstAnchor
 	case b.logical != noAnchor:
 		return b.logical == lastAnchor
 	default:
 		return a.str < b.str
 	}
 }

 // sortedEntries implements sort.Interface for a slice of entries, ordering
 // them with entryLess.
 type sortedEntries []*entry

 // Len returns the number of entries.
 func (es sortedEntries) Len() int { return len(es) }

 // Swap exchanges the entries at positions i and j.
 func (es sortedEntries) Swap(i, j int) {
 	es[j], es[i] = es[i], es[j]
 }

 // Less reports whether the entry at position i sorts before the one at j.
 func (es sortedEntries) Less(i, j int) bool {
 	a, b := es[i], es[j]
 	return entryLess(a, b)
 }

 // ordering holds a set of entries, both indexed by key and as a list.
 type ordering struct {
 	// NOTE(review): presumably an identifier for this ordering; it is not
 	// used in this file - confirm against callers.
 	id string
 	// entryMap indexes entries by their string. Logical anchors are indexed
 	// under their "[name]" and "<name/>" keys instead.
 	entryMap map[string]*entry
 	// ordered lists all inserted entries. insert only appends, so the list
 	// is in sorted order only after sort has been called.
 	ordered []*entry
 	// NOTE(review): handle is not used in this file; its role is defined by
 	// trieHandle elsewhere - confirm.
 	handle *trieHandle
 }

 // insert registers e in entryMap and appends it to ordered. Regular entries
 // are keyed by their string. Logical anchors are keyed twice: once in the
 // "[name]" format used in UCA rules and once in the "<name/>" XML format,
 // with spaces replaced by underscores.
 // As e is only appended, o.sort() has to be called to reattain a sorted
 // order.
 func (o *ordering) insert(e *entry) {
 	if e.logical != noAnchor {
 		ucaKey := "[" + e.str + "]"
 		xmlKey := "<" + strings.Replace(e.str, " ", "_", -1) + "/>"
 		o.entryMap[ucaKey] = e
 		o.entryMap[xmlKey] = e
 	} else {
 		o.entryMap[e.str] = e
 	}
 	o.ordered = append(o.ordered, e)
 }

 // newEntry builds an entry for string s with collation elements ces, inserts
 // it into the index and returns it.
 func (o *ordering) newEntry(s string, ces []rawCE) *entry {
 	e := new(entry)
 	e.str = s
 	e.runes = []rune(s)
 	e.elems = ces
 	o.insert(e)
 	return e
 }

 // find returns the entry indexed under str. When there is none and str is a
 // single rune, an implicit entry is created, inserted and marked as excluded
 // from the table: Hangul syllables get the concatenated elements of their NFD
 // decomposition, any other rune gets implicit weights derived from the rune.
 // find returns nil if str is not indexed and does not consist of exactly one
 // rune.
 func (o *ordering) find(str string) *entry {
 	if found := o.entryMap[str]; found != nil {
 		return found
 	}
 	runes := []rune(str)
 	if len(runes) != 1 {
 		return nil
 	}
 	const (
 		firstHangul = 0xAC00
 		lastHangul  = 0xD7A3
 	)
 	r := runes[0]
 	var e *entry
 	if firstHangul <= r && r <= lastHangul {
 		ces := []rawCE{}
 		nfd := norm.NFD.String(str)
 		for _, part := range nfd {
 			ces = append(ces, o.find(string(part)).elems...)
 		}
 		e = o.newEntry(nfd, ces)
 	} else {
 		weights := []int{
 			implicitPrimary(r),
 			defaultSecondary,
 			defaultTertiary,
 			int(r),
 		}
 		e = o.newEntry(string(r), []rawCE{{w: weights}})
 		e.modified = true
 	}
 	e.exclude = true // do not index implicits
 	return e
 }

 // makeRootOrdering creates an ordering that contains only the logical reset
 // points that can serve as anchors. All of them are excluded from the table.
 // The anchors first_tertiary_ignorable and __END__ always sort at the
 // beginning and the end, respectively, so prev and next are non-nil for any
 // indexed entry.
 func makeRootOrdering() ordering {
 	const maxRune = unicode.MaxRune
 	root := ordering{
 		entryMap: make(map[string]*entry),
 	}
 	anchors := []struct {
 		typ     logicalAnchor
 		name    string
 		weights []int
 	}{
 		{firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0}},
 		{lastAnchor, "last tertiary ignorable", []int{0, 0, 0, maxRune}},
 		{lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, maxRune}},
 		{lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, maxRune}},
 		{lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, maxRune}},
 	}
 	for _, a := range anchors {
 		root.insert(&entry{
 			elems:   []rawCE{{w: a.weights}},
 			str:     a.name,
 			exclude: true,
 			logical: a.typ,
 		})
 	}
 	return root
 }

 // patchForInsert eliminates entries from the list with more than one collation element.
 // The next and prev fields of the eliminated entries still point to appropriate
 // values in the newly created list.
 // It requires that sort has been called.
 //
 // Only the prev, next and level fields are rewritten; o.ordered itself keeps
 // all entries. Every bypassed entry gets skipRemove set, which makes a later
 // remove call leave the links untouched.
 func (o *ordering) patchForInsert() {
 	for i := 0; i < len(o.ordered)-1; {
 		// e is the entry that the bypassed run gets attached to.
 		e := o.ordered[i]
 		lev := e.level
 		n := e.next
 		// Advance n past the run of multi-element entries following e,
 		// keeping track of the smallest level seen along the way.
 		for ; n != nil && len(n.elems) > 1; n = n.next {
 			if n.level < lev {
 				lev = n.level
 			}
 			n.skipRemove = true
 		}
 		// Point e and every bypassed entry at n, and point each of their
 		// successors in o.ordered (the last one being n) back at e.
 		// NOTE(review): this loop only stops once o.ordered[i] == n, so it
 		// appears to rely on n being non-nil and present in o.ordered - confirm.
 		for ; o.ordered[i] != n; i++ {
 			o.ordered[i].level = lev
 			o.ordered[i].next = n
 			o.ordered[i+1].prev = e
 		}
 	}
 }

 // clone sorts o and returns a new ordering holding a fresh entry for every
 // entry of o. Only runes, elems, str, decompose, exclude and logical are
 // carried over; runes and elems are shared with the source entry, not copied.
 // The result is sorted and patched with patchForInsert.
 func (o *ordering) clone() *ordering {
 	o.sort()
 	dup := &ordering{
 		entryMap: make(map[string]*entry),
 	}
 	for _, src := range o.ordered {
 		dup.insert(&entry{
 			runes:     src.runes,
 			elems:     src.elems,
 			str:       src.str,
 			decompose: src.decompose,
 			exclude:   src.exclude,
 			logical:   src.logical,
 		})
 	}
 	dup.sort() // link all ordering.
 	dup.patchForInsert()
 	return dup
 }

 // front returns the first entry that is to be indexed. sort() must have been
 // called before. It panics when the first listed entry has a predecessor.
 func (o *ordering) front() *entry {
 	head := o.ordered[0]
 	if head.prev != nil {
 		log.Panicf("unexpected first entry: %v", head)
 	}
 	// The head of the list is always a logical position, which is not
 	// indexed itself, so start looking behind it.
 	first, _ := head.nextIndexed()
 	return first
 }

 // sort orders the entries by their collation elements and then links each
 // pair of neighbours through next and prev, storing the level at which the
 // pair differs in the level field of the first of the two.
 func (o *ordering) sort() {
 	sort.Sort(sortedEntries(o.ordered))
 	entries := o.ordered
 	for i := 0; i+1 < len(entries); i++ {
 		cur, nxt := entries[i], entries[i+1]
 		cur.next = nxt
 		_, cur.level = compareWeights(cur.elems, nxt.elems)
 		nxt.prev = cur
 	}
 }

 // genColElems concatenates the collation elements of every rune in str,
 // leaving out elements whose first three weights are all zero. It assumes
 // that all collation elements have already been added to the Builder. The
 // result is never nil.
 func (o *ordering) genColElems(str string) []rawCE {
 	result := []rawCE{}
 	for _, r := range str {
 		for _, ce := range o.find(string(r)).elems {
 			if ce.w[0] == 0 && ce.w[1] == 0 && ce.w[2] == 0 {
 				continue
 			}
 			result = append(result, ce)
 		}
 	}
 	return result
 }
	// Copyright 2012 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package build

	import (
	"fmt"
	"log"
	"sort"
	"strings"
	"unicode"

	"golang.org/x/text/internal/colltab"
	"golang.org/x/text/unicode/norm"
	)

	// logicalAnchor tells whether an entry is a logical reset position and, if
	// so, on which side it sorts relative to entries with equal weights (see
	// entryLess).
	type logicalAnchor int

	// Every constant carries the logicalAnchor type explicitly. In a const group
	// an explicit value does not inherit the type of the preceding line, so
	// leaving the type off would silently produce untyped integer constants.
	const (
		firstAnchor logicalAnchor = -1 // anchor that sorts before entries with equal weights
		noAnchor    logicalAnchor = 0  // regular entry, not an anchor
		lastAnchor  logicalAnchor = 1  // anchor that sorts after entries with equal weights
	)

	// entry tracks one entry of the collation element table while the table is
	// being built. The Default Unicode Collation Element Table contains examples
	// of such entries; see https://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
	type entry struct {
		str    string  // identical to string(runes)
		runes  []rune  // more than one rune makes the entry a contraction
		elems  []rawCE // the collation elements; cleared by remove
		extend string  // weights of extend are to be appended to elems
		before bool    // weights are relative to the next entry instead of the previous
		lock   bool    // entry is used in an extension and can no longer be moved

		// The links and level below are used to keep track of tailorings.
		prev       *entry        // preceding entry in the sorted chain
		next       *entry        // following entry in the sorted chain
		level      colltab.Level // next differs from this entry at this level
		skipRemove bool          // remove leaves the links untouched and resets this flag

		decompose bool          // elems can be generated using NFKD decomposition
		exclude   bool          // entry is not included in the table
		implicit  bool          // derived; not included in the list
		modified  bool          // entry was modified in tailoring
		logical   logicalAnchor // noAnchor unless the entry is a logical anchor

		expansionIndex    int      // index into the expansion table
		contractionHandle ctHandle // a non-zero n marks a contraction starter
		contractionIndex  int      // index into the contraction elements
	}

	// String renders the runes, string form, collation elements and the
	// contraction/expansion bookkeeping of e for diagnostic output.
	func (e *entry) String() string {
		const format = "%X (%q) -> %X (ch:%x; ci:%d, ei:%d)"
		return fmt.Sprintf(format, e.runes, e.str, e.elems,
			e.contractionHandle, e.contractionIndex, e.expansionIndex)
	}

	// skip reports whether e must not be encoded as a collation element of its
	// own (encode aborts for such entries). This holds exactly for contractions,
	// i.e. entries made up of more than one rune.
	func (e *entry) skip() bool {
		return len(e.runes) > 1
	}

	// expansion reports whether e has more than one collation element and is not
	// flagged as decomposable.
	func (e *entry) expansion() bool {
		if e.decompose {
			return false
		}
		return len(e.elems) > 1
	}

	// contraction reports whether e consists of at least two runes.
	func (e *entry) contraction() bool {
		return len(e.runes) >= 2
	}

	// contractionStarter reports whether the contraction handle of e has a
	// non-zero n.
	func (e *entry) contractionStarter() bool {
		handle := e.contractionHandle
		return handle.n != 0
	}

	// nextIndexed gets the next entry that needs to be stored in the table.
	// It returns the entry and the collation level at which the next entry differs
	// from the current entry.
	// Entries that can be explicitly derived and logical reset positions are
	// examples of entries that will not be indexed.
	func (e entry) nextIndexed() (entry, colltab.Level) {
	level := e.level
	for e = e.next; e != nil && (e.exclude \|\| len(e.elems) == 0); e = e.next {
	if e.level < level {
	level = e.level
	}
	}
	return e, level
	}

	// remove clears the collation elements of e and, unless skipRemove is set,
	// takes e out of the sorted chain by connecting its neighbours to each other.
	// skipRemove is always reset. Logical anchors may not be removed; as the
	// front and the end of the list are always logical anchors, e is never at
	// either end of the list.
	func (e *entry) remove() {
		if e.logical != noAnchor {
			log.Fatalf("may not remove anchor %q", e.str)
		}
		// TODO: need to set e.prev.level to e.level if e.level is smaller?
		e.elems = nil

		unlink := !e.skipRemove
		e.skipRemove = false
		if !unlink {
			return
		}
		if p := e.prev; p != nil {
			p.next = e.next
		}
		if n := e.next; n != nil {
			n.prev = e.prev
		}
	}

	// insertAfter removes n from its current position and links it in directly
	// behind e. It panics when e is nil or when e and n are the same entry.
	// Both e and n are pointers: the nil check and the link updates below only
	// make sense on shared entries, not on copies.
	func (e *entry) insertAfter(n *entry) {
		switch {
		case e == n:
			panic("e == anchor")
		case e == nil:
			panic("unexpected nil anchor")
		}
		n.remove()
		n.decompose = false // redo decomposition test

		// Read the successor only now: removing n may have changed e.next.
		succ := e.next
		n.prev, n.next = e, succ
		if succ != nil {
			succ.prev = n
		}
		e.next = n
	}

	// insertBefore removes n from its current position and links it in directly
	// in front of e. It panics when e is nil or when e and n are the same entry.
	// Both e and n are pointers: the nil check and the link updates below only
	// make sense on shared entries, not on copies.
	func (e *entry) insertBefore(n *entry) {
		switch {
		case e == n:
			panic("e == anchor")
		case e == nil:
			panic("unexpected nil anchor")
		}
		n.remove()
		n.decompose = false // redo decomposition test

		// Read the predecessor only now: removing n may have changed e.prev.
		pred := e.prev
		n.prev, n.next = pred, e
		if pred != nil {
			pred.next = n
		}
		e.prev = n
	}

	// encodeBase encodes e either as an index into the expansion table, when e is
	// an expansion, or as a collation element built from its first element.
	// Entries flagged for decomposition are not handled here and abort the
	// program.
	func (e *entry) encodeBase() (ce uint32, err error) {
		if e.expansion() {
			return makeExpandIndex(e.expansionIndex)
		}
		if e.decompose {
			log.Fatal("decompose should be handled elsewhere")
		}
		return makeCE(e.elems[0])
	}

	// encode returns the encoded collation element for e. Decomposable entries
	// are encoded from the tertiary weights (w[2]) of their first two elements,
	// contraction starters as an index into the contraction data, and everything
	// else through encodeBase. Entries for which skip reports true abort the
	// program.
	func (e *entry) encode() (ce uint32, err error) {
		if e.skip() {
			log.Fatal("cannot build colElem for entry that should be skipped")
		}
		if e.decompose {
			first := e.elems[0].w[2]
			second := 0 // stays zero when there is no second element
			if len(e.elems) > 1 {
				second = e.elems[1].w[2]
			}
			return makeDecompose(first, second)
		}
		if e.contractionStarter() {
			return makeContractIndex(e.contractionHandle, e.contractionIndex)
		}
		if len(e.runes) > 1 {
			log.Fatal("colElem: contractions are handled in contraction trie")
		}
		return e.encodeBase()
	}

	// entryLess reports whether a sorts before b. The collation weights decide
	// first. For equal weights a first anchor sorts in front and a last anchor
	// sorts behind, with a being inspected before b. Otherwise the string forms
	// are compared.
	func entryLess(a, b *entry) bool {
		res, _ := compareWeights(a.elems, b.elems)
		switch {
		case res != 0:
			return res == -1
		case a.logical != noAnchor:
			return a.logical == firstAnchor
		case b.logical != noAnchor:
			return b.logical == lastAnchor
		default:
			return a.str < b.str
		}
	}

	// sortedEntries implements sort.Interface for a slice of entries, ordering
	// them with entryLess.
	type sortedEntries []*entry

	// Len returns the number of entries.
	func (es sortedEntries) Len() int { return len(es) }

	// Swap exchanges the entries at positions i and j.
	func (es sortedEntries) Swap(i, j int) {
		es[j], es[i] = es[i], es[j]
	}

	// Less reports whether the entry at position i sorts before the one at j.
	func (es sortedEntries) Less(i, j int) bool {
		a, b := es[i], es[j]
		return entryLess(a, b)
	}

	// ordering holds a set of entries, both indexed by key and as a list.
	type ordering struct {
		// NOTE(review): presumably an identifier for this ordering; it is not
		// used in this file - confirm against callers.
		id string
		// entryMap indexes entries by their string. Logical anchors are indexed
		// under their "[name]" and "<name/>" keys instead.
		entryMap map[string]*entry
		// ordered lists all inserted entries. insert only appends, so the list
		// is in sorted order only after sort has been called.
		ordered []*entry
		// NOTE(review): handle is not used in this file; its role is defined by
		// trieHandle elsewhere - confirm.
		handle *trieHandle
	}

	// insert registers e in entryMap and appends it to ordered. Regular entries
	// are keyed by their string. Logical anchors are keyed twice: once in the
	// "[name]" format used in UCA rules and once in the "<name/>" XML format,
	// with spaces replaced by underscores.
	// As e is only appended, o.sort() has to be called to reattain a sorted
	// order.
	//
	// The receiver is a pointer so that the append to o.ordered is visible to
	// the caller, and e is a pointer because entryMap and ordered store *entry.
	func (o *ordering) insert(e *entry) {
		if e.logical != noAnchor {
			ucaKey := "[" + e.str + "]"
			xmlKey := "<" + strings.Replace(e.str, " ", "_", -1) + "/>"
			o.entryMap[ucaKey] = e
			o.entryMap[xmlKey] = e
		} else {
			o.entryMap[e.str] = e
		}
		o.ordered = append(o.ordered, e)
	}

	// newEntry builds an entry for string s with collation elements ces, inserts
	// it into the index and returns it. The returned pointer refers to the very
	// entry that was inserted, so later changes by the caller are seen through
	// the index as well.
	func (o *ordering) newEntry(s string, ces []rawCE) *entry {
		e := new(entry)
		e.str = s
		e.runes = []rune(s)
		e.elems = ces
		o.insert(e)
		return e
	}

	// find returns the entry indexed under str. When there is none and str is a
	// single rune, an implicit entry is created, inserted and marked as excluded
	// from the table: Hangul syllables get the concatenated elements of their NFD
	// decomposition, any other rune gets implicit weights derived from the rune.
	// find returns nil if str is not indexed and does not consist of exactly one
	// rune.
	func (o *ordering) find(str string) *entry {
		if found := o.entryMap[str]; found != nil {
			return found
		}
		runes := []rune(str)
		if len(runes) != 1 {
			return nil
		}
		const (
			firstHangul = 0xAC00
			lastHangul  = 0xD7A3
		)
		r := runes[0]
		var e *entry
		if firstHangul <= r && r <= lastHangul {
			ces := []rawCE{}
			nfd := norm.NFD.String(str)
			for _, part := range nfd {
				ces = append(ces, o.find(string(part)).elems...)
			}
			e = o.newEntry(nfd, ces)
		} else {
			weights := []int{
				implicitPrimary(r),
				defaultSecondary,
				defaultTertiary,
				int(r),
			}
			e = o.newEntry(string(r), []rawCE{{w: weights}})
			e.modified = true
		}
		e.exclude = true // do not index implicits
		return e
	}

	// makeRootOrdering creates an ordering that contains only the logical reset
	// points that can serve as anchors. All of them are excluded from the table.
	// The anchors first_tertiary_ignorable and __END__ always sort at the
	// beginning and the end, respectively, so prev and next are non-nil for any
	// indexed entry.
	func makeRootOrdering() ordering {
		const maxRune = unicode.MaxRune
		root := ordering{
			entryMap: make(map[string]*entry),
		}
		anchors := []struct {
			typ     logicalAnchor
			name    string
			weights []int
		}{
			{firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0}},
			{lastAnchor, "last tertiary ignorable", []int{0, 0, 0, maxRune}},
			{lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, maxRune}},
			{lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, maxRune}},
			{lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, maxRune}},
		}
		for _, a := range anchors {
			root.insert(&entry{
				elems:   []rawCE{{w: a.weights}},
				str:     a.name,
				exclude: true,
				logical: a.typ,
			})
		}
		return root
	}

	// patchForInsert eliminates entries from the list with more than one collation element.
	// The next and prev fields of the eliminated entries still point to appropriate
	// values in the newly created list.
	// It requires that sort has been called.
	//
	// Only the prev, next and level fields are rewritten; o.ordered itself keeps
	// all entries. Every bypassed entry gets skipRemove set, which makes a later
	// remove call leave the links untouched.
	func (o *ordering) patchForInsert() {
	for i := 0; i < len(o.ordered)-1; {
	// e is the entry that the bypassed run gets attached to.
	e := o.ordered[i]
	lev := e.level
	n := e.next
	// Advance n past the run of multi-element entries following e,
	// keeping track of the smallest level seen along the way.
	for ; n != nil && len(n.elems) > 1; n = n.next {
	if n.level < lev {
	lev = n.level
	}
	n.skipRemove = true
	}
	// Point e and every bypassed entry at n, and point each of their
	// successors in o.ordered (the last one being n) back at e.
	// NOTE(review): this loop only stops once o.ordered[i] == n, so it
	// appears to rely on n being non-nil and present in o.ordered - confirm.
	for ; o.ordered[i] != n; i++ {
	o.ordered[i].level = lev
	o.ordered[i].next = n
	o.ordered[i+1].prev = e
	}
	}
	}

	// clone sorts o and returns a pointer to a new ordering holding a fresh
	// entry for every entry of o. Only runes, elems, str, decompose, exclude and
	// logical are carried over; runes and elems are shared with the source
	// entry, not copied. The result is sorted and patched with patchForInsert.
	func (o *ordering) clone() *ordering {
		o.sort()
		dup := &ordering{
			entryMap: make(map[string]*entry),
		}
		for _, src := range o.ordered {
			dup.insert(&entry{
				runes:     src.runes,
				elems:     src.elems,
				str:       src.str,
				decompose: src.decompose,
				exclude:   src.exclude,
				logical:   src.logical,
			})
		}
		dup.sort() // link all ordering.
		dup.patchForInsert()
		return dup
	}

	// front returns the first entry that is to be indexed, or nil when there is
	// none. sort() must have been called before. It panics when the first listed
	// entry has a predecessor.
	func (o *ordering) front() *entry {
		head := o.ordered[0]
		if head.prev != nil {
			log.Panicf("unexpected first entry: %v", head)
		}
		// The head of the list is always a logical position, which is not
		// indexed itself, so start looking behind it.
		first, _ := head.nextIndexed()
		return first
	}

	// sort orders the entries by their collation elements and then links each
	// pair of neighbours through next and prev, storing the level at which the
	// pair differs in the level field of the first of the two.
	func (o *ordering) sort() {
		sort.Sort(sortedEntries(o.ordered))
		entries := o.ordered
		for i := 0; i+1 < len(entries); i++ {
			cur, nxt := entries[i], entries[i+1]
			cur.next = nxt
			_, cur.level = compareWeights(cur.elems, nxt.elems)
			nxt.prev = cur
		}
	}

	// genColElems generates a collation element array from the runes in str. This
	// assumes that all collation elements have already been added to the Builder.
	func (o *ordering) genColElems(str string) []rawCE {
	elems := []rawCE{}
	for _, r := range []rune(str) {
	for _, ce := range o.find(string(r)).elems {
	if ce.w[0] != 0 \|\| ce.w[1] != 0 \|\| ce.w[2] != 0 {
	elems = append(elems, ce)
	}
	}
	}
	return elems
	}