collate/colltab/table.go - text - Git at Google

 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package colltab

 import (
 	"unicode/utf8"

 	"golang.org/x/text/unicode/norm"
 )

 // table holds all collation data for a given collation ordering.
 type table struct {
 	index trie // main trie

 	// expansion info
 	expandElem []uint32

 	// contraction info
 	contractTries  contractTrieSet
 	contractElem   []uint32
 	maxContractLen int
 	variableTop    uint32
 }

 func (t *table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
 	return t.appendNext(w, source{bytes: b})
 }

 func (t *table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
 	return t.appendNext(w, source{str: s})
 }

 func (t *table) Start(p int, b []byte) int {
 	// TODO: implement
 	panic("not implemented")
 }

 func (t *table) StartString(p int, s string) int {
 	// TODO: implement
 	panic("not implemented")
 }

 func (t *table) Domain() []string {
 	// TODO: implement
 	panic("not implemented")
 }

 func (t *table) Top() uint32 {
 	return t.variableTop
 }

 type source struct {
 	str   string
 	bytes []byte
 }

 func (src *source) lookup(t *table) (ce Elem, sz int) {
 	if src.bytes == nil {
 		return t.index.lookupString(src.str)
 	}
 	return t.index.lookup(src.bytes)
 }

 func (src *source) tail(sz int) {
 	if src.bytes == nil {
 		src.str = src.str[sz:]
 	} else {
 		src.bytes = src.bytes[sz:]
 	}
 }

 func (src *source) nfd(buf []byte, end int) []byte {
 	if src.bytes == nil {
 		return norm.NFD.AppendString(buf[:0], src.str[:end])
 	}
 	return norm.NFD.Append(buf[:0], src.bytes[:end]...)
 }

 func (src *source) rune() (r rune, sz int) {
 	if src.bytes == nil {
 		return utf8.DecodeRuneInString(src.str)
 	}
 	return utf8.DecodeRune(src.bytes)
 }

 func (src *source) properties(f norm.Form) norm.Properties {
 	if src.bytes == nil {
 		return f.PropertiesString(src.str)
 	}
 	return f.Properties(src.bytes)
 }

 // appendNext appends the weights corresponding to the next rune or
 // contraction in s.  If a contraction is matched to a discontinuous
 // sequence of runes, the weights for the interstitial runes are
 // appended as well.  It returns a new slice that includes the appended
 // weights and the number of bytes consumed from s.
 func (t *table) appendNext(w []Elem, src source) (res []Elem, n int) {
 	ce, sz := src.lookup(t)
 	tp := ce.ctype()
 	if tp == ceNormal {
 		if ce == 0 {
 			r, _ := src.rune()
 			const (
 				hangulSize  = 3
 				firstHangul = 0xAC00
 				lastHangul  = 0xD7A3
 			)
 			if r >= firstHangul && r <= lastHangul {
 				// TODO: performance can be considerably improved here.
 				n = sz
 				var buf [16]byte // Used for decomposing Hangul.
 				for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
 					ce, sz = t.index.lookup(b)
 					w = append(w, ce)
 				}
 				return w, n
 			}
 			ce = makeImplicitCE(implicitPrimary(r))
 		}
 		w = append(w, ce)
 	} else if tp == ceExpansionIndex {
 		w = t.appendExpansion(w, ce)
 	} else if tp == ceContractionIndex {
 		n := 0
 		src.tail(sz)
 		if src.bytes == nil {
 			w, n = t.matchContractionString(w, ce, src.str)
 		} else {
 			w, n = t.matchContraction(w, ce, src.bytes)
 		}
 		sz += n
 	} else if tp == ceDecompose {
 		// Decompose using NFKD and replace tertiary weights.
 		t1, t2 := splitDecompose(ce)
 		i := len(w)
 		nfkd := src.properties(norm.NFKD).Decomposition()
 		for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
 			w, p = t.appendNext(w, source{bytes: nfkd})
 		}
 		w[i] = w[i].updateTertiary(t1)
 		if i++; i < len(w) {
 			w[i] = w[i].updateTertiary(t2)
 			for i++; i < len(w); i++ {
 				w[i] = w[i].updateTertiary(maxTertiary)
 			}
 		}
 	}
 	return w, sz
 }

 func (t *table) appendExpansion(w []Elem, ce Elem) []Elem {
 	i := splitExpandIndex(ce)
 	n := int(t.expandElem[i])
 	i++
 	for _, ce := range t.expandElem[i : i+n] {
 		w = append(w, Elem(ce))
 	}
 	return w
 }

 func (t *table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
 	index, n, offset := splitContractIndex(ce)

 	scan := t.contractTries.scanner(index, n, suffix)
 	buf := [norm.MaxSegmentSize]byte{}
 	bufp := 0
 	p := scan.scan(0)

 	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
 		// By now we should have filtered most cases.
 		p0 := p
 		bufn := 0
 		rune := norm.NFD.Properties(suffix[p:])
 		p += rune.Size()
 		if rune.LeadCCC() != 0 {
 			prevCC := rune.TrailCCC()
 			// A gap may only occur in the last normalization segment.
 			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
 			if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
 				scan.s = suffix[:p+end]
 			}
 			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
 				rune = norm.NFD.Properties(suffix[p:])
 				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
 					break
 				}
 				prevCC = rune.TrailCCC()
 				if pp := scan.scan(p); pp != p {
 					// Copy the interstitial runes for later processing.
 					bufn += copy(buf[bufn:], suffix[p0:p])
 					if scan.pindex == pp {
 						bufp = bufn
 					}
 					p, p0 = pp, pp
 				} else {
 					p += rune.Size()
 				}
 			}
 		}
 	}
 	// Append weights for the matched contraction, which may be an expansion.
 	i, n := scan.result()
 	ce = Elem(t.contractElem[i+offset])
 	if ce.ctype() == ceNormal {
 		w = append(w, ce)
 	} else {
 		w = t.appendExpansion(w, ce)
 	}
 	// Append weights for the runes in the segment not part of the contraction.
 	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
 		w, p = t.appendNext(w, source{bytes: b})
 	}
 	return w, n
 }

 // TODO: unify the two implementations. This is best done after first simplifying
 // the algorithm taking into account the inclusion of both NFC and NFD forms
 // in the table.
 func (t *table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
 	index, n, offset := splitContractIndex(ce)

 	scan := t.contractTries.scannerString(index, n, suffix)
 	buf := [norm.MaxSegmentSize]byte{}
 	bufp := 0
 	p := scan.scan(0)

 	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
 		// By now we should have filtered most cases.
 		p0 := p
 		bufn := 0
 		rune := norm.NFD.PropertiesString(suffix[p:])
 		p += rune.Size()
 		if rune.LeadCCC() != 0 {
 			prevCC := rune.TrailCCC()
 			// A gap may only occur in the last normalization segment.
 			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
 			if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
 				scan.s = suffix[:p+end]
 			}
 			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
 				rune = norm.NFD.PropertiesString(suffix[p:])
 				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
 					break
 				}
 				prevCC = rune.TrailCCC()
 				if pp := scan.scan(p); pp != p {
 					// Copy the interstitial runes for later processing.
 					bufn += copy(buf[bufn:], suffix[p0:p])
 					if scan.pindex == pp {
 						bufp = bufn
 					}
 					p, p0 = pp, pp
 				} else {
 					p += rune.Size()
 				}
 			}
 		}
 	}
 	// Append weights for the matched contraction, which may be an expansion.
 	i, n := scan.result()
 	ce = Elem(t.contractElem[i+offset])
 	if ce.ctype() == ceNormal {
 		w = append(w, ce)
 	} else {
 		w = t.appendExpansion(w, ce)
 	}
 	// Append weights for the runes in the segment not part of the contraction.
 	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
 		w, p = t.appendNext(w, source{bytes: b})
 	}
 	return w, n
 }
	// Copyright 2012 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package colltab

	import (
	"unicode/utf8"

	"golang.org/x/text/unicode/norm"
	)

	// table holds all collation data for a given collation ordering.
	type table struct {
	index trie // main trie

	// expansion info
	expandElem []uint32

	// contraction info
	contractTries contractTrieSet
	contractElem []uint32
	maxContractLen int
	variableTop uint32
	}

	func (t *table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
	return t.appendNext(w, source{bytes: b})
	}

	func (t *table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
	return t.appendNext(w, source{str: s})
	}

	func (t *table) Start(p int, b []byte) int {
	// TODO: implement
	panic("not implemented")
	}

	func (t *table) StartString(p int, s string) int {
	// TODO: implement
	panic("not implemented")
	}

	func (t *table) Domain() []string {
	// TODO: implement
	panic("not implemented")
	}

	func (t *table) Top() uint32 {
	return t.variableTop
	}

	type source struct {
	str string
	bytes []byte
	}

	func (src source) lookup(t table) (ce Elem, sz int) {
	if src.bytes == nil {
	return t.index.lookupString(src.str)
	}
	return t.index.lookup(src.bytes)
	}

	func (src *source) tail(sz int) {
	if src.bytes == nil {
	src.str = src.str[sz:]
	} else {
	src.bytes = src.bytes[sz:]
	}
	}

	func (src *source) nfd(buf []byte, end int) []byte {
	if src.bytes == nil {
	return norm.NFD.AppendString(buf[:0], src.str[:end])
	}
	return norm.NFD.Append(buf[:0], src.bytes[:end]...)
	}

	func (src *source) rune() (r rune, sz int) {
	if src.bytes == nil {
	return utf8.DecodeRuneInString(src.str)
	}
	return utf8.DecodeRune(src.bytes)
	}

	func (src *source) properties(f norm.Form) norm.Properties {
	if src.bytes == nil {
	return f.PropertiesString(src.str)
	}
	return f.Properties(src.bytes)
	}

	// appendNext appends the weights corresponding to the next rune or
	// contraction in s. If a contraction is matched to a discontinuous
	// sequence of runes, the weights for the interstitial runes are
	// appended as well. It returns a new slice that includes the appended
	// weights and the number of bytes consumed from s.
	func (t *table) appendNext(w []Elem, src source) (res []Elem, n int) {
	ce, sz := src.lookup(t)
	tp := ce.ctype()
	if tp == ceNormal {
	if ce == 0 {
	r, _ := src.rune()
	const (
	hangulSize = 3
	firstHangul = 0xAC00
	lastHangul = 0xD7A3
	)
	if r >= firstHangul && r <= lastHangul {
	// TODO: performance can be considerably improved here.
	n = sz
	var buf [16]byte // Used for decomposing Hangul.
	for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
	ce, sz = t.index.lookup(b)
	w = append(w, ce)
	}
	return w, n
	}
	ce = makeImplicitCE(implicitPrimary(r))
	}
	w = append(w, ce)
	} else if tp == ceExpansionIndex {
	w = t.appendExpansion(w, ce)
	} else if tp == ceContractionIndex {
	n := 0
	src.tail(sz)
	if src.bytes == nil {
	w, n = t.matchContractionString(w, ce, src.str)
	} else {
	w, n = t.matchContraction(w, ce, src.bytes)
	}
	sz += n
	} else if tp == ceDecompose {
	// Decompose using NFKD and replace tertiary weights.
	t1, t2 := splitDecompose(ce)
	i := len(w)
	nfkd := src.properties(norm.NFKD).Decomposition()
	for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
	w, p = t.appendNext(w, source{bytes: nfkd})
	}
	w[i] = w[i].updateTertiary(t1)
	if i++; i < len(w) {
	w[i] = w[i].updateTertiary(t2)
	for i++; i < len(w); i++ {
	w[i] = w[i].updateTertiary(maxTertiary)
	}
	}
	}
	return w, sz
	}

	func (t *table) appendExpansion(w []Elem, ce Elem) []Elem {
	i := splitExpandIndex(ce)
	n := int(t.expandElem[i])
	i++
	for _, ce := range t.expandElem[i : i+n] {
	w = append(w, Elem(ce))
	}
	return w
	}

	func (t *table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
	index, n, offset := splitContractIndex(ce)

	scan := t.contractTries.scanner(index, n, suffix)
	buf := [norm.MaxSegmentSize]byte{}
	bufp := 0
	p := scan.scan(0)

	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
	// By now we should have filtered most cases.
	p0 := p
	bufn := 0
	rune := norm.NFD.Properties(suffix[p:])
	p += rune.Size()
	if rune.LeadCCC() != 0 {
	prevCC := rune.TrailCCC()
	// A gap may only occur in the last normalization segment.
	// This also ensures that len(scan.s) < norm.MaxSegmentSize.
	if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
	scan.s = suffix[:p+end]
	}
	for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
	rune = norm.NFD.Properties(suffix[p:])
	if ccc := rune.LeadCCC(); ccc == 0 \|\| prevCC >= ccc {
	break
	}
	prevCC = rune.TrailCCC()
	if pp := scan.scan(p); pp != p {
	// Copy the interstitial runes for later processing.
	bufn += copy(buf[bufn:], suffix[p0:p])
	if scan.pindex == pp {
	bufp = bufn
	}
	p, p0 = pp, pp
	} else {
	p += rune.Size()
	}
	}
	}
	}
	// Append weights for the matched contraction, which may be an expansion.
	i, n := scan.result()
	ce = Elem(t.contractElem[i+offset])
	if ce.ctype() == ceNormal {
	w = append(w, ce)
	} else {
	w = t.appendExpansion(w, ce)
	}
	// Append weights for the runes in the segment not part of the contraction.
	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
	w, p = t.appendNext(w, source{bytes: b})
	}
	return w, n
	}

	// TODO: unify the two implementations. This is best done after first simplifying
	// the algorithm taking into account the inclusion of both NFC and NFD forms
	// in the table.
	func (t *table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
	index, n, offset := splitContractIndex(ce)

	scan := t.contractTries.scannerString(index, n, suffix)
	buf := [norm.MaxSegmentSize]byte{}
	bufp := 0
	p := scan.scan(0)

	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
	// By now we should have filtered most cases.
	p0 := p
	bufn := 0
	rune := norm.NFD.PropertiesString(suffix[p:])
	p += rune.Size()
	if rune.LeadCCC() != 0 {
	prevCC := rune.TrailCCC()
	// A gap may only occur in the last normalization segment.
	// This also ensures that len(scan.s) < norm.MaxSegmentSize.
	if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
	scan.s = suffix[:p+end]
	}
	for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
	rune = norm.NFD.PropertiesString(suffix[p:])
	if ccc := rune.LeadCCC(); ccc == 0 \|\| prevCC >= ccc {
	break
	}
	prevCC = rune.TrailCCC()
	if pp := scan.scan(p); pp != p {
	// Copy the interstitial runes for later processing.
	bufn += copy(buf[bufn:], suffix[p0:p])
	if scan.pindex == pp {
	bufp = bufn
	}
	p, p0 = pp, pp
	} else {
	p += rune.Size()
	}
	}
	}
	}
	// Append weights for the matched contraction, which may be an expansion.
	i, n := scan.result()
	ce = Elem(t.contractElem[i+offset])
	if ce.ctype() == ceNormal {
	w = append(w, ce)
	} else {
	w = t.appendExpansion(w, ce)
	}
	// Append weights for the runes in the segment not part of the contraction.
	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
	w, p = t.appendNext(w, source{bytes: b})
	}
	return w, n
	}