collate/build/contract.go - text - Git at Google

 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package build

 import (
 	"fmt"
 	"io"
 	"reflect"
 	"sort"
 	"strings"

 	"golang.org/x/text/internal/colltab"
 )

 // This file contains code for detecting contractions and generating
 // the necessary tables.
 // Any Unicode Collation Algorithm (UCA) table entry that has more than
 // one rune on the left-hand side is called a contraction.
 // See https://www.unicode.org/reports/tr10/#Contractions for more details.
 //
 // We define the following terms:
 //   initial:     a rune that appears as the first rune in a contraction.
 //   suffix:      a sequence of runes succeeding the initial rune
 //                in a given contraction.
 //   non-initial: a rune that appears in a suffix.
 //
 // A rune may be both an initial and a non-initial and may be so in
 // many contractions.  An initial may typically also appear by itself.
 // In case of ambiguities, the UCA requires we match the longest
 // contraction.
 //
 // Many contraction rules share the same set of possible suffixes.
 // We store sets of suffixes in a trie that associates an index with
 // each suffix in the set.  This index can be used to look up a
 // collation element associated with the (starter rune, suffix) pair.
 //
 // The trie is defined on a UTF-8 byte sequence.
 // The overall trie is represented as an array of ctEntries.  Each node of the trie
 // is represented as a subsequence of ctEntries, where each entry corresponds to
 // a possible match of a next character in the search string.  An entry
 // also includes the length and offset to the next sequence of entries
 // to check in case of a match.

 // final is the value of ctEntry.N that marks an entry as final.
 const final = 0

 // noIndex is the value of ctEntry.I for an entry that does not, by itself,
 // complete a suffix.
 const noIndex = 0xFF

 // ctEntry associates to a matching byte an offset and/or next sequence of
 // bytes to check. A ctEntry c is called final if a match means that the
 // longest suffix has been found.  An entry c is final if c.N == 0.
 // A single final entry can match a range of characters to an offset.
 // A non-final entry always matches a single byte. Note that a non-final
 // entry might still resemble a completed suffix.
 // Examples:
 // The suffix strings "ab" and "ac" can be represented as:
 // []ctEntry{
 //     {'a', 1, 1, noIndex},  // 'a' by itself does not match, so I is noIndex.
 //     {'b', 'c', 0, 1},   // "ab" -> 1, "ac" -> 2
 // }
 //
 // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
 // []ctEntry{
 //     {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
 //     {'b', 1, 2, 1},    // "ab" -> 1, may be followed by 'c' or 'd'.
 //     {'d', 'd', final, 3},  // "abd" -> 3
 //     {'c', 4, 1, 2},    // "abc" -> 2, may be followed by 'd'.
 //     {'d', 'd', final, 4},  // "abcd" -> 4
 // }
 // See genStateTests in contract_test.go for more examples.
 type ctEntry struct {
 	// L is the byte value to match for a non-final entry, or the lowest
 	// byte of the matched range for a final entry.
 	L uint8
 	// H is the relative index to the next block for a non-final entry, or
 	// the highest byte of the matched range for a final entry.
 	H uint8
 	// N is the length of the next block for a non-final entry, or final
 	// for a final entry.
 	N uint8
 	// I is the result offset. It is noIndex if more bytes are needed to
 	// complete a suffix.
 	I uint8
 }

 // contractTrieSet holds a set of contraction tries. The tries are stored
 // consecutively in a single slice of entries.
 type contractTrieSet []struct {
 	l, h, n, i uint8
 }

 // ctHandle identifies one trie within a trie set.
 type ctHandle struct {
 	index int // offset of the trie in the entry array
 	n     int // size of the trie's first node
 }

 // appendTrie builds a trie for the given suffixes, appends it to the trie set
 // ct and returns a handle to it. If genStates fails, ct is cut back to its
 // previous length and the zero handle is returned together with the error.
 func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
 	entries := make([]stridx, 0, len(suffixes))
 	for _, suffix := range suffixes {
 		entries = append(entries, stridx{str: suffix})
 	}

 	// Result offsets are handed out in offsetSort order, starting at 1.
 	sort.Sort(offsetSort(entries))
 	for pos := range entries {
 		entries[pos].index = pos + 1
 	}

 	// genStates receives the suffixes in genidxSort order.
 	sort.Sort(genidxSort(entries))

 	base := len(*ct)
 	first, err := genStates(ct, entries)
 	if err == nil {
 		return ctHandle{index: base, n: first}, nil
 	}
 	// Discard whatever genStates appended before it failed.
 	*ct = (*ct)[:base]
 	return ctHandle{}, err
 }

 // genStates appends to *ct the entries of the trie node for the suffixes in
 // sis, followed by the entries of all nodes below it, and returns the number
 // of entries in that first node.
 //
 // It returns an error if sis is empty or if the relative offset to a child
 // block does not fit in a uint8. Entries appended before an error is detected
 // are left in *ct; appendTrie truncates them. The str fields of the elements
 // of sis are modified in place while descending. appendTrie passes sis
 // ordered by genidxSort.
 func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
 	if len(sis) == 0 {
 		return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
 	}
 	// Index of the first entry of the node being generated.
 	start := len(*ct)
 	// create entries for differing first bytes.
 	for _, si := range sis {
 		s := si.str
 		if len(s) == 0 {
 			continue
 		}
 		added := false
 		c := s[0]
 		if len(s) > 1 {
 			// s continues past c: add a non-final entry for c unless this
 			// node already has an entry whose L is c.
 			for j := len(*ct) - 1; j >= start; j-- {
 				if (*ct)[j].L == c {
 					added = true
 					break
 				}
 			}
 			if !added {
 				// H and N stay zero here; they are filled in by the loop below.
 				*ct = append(*ct, ctEntry{L: c, I: noIndex})
 			}
 		} else {
 			// s ends at c: record its offset on an existing entry, or add a
 			// final entry for it.
 			for j := len(*ct) - 1; j >= start; j-- {
 				// Update the offset for longer suffixes with the same byte.
 				if (*ct)[j].L == c {
 					(*ct)[j].I = uint8(si.index)
 					added = true
 				}
 				// Extend range of final ctEntry, if possible.
 				if (*ct)[j].H+1 == c {
 					(*ct)[j].H = c
 					added = true
 				}
 			}
 			if !added {
 				*ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
 			}
 		}
 	}
 	// Number of entries in this node.
 	n := len(*ct) - start
 	// Append nodes for the remainder of the suffixes for each ctEntry.
 	sp := 0
 	for i, end := start, len(*ct); i < end; i++ {
 		fe := (*ct)[i]
 		if fe.H == 0 { // uninitialized non-final
 			// Number of entries appended so far after this node's own
 			// entries; stored in H as the relative index of the child block.
 			ln := len(*ct) - start - n
 			if ln > 0xFF {
 				return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
 			}
 			fe.H = uint8(ln)
 			// Find first non-final strings with same byte as current entry.
 			for ; sis[sp].str[0] != fe.L; sp++ {
 			}
 			// Extend se over the following strings that are longer than one
 			// byte and start with the same byte.
 			se := sp + 1
 			for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
 			}
 			sl := sis[sp:se]
 			sp = se
 			// Strip the leading byte. sl aliases sis, so this changes the
 			// caller's elements.
 			for i, si := range sl {
 				sl[i].str = si.str[1:]
 			}
 			nn, err := genStates(ct, sl)
 			if err != nil {
 				return 0, err
 			}
 			fe.N = uint8(nn)
 			// fe is a copy; store the completed entry back.
 			(*ct)[i] = fe
 		}
 	}
 	// Order this node's entries as defined by entrySort.
 	sort.Sort(entrySort((*ct)[start : start+n]))
 	return n, nil
 }

 // entrySort orders the entries of a node by decreasing L.
 //
 // There may be both a final and non-final entry for a byte if the byte
 // is implied in a range of matches in the final entry.
 // We need to ensure that the non-final entry comes first in that case.
 type entrySort colltab.ContractTrieSet

 // Len returns the number of entries.
 func (fe entrySort) Len() int {
 	return len(fe)
 }

 // Swap exchanges the entries at positions i and j.
 func (fe entrySort) Swap(i, j int) {
 	tmp := fe[i]
 	fe[i] = fe[j]
 	fe[j] = tmp
 }

 // Less reports whether entry i has a larger L than entry j.
 func (fe entrySort) Less(i, j int) bool {
 	return fe[j].L < fe[i].L
 }

 // stridx pairs a suffix with its associated offset so that the two can be
 // sorted together.
 type stridx struct {
 	// str is the suffix.
 	str string
 	// index is the offset associated with str.
 	index int
 }

 // offsetSort is the order used for computing the offsets: longer strings
 // come first, and strings of equal length are in increasing string order.
 // This ensures that strings that only differ in the last byte by 1
 // are sorted consecutively in increasing order such that they can
 // be packed as a range in a final ctEntry.
 type offsetSort []stridx

 // Len returns the number of elements.
 func (si offsetSort) Len() int {
 	return len(si)
 }

 // Swap exchanges the elements at positions i and j.
 func (si offsetSort) Swap(i, j int) {
 	tmp := si[i]
 	si[i] = si[j]
 	si[j] = tmp
 }

 // Less reports whether element i sorts before element j.
 func (si offsetSort) Less(i, j int) bool {
 	a, b := si[i].str, si[j].str
 	if len(a) == len(b) {
 		return a < b
 	}
 	return len(a) > len(b)
 }

 // genidxSort is the order used for indexing: string order, except that when
 // one string is a prefix of another the longer string comes first.
 type genidxSort []stridx

 // Len returns the number of elements.
 func (si genidxSort) Len() int {
 	return len(si)
 }

 // Swap exchanges the elements at positions i and j.
 func (si genidxSort) Swap(i, j int) {
 	tmp := si[i]
 	si[i] = si[j]
 	si[j] = tmp
 }

 // Less reports whether element i sorts before element j.
 func (si genidxSort) Less(i, j int) bool {
 	a, b := si[i].str, si[j].str
 	switch {
 	case strings.HasPrefix(b, a):
 		// a is a prefix of b (or equal to it): a does not come first.
 		return false
 	case strings.HasPrefix(a, b):
 		// a extends b: the longer string comes first.
 		return true
 	default:
 		return a < b
 	}
 }

 // lookup matches the longest suffix in str against the trie identified by h
 // and returns the associated offset and the number of bytes consumed.
 // Both results are zero if no suffix matches.
 func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
 	node := (*ct)[h.index:] // entries starting at the current node
 	width := h.n            // number of entries in the current node
 	pos := 0                // number of bytes of str consumed so far

 	for k := 0; k < width && pos < len(str); {
 		entry, b := node[k], str[pos]
 		switch {
 		case b == entry.L:
 			pos++
 			if entry.I != noIndex {
 				// Remember the longest completed suffix seen so far.
 				index, ns = int(entry.I), pos
 			}
 			if entry.N == final {
 				return index, ns
 			}
 			// Descend: H is relative to the end of the current node.
 			node = node[int(entry.H)+width:]
 			width = int(entry.N)
 			k = 0
 		case b > entry.L && entry.N == final && b <= entry.H:
 			// b falls inside the range of a final entry.
 			return int(b-entry.L) + int(entry.I), pos + 1
 		default:
 			k++
 		}
 	}
 	return index, ns
 }

 // print writes the trie set t as compilable Go code to w: first the entry
 // array produced by printArray, then a declaration of the variable
 // <name>ContractTrieSet. It returns the total number of bytes written, the
 // size of the resulting data structure in bytes, and the first error
 // encountered, if any.
 func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
 	// tally folds the outcome of one output step into the results. Only the
 	// first error is kept; later steps still run.
 	tally := func(written, sz int, stepErr error) {
 		n += written
 		size += sz
 		if err == nil {
 			err = stepErr
 		}
 	}

 	tally(printArray(*t, w, name))
 	written, stepErr := fmt.Fprintf(w, "var %sContractTrieSet = ", name)
 	tally(written, 0, stepErr)
 	tally(printStruct(*t, w, name))
 	written, stepErr = fmt.Fprintln(w)
 	tally(written, 0, stepErr)
 	return n, size, err
 }

 // printArray writes the entries of ct to w as a Go array declaration named
 // <name>CTEntries. It returns the number of bytes written, the size of the
 // array in bytes (4 per entry) and the first write error, if any.
 func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
 	size = 4 * len(ct)

 	// emit writes one formatted chunk, accumulating the byte count and
 	// keeping only the first error.
 	emit := func(format string, args ...interface{}) {
 		written, werr := fmt.Fprintf(w, format, args...)
 		n += written
 		if err == nil {
 			err = werr
 		}
 	}

 	emit("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
 	emit("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
 	for k := range ct {
 		emit("\t{0x%X, 0x%X, %d, %d},\n", ct[k].L, ct[k].H, ct[k].N, ct[k].I)
 	}
 	emit("}\n")
 	return n, size, err
 }

 // printStruct writes to w a Go expression that converts a slice of
 // <name>CTEntries to a colltab.ContractTrieSet. It returns the number of
 // bytes written, the size reported by reflect for the type of ct, and any
 // write error.
 func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
 	size = int(reflect.TypeOf(ct).Size())
 	n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
 	return n, size, err
 }
	// Copyright 2012 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package build

	import (
	"fmt"
	"io"
	"reflect"
	"sort"
	"strings"

	"golang.org/x/text/internal/colltab"
	)

	// This file contains code for detecting contractions and generating
	// the necessary tables.
	// Any Unicode Collation Algorithm (UCA) table entry that has more than
	// one rune on the left-hand side is called a contraction.
	// See https://www.unicode.org/reports/tr10/#Contractions for more details.
	//
	// We define the following terms:
	// initial: a rune that appears as the first rune in a contraction.
	// suffix: a sequence of runes succeeding the initial rune
	// in a given contraction.
	// non-initial: a rune that appears in a suffix.
	//
	// A rune may be both an initial and a non-initial and may be so in
	// many contractions. An initial may typically also appear by itself.
	// In case of ambiguities, the UCA requires we match the longest
	// contraction.
	//
	// Many contraction rules share the same set of possible suffixes.
	// We store sets of suffixes in a trie that associates an index with
	// each suffix in the set. This index can be used to look up a
	// collation element associated with the (starter rune, suffix) pair.
	//
	// The trie is defined on a UTF-8 byte sequence.
	// The overall trie is represented as an array of ctEntries. Each node of the trie
	// is represented as a subsequence of ctEntries, where each entry corresponds to
	// a possible match of a next character in the search string. An entry
	// also includes the length and offset to the next sequence of entries
	// to check in case of a match.

	// final is the value of ctEntry.N that marks an entry as final.
	const final = 0

	// noIndex is the value of ctEntry.I for an entry that does not, by itself,
	// complete a suffix.
	const noIndex = 0xFF

	// ctEntry associates to a matching byte an offset and/or next sequence of
	// bytes to check. A ctEntry c is called final if a match means that the
	// longest suffix has been found. An entry c is final if c.N == 0.
	// A single final entry can match a range of characters to an offset.
	// A non-final entry always matches a single byte. Note that a non-final
	// entry might still resemble a completed suffix.
	// Examples:
	// The suffix strings "ab" and "ac" can be represented as:
	// []ctEntry{
	// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so I is noIndex.
	// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2
	// }
	//
	// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
	// []ctEntry{
	// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
	// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'.
	// {'d', 'd', final, 3}, // "abd" -> 3
	// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'.
	// {'d', 'd', final, 4}, // "abcd" -> 4
	// }
	// See genStateTests in contract_test.go for more examples.
	type ctEntry struct {
		// L is the byte value to match for a non-final entry, or the lowest
		// byte of the matched range for a final entry.
		L uint8
		// H is the relative index to the next block for a non-final entry, or
		// the highest byte of the matched range for a final entry.
		H uint8
		// N is the length of the next block for a non-final entry, or final
		// for a final entry.
		N uint8
		// I is the result offset. It is noIndex if more bytes are needed to
		// complete a suffix.
		I uint8
	}

	// contractTrieSet holds a set of contraction tries. The tries are stored
	// consecutively in a single slice of entries.
	type contractTrieSet []struct {
		l, h, n, i uint8
	}

	// ctHandle identifies one trie within a trie set.
	type ctHandle struct {
		index int // offset of the trie in the entry array
		n     int // size of the trie's first node
	}

	// appendTrie adds a new trie for the given suffixes to the trie set ct and
	// returns a handle to it. If genStates fails, the trie set is cut back to
	// the length it had on entry and the zero handle is returned together with
	// the error.
	func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
		es := make([]stridx, len(suffixes))
		for i, s := range suffixes {
			es[i].str = s
		}
		// Result offsets are handed out in offsetSort order, starting at 1.
		sort.Sort(offsetSort(es))
		for i := range es {
			es[i].index = i + 1
		}
		// genStates receives the suffixes in genidxSort order.
		sort.Sort(genidxSort(es))
		i := len(*ct)
		n, err := genStates(ct, es)
		if err != nil {
			// ct is a pointer to the slice: dereference it both to slice and
			// to store the truncated result, so that the caller's trie set
			// loses the partially generated entries.
			*ct = (*ct)[:i]
			return ctHandle{}, err
		}
		return ctHandle{i, n}, nil
	}

	// genStates appends to *ct the entries of the trie node for the suffixes in
	// sis, followed by the entries of all nodes below it, and returns the number
	// of entries in that first node.
	//
	// It returns an error if sis is empty or if the relative offset to a child
	// block does not fit in a uint8. Entries appended before an error is detected
	// are left in *ct. The str fields of the elements of sis are modified in
	// place while descending.
	func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
		if len(sis) == 0 {
			return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
		}
		// Index of the first entry of the node being generated.
		start := len(*ct)
		// create entries for differing first bytes.
		for _, si := range sis {
			s := si.str
			if len(s) == 0 {
				continue
			}
			added := false
			c := s[0]
			if len(s) > 1 {
				// s continues past c: add a non-final entry for c unless this
				// node already has an entry whose L is c.
				for j := len(*ct) - 1; j >= start; j-- {
					if (*ct)[j].L == c {
						added = true
						break
					}
				}
				if !added {
					// Append through the pointer so the caller's slice grows.
					// H and N stay zero here; they are filled in by the loop
					// below.
					*ct = append(*ct, ctEntry{L: c, I: noIndex})
				}
			} else {
				// s ends at c: record its offset on an existing entry, or add
				// a final entry for it.
				for j := len(*ct) - 1; j >= start; j-- {
					// Update the offset for longer suffixes with the same byte.
					if (*ct)[j].L == c {
						(*ct)[j].I = uint8(si.index)
						added = true
					}
					// Extend range of final ctEntry, if possible.
					if (*ct)[j].H+1 == c {
						(*ct)[j].H = c
						added = true
					}
				}
				if !added {
					*ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
				}
			}
		}
		// Number of entries in this node.
		n := len(*ct) - start
		// Append nodes for the remainder of the suffixes for each ctEntry.
		sp := 0
		for i, end := start, len(*ct); i < end; i++ {
			fe := (*ct)[i]
			if fe.H == 0 { // uninitialized non-final
				// Number of entries appended so far after this node's own
				// entries; stored in H as the relative index of the child block.
				ln := len(*ct) - start - n
				if ln > 0xFF {
					return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
				}
				fe.H = uint8(ln)
				// Find first non-final strings with same byte as current entry.
				for ; sis[sp].str[0] != fe.L; sp++ {
				}
				// Extend se over the following strings that are longer than
				// one byte and start with the same byte.
				se := sp + 1
				for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
				}
				sl := sis[sp:se]
				sp = se
				// Strip the leading byte. sl aliases sis, so this changes the
				// caller's elements.
				for i, si := range sl {
					sl[i].str = si.str[1:]
				}
				nn, err := genStates(ct, sl)
				if err != nil {
					return 0, err
				}
				fe.N = uint8(nn)
				// fe is a copy; store the completed entry back.
				(*ct)[i] = fe
			}
		}
		// Order this node's entries as defined by entrySort.
		sort.Sort(entrySort((*ct)[start : start+n]))
		return n, nil
	}

	// entrySort orders the entries of a node by decreasing L.
	//
	// There may be both a final and non-final entry for a byte if the byte
	// is implied in a range of matches in the final entry.
	// We need to ensure that the non-final entry comes first in that case.
	type entrySort colltab.ContractTrieSet

	// Len returns the number of entries.
	func (fe entrySort) Len() int {
		return len(fe)
	}

	// Swap exchanges the entries at positions i and j.
	func (fe entrySort) Swap(i, j int) {
		tmp := fe[i]
		fe[i] = fe[j]
		fe[j] = tmp
	}

	// Less reports whether entry i has a larger L than entry j.
	func (fe entrySort) Less(i, j int) bool {
		return fe[j].L < fe[i].L
	}

	// stridx pairs a suffix with its associated offset so that the two can be
	// sorted together.
	type stridx struct {
		// str is the suffix.
		str string
		// index is the offset associated with str.
		index int
	}

	// offsetSort is the order used for computing the offsets: longer strings
	// come first, and strings of equal length are in increasing string order.
	// This ensures that strings that only differ in the last byte by 1
	// are sorted consecutively in increasing order such that they can
	// be packed as a range in a final ctEntry.
	type offsetSort []stridx

	// Len returns the number of elements.
	func (si offsetSort) Len() int {
		return len(si)
	}

	// Swap exchanges the elements at positions i and j.
	func (si offsetSort) Swap(i, j int) {
		tmp := si[i]
		si[i] = si[j]
		si[j] = tmp
	}

	// Less reports whether element i sorts before element j.
	func (si offsetSort) Less(i, j int) bool {
		a, b := si[i].str, si[j].str
		if len(a) == len(b) {
			return a < b
		}
		return len(a) > len(b)
	}

	// genidxSort is the order used for indexing: string order, except that when
	// one string is a prefix of another the longer string comes first.
	type genidxSort []stridx

	// Len returns the number of elements.
	func (si genidxSort) Len() int {
		return len(si)
	}

	// Swap exchanges the elements at positions i and j.
	func (si genidxSort) Swap(i, j int) {
		tmp := si[i]
		si[i] = si[j]
		si[j] = tmp
	}

	// Less reports whether element i sorts before element j.
	func (si genidxSort) Less(i, j int) bool {
		a, b := si[i].str, si[j].str
		switch {
		case strings.HasPrefix(b, a):
			// a is a prefix of b (or equal to it): a does not come first.
			return false
		case strings.HasPrefix(a, b):
			// a extends b: the longer string comes first.
			return true
		default:
			return a < b
		}
	}

	// lookup matches the longest suffix in str against the trie identified by h
	// and returns the associated offset and the number of bytes consumed.
	// Both results are zero if no suffix matches.
	func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
		node := (*ct)[h.index:] // entries starting at the current node
		width := h.n            // number of entries in the current node
		pos := 0                // number of bytes of str consumed so far

		for k := 0; k < width && pos < len(str); {
			entry, b := node[k], str[pos]
			switch {
			case b == entry.L:
				pos++
				if entry.I != noIndex {
					// Remember the longest completed suffix seen so far.
					index, ns = int(entry.I), pos
				}
				if entry.N == final {
					return index, ns
				}
				// Descend: H is relative to the end of the current node.
				node = node[int(entry.H)+width:]
				width = int(entry.N)
				k = 0
			case b > entry.L && entry.N == final && b <= entry.H:
				// b falls inside the range of a final entry.
				return int(b-entry.L) + int(entry.I), pos + 1
			default:
				k++
			}
		}
		return index, ns
	}

	// print writes the trie set t as compilable Go code to w: first the entry
	// array produced by printArray, then a declaration of the variable
	// <name>ContractTrieSet. It returns the total number of bytes written, the
	// size of the resulting data structure in bytes, and the first error
	// encountered, if any.
	func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
		// tally folds the outcome of one output step into the results. Only the
		// first error is kept; later steps still run.
		tally := func(written, sz int, stepErr error) {
			n += written
			size += sz
			if err == nil {
				err = stepErr
			}
		}

		tally(printArray(*t, w, name))
		written, stepErr := fmt.Fprintf(w, "var %sContractTrieSet = ", name)
		tally(written, 0, stepErr)
		tally(printStruct(*t, w, name))
		written, stepErr = fmt.Fprintln(w)
		tally(written, 0, stepErr)
		return n, size, err
	}

	// printArray writes the entries of ct to w as a Go array declaration named
	// <name>CTEntries. It returns the number of bytes written, the size of the
	// array in bytes (4 per entry) and the first write error, if any.
	func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
		size = 4 * len(ct)

		// emit writes one formatted chunk, accumulating the byte count and
		// keeping only the first error.
		emit := func(format string, args ...interface{}) {
			written, werr := fmt.Fprintf(w, format, args...)
			n += written
			if err == nil {
				err = werr
			}
		}

		emit("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
		emit("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
		for k := range ct {
			emit("\t{0x%X, 0x%X, %d, %d},\n", ct[k].L, ct[k].H, ct[k].N, ct[k].I)
		}
		emit("}\n")
		return n, size, err
	}

	// printStruct writes to w a Go expression that converts a slice of
	// <name>CTEntries to a colltab.ContractTrieSet. It returns the number of
	// bytes written, the size reported by reflect for the type of ct, and any
	// write error.
	func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
		size = int(reflect.TypeOf(ct).Size())
		n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
		return n, size, err
	}