collate/tools/colcmp/gen.go - text - Git at Google

 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package main

 import (
 	"math"
 	"math/rand"
 	"strings"
 	"unicode"
 	"unicode/utf16"
 	"unicode/utf8"

 	"golang.org/x/text/language"
 	"golang.org/x/text/unicode/norm"
 )

 // TODO: replace with functionality in language package.
 // parent returns the locale one level above locale. The parent is obtained
 // by dropping the last "-"-separated element; a locale without any "-" has
 // "und" as its parent. ok is false if locale is already the root "und".
 func parent(locale string) (parent string, ok bool) {
 	switch cut := strings.LastIndex(locale, "-"); {
 	case locale == "und":
 		// The root has no parent.
 		return "", false
 	case cut != -1:
 		return locale[:cut], true
 	default:
 		return "und", true
 	}
 }

 // rewriter is used to both unique strings and create variants of strings
 // to add to the test set. The seen map is kept across calls, so a given
 // string is emitted at most once over the lifetime of a rewriter.
 type rewriter struct {
 	seen     map[string]bool // strings already emitted by insert
 	addCases bool            // if set, rewrite also emits case variants
 }

 // newRewriter returns a rewriter with an empty seen set and addCases unset.
 func newRewriter() *rewriter {
 	return &rewriter{
 		seen: make(map[string]bool),
 	}
 }

 // insert appends s to a unless r has already seen s, records s as seen and
 // returns the resulting slice.
 func (r *rewriter) insert(a []string, s string) []string {
 	if !r.seen[s] {
 		r.seen[s] = true
 		a = append(a, s)
 	}
 	return a
 }

 // rewrite takes a sequence of strings in, adds variants of these strings
 // based on options and removes duplicates.
 //
 // If r.addCases is set, each string is followed by copies in which the first
 // rune is replaced by every other rune in its simple case-folding orbit.
 // Empty strings are passed through without variants.
 func (r *rewriter) rewrite(ss []string) []string {
 	ns := []string{}
 	for _, s := range ss {
 		ns = r.insert(ns, s)
 		if !r.addCases {
 			continue
 		}
 		rs := []rune(s)
 		if len(rs) == 0 {
 			// An empty string (for example from strings.Split on
 			// consecutive separators) has no first rune to fold;
 			// indexing rs[0] would panic.
 			continue
 		}
 		rn := rs[0]
 		// SimpleFold cycles through the orbit and eventually returns rn.
 		for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
 			rs[0] = c
 			ns = r.insert(ns, string(rs))
 		}
 	}
 	return ns
 }

 // exemplarySet holds a parsed set of characters from the exemplarCharacters table.
 type exemplarySet struct {
 	// typ is not assigned by any code visible in this file.
 	// NOTE(review): presumably identifies which exemplar set this is — confirm.
 	typ       exemplarType
 	// set is the list of phrases: the table entry split on spaces and then
 	// passed through a rewriter (see phraseGenerator.init).
 	set       []string
 	charIndex int // cumulative total of phrases, including this set
 }

 // phraseGenerator produces test inputs from the exemplar phrases of a locale.
 // It must be set up with init before phrase or generate is used.
 type phraseGenerator struct {
 	// sets holds exN exemplar sets, filled in by init.
 	sets [exN]exemplarySet
 	// n is the total number of phrases over all sets as computed by init;
 	// generate may lower it to bound the number of inputs.
 	n    int
 }

 // init loads the exemplar phrases for the locale identified by id and
 // computes the cumulative indexes used by phrase.
 //
 // For every set, the string form of language.Make(id) is looked up in
 // exemplarCharacters; if that entry is missing or empty the lookup moves to
 // the parent locale, stopping after "und". The entry found is split on
 // spaces. All sets then go through a single rewriter, so a phrase that
 // occurs in several sets is kept only in the first one, and g.n is advanced
 // by the number of phrases of each set.
 func (g *phraseGenerator) init(id string) {
 	loc := language.Make(id).String()
 	for i := range g.sets {
 		// Walk from the locale towards the root until an entry is found.
 		for p, more := loc, true; more; p, more = parent(p) {
 			chars, found := exemplarCharacters[p]
 			if !found || chars[i] == "" {
 				continue
 			}
 			g.sets[i].set = strings.Split(chars[i], " ")
 			break
 		}
 	}

 	rw := newRewriter()
 	rw.addCases = *cases
 	for i := range g.sets {
 		cur := &g.sets[i]
 		cur.set = rw.rewrite(cur.set)
 		// charIndex is the running total including this set.
 		g.n += len(cur.set)
 		cur.charIndex = g.n
 	}
 }

 // phrase returns the ith phrase, where i < g.n. Phrases are numbered
 // consecutively across the sets in order; charIndex of a set is one past the
 // number of its last phrase. It panics if no set contains index i.
 func (g *phraseGenerator) phrase(i int) string {
 	for k := range g.sets {
 		s := &g.sets[k]
 		if end := s.charIndex; i < end {
 			// Number of the first phrase of this set.
 			first := end - len(s.set)
 			return s.set[i-first]
 		}
 	}
 	panic("index out of range")
 }

 // generate generates inputs by combining all pairs of exemplar strings:
 // every phrase on its own plus every ordered pair of phrases concatenated.
 // If doNorm is true, all input strings are normalized to NFD.
 //
 // If g.n exceeds the square root of *limit, g.n is lowered to that value
 // first (and stays lowered afterwards). The returned inputs are randomly
 // permuted and the index field of each Input is set to its position in the
 // returned slice.
 // TODO: allow other variations, statistical models, and random
 // trailing sequences.
 func (g *phraseGenerator) generate(doNorm bool) []Input {
 	const (
 		M         = 1024 * 1024
 		buf8Size  = 30 * M
 		buf16Size = 10 * M
 	)
 	// TODO: use a better way to limit the input size.
 	if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
 		g.n = sq
 	}
 	// Each of the n phrases is added once by itself and once in front of
 	// each of the n phrases, so n*(n+1) inputs are appended below.
 	size := g.n * (g.n + 1)
 	a := make([]Input, 0, size)
 	buf8 := make([]byte, 0, buf8Size)
 	buf16 := make([]uint16, 0, buf16Size)

 	addInput := func(str string) {
 		// Continue in the unused tail of the buffers so the data handed to
 		// earlier makeInput calls is not overwritten (presumably makeInput
 		// keeps referring to the slices it is given — confirm).
 		buf8 = buf8[len(buf8):]
 		buf16 = buf16[len(buf16):]
 		// Start a fresh buffer when the remaining room is too small for str.
 		if len(str) > cap(buf8) {
 			buf8 = make([]byte, 0, buf8Size)
 		}
 		if len(str) > cap(buf16) {
 			buf16 = make([]uint16, 0, buf16Size)
 		}
 		if doNorm {
 			buf8 = norm.NFD.AppendString(buf8, str)
 		} else {
 			buf8 = append(buf8, str...)
 		}
 		buf16 = appendUTF16(buf16, buf8)
 		a = append(a, makeInput(buf8, buf16))
 	}
 	for i := 0; i < g.n; i++ {
 		p1 := g.phrase(i)
 		addInput(p1)
 		for j := 0; j < g.n; j++ {
 			p2 := g.phrase(j)
 			addInput(p1 + p2)
 		}
 	}
 	// permutate
 	rnd := rand.New(rand.NewSource(int64(rand.Int())))
 	for i := range a {
 		// Swap position i with a random position in [i, len(a)).
 		j := i + rnd.Intn(len(a)-i)
 		a[i], a[j] = a[j], a[i]
 		a[i].index = i // allow restoring this order if input is used multiple times.
 	}
 	return a
 }

 // appendUTF16 decodes the UTF-8 bytes in s, appends the UTF-16 encoding of
 // each rune to buf and returns the extended slice. Runes that need a
 // surrogate pair contribute two code units; all others, including the
 // U+FFFD that results from invalid UTF-8, contribute one.
 func appendUTF16(buf []uint16, s []byte) []uint16 {
 	for pos := 0; pos < len(s); {
 		r, width := utf8.DecodeRune(s[pos:])
 		pos += width
 		// EncodeRune reports U+FFFD for both halves when r is not encoded
 		// as a surrogate pair.
 		hi, lo := utf16.EncodeRune(r)
 		if hi == utf8.RuneError {
 			buf = append(buf, uint16(r))
 			continue
 		}
 		buf = append(buf, uint16(hi), uint16(lo))
 	}
 	return buf
 }
	// Copyright 2012 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package main

	import (
	"math"
	"math/rand"
	"strings"
	"unicode"
	"unicode/utf16"
	"unicode/utf8"

	"golang.org/x/text/language"
	"golang.org/x/text/unicode/norm"
	)

	// TODO: replace with functionality in language package.
	// parent returns the locale one level above locale. The parent is obtained
	// by dropping the last "-"-separated element; a locale without any "-" has
	// "und" as its parent. ok is false if locale is already the root "und".
	func parent(locale string) (parent string, ok bool) {
		switch cut := strings.LastIndex(locale, "-"); {
		case locale == "und":
			// The root has no parent.
			return "", false
		case cut != -1:
			return locale[:cut], true
		default:
			return "und", true
		}
	}

	// rewriter is used to both unique strings and create variants of strings
	// to add to the test set. The seen map is kept across calls, so a given
	// string is emitted at most once over the lifetime of a rewriter.
	type rewriter struct {
		seen     map[string]bool // strings already emitted by insert
		addCases bool            // if set, rewrite also emits case variants
	}

	// newRewriter returns a rewriter with an empty seen set and addCases unset.
	func newRewriter() *rewriter {
		return &rewriter{
			seen: make(map[string]bool),
		}
	}

	// insert appends s to a unless r has already seen s, records s as seen and
	// returns the resulting slice.
	func (r *rewriter) insert(a []string, s string) []string {
		if !r.seen[s] {
			r.seen[s] = true
			a = append(a, s)
		}
		return a
	}

	// rewrite takes a sequence of strings in, adds variants of these strings
	// based on options and removes duplicates.
	//
	// If r.addCases is set, each string is followed by copies in which the first
	// rune is replaced by every other rune in its simple case-folding orbit.
	// Empty strings are passed through without variants.
	func (r *rewriter) rewrite(ss []string) []string {
		ns := []string{}
		for _, s := range ss {
			ns = r.insert(ns, s)
			if !r.addCases {
				continue
			}
			rs := []rune(s)
			if len(rs) == 0 {
				// An empty string (for example from strings.Split on
				// consecutive separators) has no first rune to fold;
				// indexing rs[0] would panic.
				continue
			}
			rn := rs[0]
			// SimpleFold cycles through the orbit and eventually returns rn.
			for c := unicode.SimpleFold(rn); c != rn; c = unicode.SimpleFold(c) {
				rs[0] = c
				ns = r.insert(ns, string(rs))
			}
		}
		return ns
	}

	// exemplarySet holds a parsed set of characters from the exemplarCharacters table.
	type exemplarySet struct {
	// typ is not assigned by any code visible in this file.
	// NOTE(review): presumably identifies which exemplar set this is — confirm.
	typ exemplarType
	// set is the list of phrases: the table entry split on spaces and then
	// passed through a rewriter (see phraseGenerator.init).
	set []string
	charIndex int // cumulative total of phrases, including this set
	}

	// phraseGenerator produces test inputs from the exemplar phrases of a locale.
	// It must be set up with init before phrase or generate is used.
	type phraseGenerator struct {
	// sets holds exN exemplar sets, filled in by init.
	sets [exN]exemplarySet
	// n is the total number of phrases over all sets as computed by init;
	// generate may lower it to bound the number of inputs.
	n int
	}

	// init loads the exemplar phrases for the locale identified by id and
	// computes the cumulative indexes used by phrase.
	//
	// For every set, the string form of language.Make(id) is looked up in
	// exemplarCharacters; if that entry is missing or empty the lookup moves to
	// the parent locale, stopping after "und". The entry found is split on
	// spaces. All sets then go through a single rewriter, so a phrase that
	// occurs in several sets is kept only in the first one, and g.n is advanced
	// by the number of phrases of each set.
	func (g *phraseGenerator) init(id string) {
		loc := language.Make(id).String()
		for i := range g.sets {
			// Walk from the locale towards the root until an entry is found.
			for p, more := loc, true; more; p, more = parent(p) {
				chars, found := exemplarCharacters[p]
				if !found || chars[i] == "" {
					continue
				}
				g.sets[i].set = strings.Split(chars[i], " ")
				break
			}
		}

		rw := newRewriter()
		rw.addCases = *cases
		for i := range g.sets {
			cur := &g.sets[i]
			cur.set = rw.rewrite(cur.set)
			// charIndex is the running total including this set.
			g.n += len(cur.set)
			cur.charIndex = g.n
		}
	}

	// phrase returns the ith phrase, where i < g.n. Phrases are numbered
	// consecutively across the sets in order; charIndex of a set is one past the
	// number of its last phrase. It panics if no set contains index i.
	func (g *phraseGenerator) phrase(i int) string {
		for k := range g.sets {
			s := &g.sets[k]
			if end := s.charIndex; i < end {
				// Number of the first phrase of this set.
				first := end - len(s.set)
				return s.set[i-first]
			}
		}
		panic("index out of range")
	}

	// generate generates inputs by combining all pairs of exemplar strings:
	// every phrase on its own plus every ordered pair of phrases concatenated.
	// If doNorm is true, all input strings are normalized to NFD.
	//
	// If g.n exceeds the square root of *limit, g.n is lowered to that value
	// first (and stays lowered afterwards). The returned inputs are randomly
	// permuted and the index field of each Input is set to its position in the
	// returned slice.
	// TODO: allow other variations, statistical models, and random
	// trailing sequences.
	func (g *phraseGenerator) generate(doNorm bool) []Input {
		const (
			M         = 1024 * 1024
			buf8Size  = 30 * M
			buf16Size = 10 * M
		)
		// TODO: use a better way to limit the input size.
		if sq := int(math.Sqrt(float64(*limit))); g.n > sq {
			g.n = sq
		}
		// Each of the n phrases is added once by itself and once in front of
		// each of the n phrases, so n*(n+1) inputs are appended below.
		size := g.n * (g.n + 1)
		a := make([]Input, 0, size)
		buf8 := make([]byte, 0, buf8Size)
		buf16 := make([]uint16, 0, buf16Size)

		addInput := func(str string) {
			// Continue in the unused tail of the buffers so the data handed to
			// earlier makeInput calls is not overwritten (presumably makeInput
			// keeps referring to the slices it is given — confirm).
			buf8 = buf8[len(buf8):]
			buf16 = buf16[len(buf16):]
			// Start a fresh buffer when the remaining room is too small for str.
			if len(str) > cap(buf8) {
				buf8 = make([]byte, 0, buf8Size)
			}
			if len(str) > cap(buf16) {
				buf16 = make([]uint16, 0, buf16Size)
			}
			if doNorm {
				buf8 = norm.NFD.AppendString(buf8, str)
			} else {
				buf8 = append(buf8, str...)
			}
			buf16 = appendUTF16(buf16, buf8)
			a = append(a, makeInput(buf8, buf16))
		}
		for i := 0; i < g.n; i++ {
			p1 := g.phrase(i)
			addInput(p1)
			for j := 0; j < g.n; j++ {
				p2 := g.phrase(j)
				addInput(p1 + p2)
			}
		}
		// permutate
		rnd := rand.New(rand.NewSource(int64(rand.Int())))
		for i := range a {
			// Swap position i with a random position in [i, len(a)).
			j := i + rnd.Intn(len(a)-i)
			a[i], a[j] = a[j], a[i]
			a[i].index = i // allow restoring this order if input is used multiple times.
		}
		return a
	}

	// appendUTF16 decodes the UTF-8 bytes in s, appends the UTF-16 encoding of
	// each rune to buf and returns the extended slice. Runes that need a
	// surrogate pair contribute two code units; all others, including the
	// U+FFFD that results from invalid UTF-8, contribute one.
	func appendUTF16(buf []uint16, s []byte) []uint16 {
		for pos := 0; pos < len(s); {
			r, width := utf8.DecodeRune(s[pos:])
			pos += width
			// EncodeRune reports U+FFFD for both halves when r is not encoded
			// as a surrogate pair.
			hi, lo := utf16.EncodeRune(r)
			if hi == utf8.RuneError {
				buf = append(buf, uint16(r))
				continue
			}
			buf = append(buf, uint16(hi), uint16(lo))
		}
		return buf
	}