| // Copyright 2012 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package main |
| |
| import ( |
| "math" |
| "math/rand" |
| "strings" |
| "unicode" |
| "unicode/utf16" |
| "unicode/utf8" |
| |
| "golang.org/x/text/language" |
| "golang.org/x/text/unicode/norm" |
| ) |
| |
// TODO: replace with functionality in language package.
// parent returns the parent of the given locale identifier: the identifier
// with its last "-"-separated subtag removed, or the root "und" when there is
// no subtag left to remove. ok is false only when locale itself is the root,
// which has no parent.
func parent(locale string) (parent string, ok bool) {
	const root = "und"
	switch cut := strings.LastIndex(locale, "-"); {
	case locale == root:
		return "", false
	case cut >= 0:
		// Drop the trailing subtag together with its separator.
		return locale[:cut], true
	default:
		return root, true
	}
}
| |
// rewriter is used both to de-duplicate strings and to create variants of
// strings to add to the test set. A single rewriter remembers every string it
// has emitted, so duplicates are suppressed across calls as well as within one.
type rewriter struct {
	seen     map[string]bool // strings already emitted through insert
	addCases bool            // if set, rewrite also emits case variants
}

// newRewriter returns a rewriter with an empty seen set and addCases unset.
func newRewriter() *rewriter {
	return &rewriter{
		seen: make(map[string]bool),
	}
}

// insert appends s to a and returns the result, unless s was already emitted
// through r, in which case a is returned unchanged.
func (r *rewriter) insert(a []string, s string) []string {
	if r.seen[s] {
		return a
	}
	r.seen[s] = true
	return append(a, s)
}

// rewrite takes a sequence of strings, adds variants of these strings based
// on the rewriter's options, and removes duplicates. The input order is
// preserved; the variants of a string directly follow it.
//
// If addCases is set, the variants of a string are obtained by replacing its
// first rune with every other member of that rune's simple case-folding
// orbit. The returned slice is never nil.
func (r *rewriter) rewrite(ss []string) []string {
	ns := make([]string, 0, len(ss))
	for _, s := range ss {
		ns = r.insert(ns, s)
		if !r.addCases {
			continue
		}
		rs := []rune(s)
		if len(rs) == 0 {
			// An empty string has no first rune to fold. Such strings can
			// show up when the input was produced by splitting on spaces.
			continue
		}
		first := rs[0]
		// SimpleFold cycles through the orbit and eventually returns to the
		// starting rune, which ends the loop.
		for c := unicode.SimpleFold(first); c != first; c = unicode.SimpleFold(c) {
			rs[0] = c
			ns = r.insert(ns, string(rs))
		}
	}
	return ns
}
| |
| // exemplarySet holds a parsed set of characters from the exemplarCharacters table. |
| type exemplarySet struct { |
| typ exemplarType |
| set []string |
| charIndex int // cumulative total of phrases, including this set |
| } |
| |
| type phraseGenerator struct { |
| sets [exN]exemplarySet |
| n int |
| } |
| |
| func (g *phraseGenerator) init(id string) { |
| ec := exemplarCharacters |
| loc := language.Make(id).String() |
| // get sets for locale or parent locale if the set is not defined. |
| for i := range g.sets { |
| for p, ok := loc, true; ok; p, ok = parent(p) { |
| if set, ok := ec[p]; ok && set[i] != "" { |
| g.sets[i].set = strings.Split(set[i], " ") |
| break |
| } |
| } |
| } |
| r := newRewriter() |
| r.addCases = *cases |
| for i := range g.sets { |
| g.sets[i].set = r.rewrite(g.sets[i].set) |
| } |
| // compute indexes |
| for i, set := range g.sets { |
| g.n += len(set.set) |
| g.sets[i].charIndex = g.n |
| } |
| } |
| |
| // phrase returns the ith phrase, where i < g.n. |
| func (g *phraseGenerator) phrase(i int) string { |
| for _, set := range g.sets { |
| if i < set.charIndex { |
| return set.set[i-(set.charIndex-len(set.set))] |
| } |
| } |
| panic("index out of range") |
| } |
| |
| // generate generates inputs by combining all pairs of examplar strings. |
| // If doNorm is true, all input strings are normalized to NFC. |
| // TODO: allow other variations, statistical models, and random |
| // trailing sequences. |
| func (g *phraseGenerator) generate(doNorm bool) []Input { |
| const ( |
| M = 1024 * 1024 |
| buf8Size = 30 * M |
| buf16Size = 10 * M |
| ) |
| // TODO: use a better way to limit the input size. |
| if sq := int(math.Sqrt(float64(*limit))); g.n > sq { |
| g.n = sq |
| } |
| size := g.n * g.n |
| a := make([]Input, 0, size) |
| buf8 := make([]byte, 0, buf8Size) |
| buf16 := make([]uint16, 0, buf16Size) |
| |
| addInput := func(str string) { |
| buf8 = buf8[len(buf8):] |
| buf16 = buf16[len(buf16):] |
| if len(str) > cap(buf8) { |
| buf8 = make([]byte, 0, buf8Size) |
| } |
| if len(str) > cap(buf16) { |
| buf16 = make([]uint16, 0, buf16Size) |
| } |
| if doNorm { |
| buf8 = norm.NFD.AppendString(buf8, str) |
| } else { |
| buf8 = append(buf8, str...) |
| } |
| buf16 = appendUTF16(buf16, buf8) |
| a = append(a, makeInput(buf8, buf16)) |
| } |
| for i := 0; i < g.n; i++ { |
| p1 := g.phrase(i) |
| addInput(p1) |
| for j := 0; j < g.n; j++ { |
| p2 := g.phrase(j) |
| addInput(p1 + p2) |
| } |
| } |
| // permutate |
| rnd := rand.New(rand.NewSource(int64(rand.Int()))) |
| for i := range a { |
| j := i + rnd.Intn(len(a)-i) |
| a[i], a[j] = a[j], a[i] |
| a[i].index = i // allow restoring this order if input is used multiple times. |
| } |
| return a |
| } |
| |
// appendUTF16 decodes the UTF-8 bytes in s and appends their UTF-16 encoding
// to buf, returning the extended slice. Runes outside the Basic Multilingual
// Plane are written as a surrogate pair, all others as a single code unit.
// Bytes that are not valid UTF-8 each come out as U+FFFD.
func appendUTF16(buf []uint16, s []byte) []uint16 {
	// EncodeRune returns this value for any rune that does not need a
	// surrogate pair.
	const noPair = 0xFFFD

	for pos := 0; pos < len(s); {
		r, width := utf8.DecodeRune(s[pos:])
		pos += width
		if hi, lo := utf16.EncodeRune(r); hi == noPair {
			buf = append(buf, uint16(r))
		} else {
			buf = append(buf, uint16(hi), uint16(lo))
		}
	}
	return buf
}