| // Copyright 2014 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build ignore |
| |
| // This program generates the trie for casing operations. The Unicode casing |
| // algorithm requires the lookup of various properties and mappings for each |
| // rune. The table generated by this generator combines several of the most |
| // frequently used of these into a single trie so that they can be accessed |
| // with a single lookup. |
| package main |
| |
| import ( |
| "bytes" |
| "fmt" |
| "io" |
| "log" |
| "reflect" |
| "strconv" |
| "strings" |
| "unicode" |
| |
| "golang.org/x/text/internal/gen" |
| "golang.org/x/text/internal/triegen" |
| "golang.org/x/text/internal/ucd" |
| "golang.org/x/text/unicode/norm" |
| ) |
| |
| func main() { |
| gen.Init() |
| genTables() |
| genTablesTest() |
| gen.Repackage("gen_trieval.go", "trieval.go", "cases") |
| } |
| |
| // runeInfo contains all information for a rune that we care about for casing |
| // operations. |
| type runeInfo struct { |
| Rune rune |
| |
| entry info // trie value for this rune. |
| |
| CaseMode info |
| |
| // Simple case mappings. |
| Simple [1 + maxCaseMode][]rune |
| |
| // Special casing |
| HasSpecial bool |
| Conditional bool |
| Special [1 + maxCaseMode][]rune |
| |
| // Folding |
| FoldSimple rune |
| FoldSpecial rune |
| FoldFull []rune |
| |
| // TODO: FC_NFKC, or equivalent data. |
| |
| // Properties |
| SoftDotted bool |
| CaseIgnorable bool |
| Cased bool |
| DecomposeGreek bool |
| BreakType string |
| BreakCat breakCategory |
| |
| // We care mostly about 0, Above, and IotaSubscript. |
| CCC byte |
| } |
| |
| type breakCategory int |
| |
| const ( |
| breakBreak breakCategory = iota |
| breakLetter |
| breakMid |
| ) |
| |
| // mapping returns the case mapping for the given case type. |
| func (r *runeInfo) mapping(c info) string { |
| if r.HasSpecial { |
| return string(r.Special[c]) |
| } |
| if len(r.Simple[c]) != 0 { |
| return string(r.Simple[c]) |
| } |
| return string(r.Rune) |
| } |
| |
| func parse(file string, f func(p *ucd.Parser)) { |
| ucd.Parse(gen.OpenUCDFile(file), f) |
| } |
| |
| func parseUCD() []runeInfo { |
| chars := make([]runeInfo, unicode.MaxRune) |
| |
| get := func(r rune) *runeInfo { |
| c := &chars[r] |
| c.Rune = r |
| return c |
| } |
| |
| parse("UnicodeData.txt", func(p *ucd.Parser) { |
| ri := get(p.Rune(0)) |
| ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass)) |
| ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping) |
| ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping) |
| ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping) |
| if p.String(ucd.GeneralCategory) == "Lt" { |
| ri.CaseMode = cTitle |
| } |
| }) |
| |
| // <code>; <property> |
| parse("PropList.txt", func(p *ucd.Parser) { |
| if p.String(1) == "Soft_Dotted" { |
| chars[p.Rune(0)].SoftDotted = true |
| } |
| }) |
| |
| // <code>; <word break type> |
| parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { |
| ri := get(p.Rune(0)) |
| switch p.String(1) { |
| case "Case_Ignorable": |
| ri.CaseIgnorable = true |
| case "Cased": |
| ri.Cased = true |
| case "Lowercase": |
| ri.CaseMode = cLower |
| case "Uppercase": |
| ri.CaseMode = cUpper |
| } |
| }) |
| |
| // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? |
| parse("SpecialCasing.txt", func(p *ucd.Parser) { |
| // We drop all conditional special casing and deal with them manually in |
| // the language-specific case mappers. Rune 0x03A3 is the only one with |
| // a conditional formatting that is not language-specific. However, |
| // dealing with this letter is tricky, especially in a streaming |
| // context, so we deal with it in the Caser for Greek specifically. |
| ri := get(p.Rune(0)) |
| if p.String(4) == "" { |
| ri.HasSpecial = true |
| ri.Special[cLower] = p.Runes(1) |
| ri.Special[cTitle] = p.Runes(2) |
| ri.Special[cUpper] = p.Runes(3) |
| } else { |
| ri.Conditional = true |
| } |
| }) |
| |
| // TODO: Use text breaking according to UAX #29. |
| // <code>; <word break type> |
| parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { |
| ri := get(p.Rune(0)) |
| ri.BreakType = p.String(1) |
| |
| // We collapse the word breaking properties onto the categories we need. |
| switch p.String(1) { // TODO: officially we need to canonicalize. |
| case "MidLetter", "MidNumLet", "Single_Quote": |
| ri.BreakCat = breakMid |
| if !ri.CaseIgnorable { |
| // finalSigma relies on the fact that all breakMid runes are |
| // also a Case_Ignorable. Revisit this code when this changes. |
| log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri) |
| } |
| case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ": |
| ri.BreakCat = breakLetter |
| } |
| }) |
| |
| // <code>; <type>; <mapping> |
| parse("CaseFolding.txt", func(p *ucd.Parser) { |
| ri := get(p.Rune(0)) |
| switch p.String(1) { |
| case "C": |
| ri.FoldSimple = p.Rune(2) |
| ri.FoldFull = p.Runes(2) |
| case "S": |
| ri.FoldSimple = p.Rune(2) |
| case "T": |
| ri.FoldSpecial = p.Rune(2) |
| case "F": |
| ri.FoldFull = p.Runes(2) |
| default: |
| log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1)) |
| } |
| }) |
| |
| return chars |
| } |
| |
| func genTables() { |
| chars := parseUCD() |
| verifyProperties(chars) |
| |
| t := triegen.NewTrie("case") |
| for i := range chars { |
| c := &chars[i] |
| makeEntry(c) |
| t.Insert(rune(i), uint64(c.entry)) |
| } |
| |
| w := gen.NewCodeWriter() |
| defer w.WriteVersionedGoFile("tables.go", "cases") |
| |
| gen.WriteUnicodeVersion(w) |
| |
| // TODO: write CLDR version after adding a mechanism to detect that the |
| // tables on which the manually created locale-sensitive casing code is |
| // based hasn't changed. |
| |
| w.WriteVar("xorData", string(xorData)) |
| w.WriteVar("exceptions", string(exceptionData)) |
| |
| sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) |
| if err != nil { |
| log.Fatal(err) |
| } |
| w.Size += sz |
| } |
| |
| func makeEntry(ri *runeInfo) { |
| if ri.CaseIgnorable { |
| if ri.Cased { |
| ri.entry = cIgnorableCased |
| } else { |
| ri.entry = cIgnorableUncased |
| } |
| } else { |
| ri.entry = ri.CaseMode |
| } |
| |
| // TODO: handle soft-dotted. |
| |
| ccc := cccOther |
| switch ri.CCC { |
| case 0: // Not_Reordered |
| ccc = cccZero |
| case above: // Above |
| ccc = cccAbove |
| } |
| switch ri.BreakCat { |
| case breakBreak: |
| ccc = cccBreak |
| case breakMid: |
| ri.entry |= isMidBit |
| } |
| |
| ri.entry |= ccc |
| |
| if ri.CaseMode == cUncased { |
| return |
| } |
| |
| // Need to do something special. |
| if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) { |
| makeException(ri) |
| return |
| } |
| if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) { |
| makeException(ri) |
| return |
| } |
| |
| // Rune is either lowercase or uppercase. |
| |
| orig := string(ri.Rune) |
| mapped := "" |
| if ri.CaseMode == cUpper { |
| mapped = ri.mapping(cLower) |
| } else { |
| mapped = ri.mapping(cUpper) |
| } |
| |
| if len(orig) != len(mapped) { |
| makeException(ri) |
| return |
| } |
| |
| if string(ri.FoldFull) == ri.mapping(cUpper) { |
| ri.entry |= inverseFoldBit |
| } |
| |
| n := len(orig) |
| |
| // Create per-byte XOR mask. |
| var b []byte |
| for i := 0; i < n; i++ { |
| b = append(b, orig[i]^mapped[i]) |
| } |
| |
| // Remove leading 0 bytes, but keep at least one byte. |
| for ; len(b) > 1 && b[0] == 0; b = b[1:] { |
| } |
| |
| if len(b) == 1 && b[0]&0xc0 == 0 { |
| ri.entry |= info(b[0]) << xorShift |
| return |
| } |
| |
| key := string(b) |
| x, ok := xorCache[key] |
| if !ok { |
| xorData = append(xorData, 0) // for detecting start of sequence |
| xorData = append(xorData, b...) |
| |
| x = len(xorData) - 1 |
| xorCache[key] = x |
| } |
| ri.entry |= info(x<<xorShift) | xorIndexBit |
| } |
| |
| var xorCache = map[string]int{} |
| |
| // xorData contains byte-wise XOR data for the least significant bytes of a |
| // UTF-8 encoded rune. An index points to the last byte. The sequence starts |
| // with a zero terminator. |
| var xorData = []byte{} |
| |
| // See the comments in gen_trieval.go re "the exceptions slice". |
| var exceptionData = []byte{0} |
| |
| // makeException encodes case mappings that cannot be expressed in a simple |
| // XOR diff. |
| func makeException(ri *runeInfo) { |
| ccc := ri.entry & cccMask |
| // Set exception bit and retain case type. |
| ri.entry &= 0x0007 |
| ri.entry |= exceptionBit |
| |
| if len(exceptionData) >= 1<<numExceptionBits { |
| log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits) |
| } |
| |
| // Set the offset in the exceptionData array. |
| ri.entry |= info(len(exceptionData) << exceptionShift) |
| |
| orig := string(ri.Rune) |
| tc := ri.mapping(cTitle) |
| uc := ri.mapping(cUpper) |
| lc := ri.mapping(cLower) |
| ff := string(ri.FoldFull) |
| |
| // addString sets the length of a string and adds it to the expansions array. |
| addString := func(s string, b *byte) { |
| if len(s) == 0 { |
| // Zero-length mappings exist, but only for conditional casing, |
| // which we are representing outside of this table. |
| log.Fatalf("%U: has zero-length mapping.", ri.Rune) |
| } |
| *b <<= 3 |
| if s != orig || ri.CaseMode == cLower { |
| n := len(s) |
| if n > 7 { |
| log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n) |
| } |
| *b |= byte(n) |
| exceptionData = append(exceptionData, s...) |
| } |
| } |
| |
| // byte 0: |
| exceptionData = append(exceptionData, byte(ccc)|byte(len(ff))) |
| |
| // byte 1: |
| p := len(exceptionData) |
| exceptionData = append(exceptionData, 0) |
| |
| if len(ff) > 7 { // May be zero-length. |
| log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff)) |
| } |
| exceptionData = append(exceptionData, ff...) |
| ct := ri.CaseMode |
| if ct != cLower { |
| addString(lc, &exceptionData[p]) |
| } |
| if ct != cUpper { |
| addString(uc, &exceptionData[p]) |
| } |
| if ct != cTitle { |
| addString(tc, &exceptionData[p]) |
| } |
| } |
| |
| // sparseCompacter is a trie value block Compacter. There are many cases where |
| // successive runes alternate between lower- and upper-case. This Compacter |
| // exploits this by adding a special case type where the case value is obtained |
| // from or-ing it with the least-significant bit of the rune, creating large |
| // ranges of equal case values that compress well. |
| type sparseCompacter struct { |
| sparseBlocks [][]uint16 |
| sparseOffsets []uint16 |
| sparseCount int |
| } |
| |
| // makeSparse returns the number of elements that compact block would contain |
| // as well as the modified values. |
| func makeSparse(vals []uint64) ([]uint16, int) { |
| // Copy the values. |
| values := make([]uint16, len(vals)) |
| for i, v := range vals { |
| values[i] = uint16(v) |
| } |
| |
| alt := func(i int, v uint16) uint16 { |
| if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower { |
| // Convert cLower or cUpper to cXORCase value, which has the form 11x. |
| xor := v |
| xor &^= 1 |
| xor |= uint16(i&1) ^ (v & 1) |
| xor |= 0x4 |
| return xor |
| } |
| return v |
| } |
| |
| var count int |
| var previous uint16 |
| for i, v := range values { |
| if v != 0 { |
| // Try if the unmodified value is equal to the previous. |
| if v == previous { |
| continue |
| } |
| |
| // Try if the xor-ed value is equal to the previous value. |
| a := alt(i, v) |
| if a == previous { |
| values[i] = a |
| continue |
| } |
| |
| // This is a new value. |
| count++ |
| |
| // Use the xor-ed value if it will be identical to the next value. |
| if p := i + 1; p < len(values) && alt(p, values[p]) == a { |
| values[i] = a |
| v = a |
| } |
| } |
| previous = v |
| } |
| return values, count |
| } |
| |
| func (s *sparseCompacter) Size(v []uint64) (int, bool) { |
| _, n := makeSparse(v) |
| |
| // We limit using this method to having 16 entries. |
| if n > 16 { |
| return 0, false |
| } |
| |
| return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true |
| } |
| |
| func (s *sparseCompacter) Store(v []uint64) uint32 { |
| h := uint32(len(s.sparseOffsets)) |
| values, sz := makeSparse(v) |
| s.sparseBlocks = append(s.sparseBlocks, values) |
| s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) |
| s.sparseCount += sz |
| return h |
| } |
| |
| func (s *sparseCompacter) Handler() string { |
| // The sparse global variable and its lookup method is defined in gen_trieval.go. |
| return "sparse.lookup" |
| } |
| |
| func (s *sparseCompacter) Print(w io.Writer) (retErr error) { |
| p := func(format string, args ...interface{}) { |
| _, err := fmt.Fprintf(w, format, args...) |
| if retErr == nil && err != nil { |
| retErr = err |
| } |
| } |
| |
| ls := len(s.sparseBlocks) |
| if ls == len(s.sparseOffsets) { |
| s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) |
| } |
| p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2) |
| p("var sparseOffsets = %#v\n\n", s.sparseOffsets) |
| |
| ns := s.sparseCount |
| p("// sparseValues: %d entries, %d bytes\n", ns, ns*4) |
| p("var sparseValues = [%d]valueRange {", ns) |
| for i, values := range s.sparseBlocks { |
| p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i]) |
| var v uint16 |
| for i, nv := range values { |
| if nv != v { |
| if v != 0 { |
| p(",hi:%#02x},", 0x80+i-1) |
| } |
| if nv != 0 { |
| p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) |
| } |
| } |
| v = nv |
| } |
| if v != 0 { |
| p(",hi:%#02x},", 0x80+len(values)-1) |
| } |
| } |
| p("\n}\n\n") |
| return |
| } |
| |
| // verifyProperties that properties of the runes that are relied upon in the |
| // implementation. Each property is marked with an identifier that is referred |
| // to in the places where it is used. |
| func verifyProperties(chars []runeInfo) { |
| for i, c := range chars { |
| r := rune(i) |
| |
| // Rune properties. |
| |
| // A.1: modifier never changes on lowercase. [ltLower] |
| if c.CCC > 0 && unicode.ToLower(r) != r { |
| log.Fatalf("%U: non-starter changes when lowercased", r) |
| } |
| |
| // A.2: properties of decompositions starting with I or J. [ltLower] |
| d := norm.NFD.PropertiesString(string(r)).Decomposition() |
| if len(d) > 0 { |
| if d[0] == 'I' || d[0] == 'J' { |
| // A.2.1: we expect at least an ASCII character and a modifier. |
| if len(d) < 3 { |
| log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d)) |
| } |
| |
| // All subsequent runes are modifiers and all have the same CCC. |
| runes := []rune(string(d[1:])) |
| ccc := chars[runes[0]].CCC |
| |
| for _, mr := range runes[1:] { |
| mc := chars[mr] |
| |
| // A.2.2: all modifiers have a CCC of Above or less. |
| if ccc == 0 || ccc > above { |
| log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc) |
| } |
| |
| // A.2.3: a sequence of modifiers all have the same CCC. |
| if mc.CCC != ccc { |
| log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc) |
| } |
| |
| // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above. |
| if (ccc == above) != (0x300 <= mr && mr <= 0x311) { |
| log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr) |
| } |
| |
| if i += len(string(mr)); i >= len(d) { |
| break |
| } |
| } |
| } |
| } |
| |
| // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper] |
| if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") { |
| log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r) |
| } |
| |
| // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper] |
| if c.CCC == iotaSubscript && r != 0x0345 { |
| log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r) |
| } |
| |
| // A.5: soft-dotted runes do not have exceptions. |
| if c.SoftDotted && c.entry&exceptionBit != 0 { |
| log.Fatalf("%U: soft-dotted has exception", r) |
| } |
| |
| // A.6: Greek decomposition. [elUpper] |
| if unicode.Is(unicode.Greek, r) { |
| if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil { |
| runes := []rune(string(b)) |
| // A.6.1: If a Greek rune decomposes and the first rune of the |
| // decomposition is greater than U+00FF, the rune is always |
| // great and not a modifier. |
| if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) { |
| log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f) |
| } |
| // A.6.2: Any follow-up rune in a Greek decomposition is a |
| // modifier of which the first should be gobbled in |
| // decomposition. |
| for _, m := range runes[1:] { |
| switch m { |
| case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345: |
| default: |
| log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m) |
| } |
| } |
| } |
| } |
| |
| // Breaking properties. |
| |
| // B.1: all runes with CCC > 0 are of break type Extend. |
| if c.CCC > 0 && c.BreakType != "Extend" { |
| log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType) |
| } |
| |
| // B.2: all cased runes with c.CCC == 0 are of break type ALetter. |
| if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" { |
| log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType) |
| } |
| |
| // B.3: letter category. |
| if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable { |
| if c.BreakCat != breakLetter { |
| log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter) |
| } |
| } |
| } |
| } |
| |
| func genTablesTest() { |
| w := &bytes.Buffer{} |
| |
| fmt.Fprintln(w, "var (") |
| printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore) |
| |
| // We discard the output as we know we have perfect functions. We run them |
| // just to verify the properties are correct. |
| n := printProperties(io.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased) |
| n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower) |
| n += printProperties(io.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper) |
| if n > 0 { |
| log.Fatalf("One of the discarded properties does not have a perfect filter.") |
| } |
| |
| // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? |
| fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{") |
| parse("SpecialCasing.txt", func(p *ucd.Parser) { |
| // Skip conditional entries. |
| if p.String(4) != "" { |
| return |
| } |
| r := p.Rune(0) |
| fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", |
| r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3))) |
| }) |
| fmt.Fprint(w, "\t}\n\n") |
| |
| // <code>; <type>; <runes> |
| table := map[rune]struct{ simple, full, special string }{} |
| parse("CaseFolding.txt", func(p *ucd.Parser) { |
| r := p.Rune(0) |
| t := p.String(1) |
| v := string(p.Runes(2)) |
| if t != "T" && v == string(unicode.ToLower(r)) { |
| return |
| } |
| x := table[r] |
| switch t { |
| case "C": |
| x.full = v |
| x.simple = v |
| case "S": |
| x.simple = v |
| case "F": |
| x.full = v |
| case "T": |
| x.special = v |
| } |
| table[r] = x |
| }) |
| fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{") |
| for r := rune(0); r < 0x10FFFF; r++ { |
| x, ok := table[r] |
| if !ok { |
| continue |
| } |
| fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special) |
| } |
| fmt.Fprint(w, "\t}\n\n") |
| |
| // Break property |
| notBreak := map[rune]bool{} |
| parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { |
| switch p.String(1) { |
| case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote", |
| "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ": |
| notBreak[p.Rune(0)] = true |
| } |
| }) |
| |
| fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{") |
| inBreak := false |
| for r := rune(0); r <= lastRuneForTesting; r++ { |
| if isBreak := !notBreak[r]; isBreak != inBreak { |
| if isBreak { |
| fmt.Fprintf(w, "\t\t{0x%x, ", r) |
| } else { |
| fmt.Fprintf(w, "0x%x},\n", r-1) |
| } |
| inBreak = isBreak |
| } |
| } |
| if inBreak { |
| fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting) |
| } |
| fmt.Fprint(w, "\t}\n\n") |
| |
| // Word break test |
| // Filter out all samples that do not contain cased characters. |
| cased := map[rune]bool{} |
| parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { |
| if p.String(1) == "Cased" { |
| cased[p.Rune(0)] = true |
| } |
| }) |
| |
| fmt.Fprintln(w, "\tbreakTest = []string{") |
| parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) { |
| c := strings.Split(p.String(0), " ") |
| |
| const sep = '|' |
| numCased := 0 |
| test := "" |
| for ; len(c) >= 2; c = c[2:] { |
| if c[0] == "รท" && test != "" { |
| test += string(sep) |
| } |
| i, err := strconv.ParseUint(c[1], 16, 32) |
| r := rune(i) |
| if err != nil { |
| log.Fatalf("Invalid rune %q.", c[1]) |
| } |
| if r == sep { |
| log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep) |
| } |
| if cased[r] { |
| numCased++ |
| } |
| test += string(r) |
| } |
| if numCased > 1 { |
| fmt.Fprintf(w, "\t\t%q,\n", test) |
| } |
| }) |
| fmt.Fprintln(w, "\t}") |
| |
| fmt.Fprintln(w, ")") |
| |
| gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes()) |
| } |
| |
| // These functions are just used for verification that their definition have not |
| // changed in the Unicode Standard. |
| |
| func verifyCased(r rune) bool { |
| return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r) |
| } |
| |
| func verifyLower(r rune) bool { |
| return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r) |
| } |
| |
| func verifyUpper(r rune) bool { |
| return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r) |
| } |
| |
| // verifyIgnore is an approximation of the Case_Ignorable property using the |
| // core unicode package. It is used to reduce the size of the test data. |
| func verifyIgnore(r rune) bool { |
| props := []*unicode.RangeTable{ |
| unicode.Mn, |
| unicode.Me, |
| unicode.Cf, |
| unicode.Lm, |
| unicode.Sk, |
| } |
| for _, p := range props { |
| if unicode.Is(p, r) { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // printProperties prints tables of rune properties from the given UCD file. |
| // A filter func f can be given to exclude certain values. A rune r will have |
| // the indicated property if it is in the generated table or if f(r). |
| func printProperties(w io.Writer, file, property string, f func(r rune) bool) int { |
| verify := map[rune]bool{} |
| n := 0 |
| varNameParts := strings.Split(property, "_") |
| varNameParts[0] = strings.ToLower(varNameParts[0]) |
| fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, "")) |
| parse(file, func(p *ucd.Parser) { |
| if p.String(1) == property { |
| r := p.Rune(0) |
| verify[r] = true |
| if !f(r) { |
| n++ |
| fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r) |
| } |
| } |
| }) |
| fmt.Fprint(w, "\t}\n\n") |
| |
| // Verify that f is correct, that is, it represents a subset of the property. |
| for r := rune(0); r <= lastRuneForTesting; r++ { |
| if !verify[r] && f(r) { |
| log.Fatalf("Incorrect filter func for property %q.", property) |
| } |
| } |
| return n |
| } |
| |
| // The newCaseTrie, sparseValues and sparseOffsets definitions below are |
| // placeholders referred to by gen_trieval.go. The real definitions are |
| // generated by this program and written to tables.go. |
| |
| func newCaseTrie(int) int { return 0 } |
| |
| var ( |
| sparseValues [0]valueRange |
| sparseOffsets [0]uint16 |
| ) |