internal/export/unicode: add table generator

This is used to update tables in core unicode.

Change-Id: I6fb34eba45842e38426b1ca54e79b74c361195ec
Reviewed-on: https://go-review.googlesource.com/c/154439
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/gen.go b/gen.go index cf2640e..224eef9 100644 --- a/gen.go +++ b/gen.go
@@ -97,7 +97,8 @@ var unicode = &dependency{} if updateCore { fmt.Printf("Updating core to version %s...\n", gen.UnicodeVersion()) - unicode = generate("unicode") + unicodeInternal := generate("./internal/export/unicode") + unicode = generate("unicode", unicodeInternal) // Test some users of the unicode packages, especially the ones that // keep a mirrored table. These may need to be corrected by hand.
diff --git a/internal/export/unicode/doc.go b/internal/export/unicode/doc.go
new file mode 100644
index 0000000..c49ab6e
--- /dev/null
+++ b/internal/export/unicode/doc.go
@@ -0,0 +1,13 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package unicode generates the Unicode tables in core.
+package unicode
+
+// This package is defined here, instead of core, as Go does not allow any
+// standard packages to have non-standard imports, even if imported in files
+// with a build ignore tag.
+
+//go:generate go run gen.go -tables=all -output tables.go
+//go:generate mv tables.go $GOROOT/src/unicode
diff --git a/internal/export/unicode/maketables.go b/internal/export/unicode/gen.go similarity index 90% rename from internal/export/unicode/maketables.go rename to internal/export/unicode/gen.go index a1f1586..c93e695 100644 --- a/internal/export/unicode/maketables.go +++ b/internal/export/unicode/gen.go
@@ -24,6 +24,8 @@ "strconv" "strings" "unicode" + + "golang.org/x/text/unicode/rangetable" ) func main() { @@ -450,9 +452,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Code generated by maketables; DO NOT EDIT. -// To regenerate, run: -// maketables --tables=%s --data=%s --casefolding=%s +// Code generated by go generate; DO NOT EDIT. package unicode @@ -504,7 +504,7 @@ fullCategoryTest(list) return } - printf(progHeader, *tablelist, *dataURL, *casefoldingURL) + printf(progHeader) println("// Version is the Unicode edition from which the tables are derived.") printf("const Version = %q\n\n", version()) @@ -596,91 +596,38 @@ type Op func(code rune) bool -const format = "\t\t{0x%04x, 0x%04x, %d},\n" - func dumpRange(header string, inCategory Op) { - print(header) - next := rune(0) - latinOffset := 0 - print("\tR16: []Range16{\n") - // one Range for each iteration - count := &range16Count - size := 16 - for { - // look for start of range - for next < rune(len(chars)) && !inCategory(next) { - next++ + runes := []rune{} + for i := range chars { + r := rune(i) + if inCategory(r) { + runes = append(runes, r) } - if next >= rune(len(chars)) { - // no characters remain - break - } - - // start of range - lo := next - hi := next - stride := rune(1) - // accept lo - next++ - // look for another character to set the stride - for next < rune(len(chars)) && !inCategory(next) { - next++ - } - if next >= rune(len(chars)) { - // no more characters - printf(format, lo, hi, stride) - break - } - // set stride - stride = next - lo - // check for length of run. 
next points to first jump in stride - for i := next; i < rune(len(chars)); i++ { - if inCategory(i) == (((i - lo) % stride) == 0) { - // accept - if inCategory(i) { - hi = i - } - } else { - // no more characters in this run - break - } - } - if uint32(hi) <= unicode.MaxLatin1 { - latinOffset++ - } - size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count) - // next range: start looking where this range ends - next = hi + 1 } - print("\t},\n") - if latinOffset > 0 { - printf("\tLatinOffset: %d,\n", latinOffset) - } - print("}\n\n") + printRangeTable(header, runes) } -func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) { - if size == 16 && hi >= 1<<16 { - if lo < 1<<16 { - if lo+stride != hi { - logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride) - } - // No range contains U+FFFF as an instance, so split - // the range into two entries. That way we can maintain - // the invariant that R32 contains only >= 1<<16. - printf(format, lo, lo, 1) - lo = hi - stride = 1 - *count++ - } - print("\t},\n") - print("\tR32: []Range32{\n") - size = 32 - count = &range32Count +func printRangeTable(header string, runes []rune) { + rt := rangetable.New(runes...) + print(header) + println("\tR16: []Range16{") + for _, r := range rt.R16 { + printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride) + range16Count++ } - printf(format, lo, hi, stride) - *count++ - return size, count + println("\t},") + if len(rt.R32) > 0 { + println("\tR32: []Range32{") + for _, r := range rt.R32 { + printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride) + range32Count++ + } + println("\t},") + } + if rt.LatinOffset > 0 { + printf("\tLatinOffset: %d,\n", rt.LatinOffset) + } + printf("}\n\n") } func fullCategoryTest(list []string) { @@ -751,26 +698,6 @@ scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name}) } -// The script tables have a lot of adjacent elements. Fold them together. 
-func foldAdjacent(r []Script) []unicode.Range32 { - s := make([]unicode.Range32, 0, len(r)) - j := 0 - for i := 0; i < len(r); i++ { - if j > 0 && r[i].lo == s[j-1].Hi+1 { - s[j-1].Hi = r[i].hi - } else { - s = s[0 : j+1] - s[j] = unicode.Range32{ - Lo: uint32(r[i].lo), - Hi: uint32(r[i].hi), - Stride: 1, - } - j++ - } - } - return s -} - func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) { for _, name := range list { if _, ok := scripts[name]; !ok { @@ -796,13 +723,11 @@ // PropList.txt has the same format as Scripts.txt so we can share its parser. func printScriptOrProperty(doProps bool) { - flag := "scripts" flaglist := *scriptlist file := "Scripts.txt" table := scripts installed := unicode.Scripts if doProps { - flag = "props" flaglist = *proplist file = "PropList.txt" table = props @@ -831,13 +756,6 @@ return } - printf( - "// Generated by running\n"+ - "// maketables --%s=%s --url=%s\n"+ - "// DO NOT EDIT\n\n", - flag, - flaglist, - *url) if flaglist == "all" { if doProps { println("// Properties is the set of Unicode property tables.") @@ -874,19 +792,14 @@ alias, name) ndecl++ } - printf("var _%s = &RangeTable {\n", name) - ranges := foldAdjacent(table[name]) - print("\tR16: []Range16{\n") - size := 16 - count := &range16Count - for _, s := range ranges { - size, count = printRange(s.Lo, s.Hi, s.Stride, size, count) + decl := fmt.Sprintf("var _%s = &RangeTable {\n", name) + runes := []rune{} + for _, scr := range table[name] { + for r := scr.lo; r <= scr.hi; r++ { + runes = append(runes, rune(r)) + } } - print("\t},\n") - if off := findLatinOffset(ranges); off > 0 { - printf("\tLatinOffset: %d,\n", off) - } - print("}\n\n") + printRangeTable(decl, runes) } decl.Sort() println("// These variables have type *RangeTable.") @@ -897,14 +810,6 @@ print(")\n\n") } -func findLatinOffset(ranges []unicode.Range32) int { - i := 0 - for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 { - i++ - } - return i -} 
- const ( CaseUpper = 1 << iota CaseLower @@ -1054,14 +959,10 @@ return } printf( - "// Generated by running\n"+ - "// maketables --data=%s --casefolding=%s\n"+ - "// DO NOT EDIT\n\n"+ - "// CaseRanges is the table describing case mappings for all letters with\n"+ - "// non-self mappings.\n"+ - "var CaseRanges = _CaseRanges\n"+ - "var _CaseRanges = []CaseRange {\n", - *dataURL, *casefoldingURL) + "// CaseRanges is the table describing case mappings for all letters with\n" + + "// non-self mappings.\n" + + "var CaseRanges = _CaseRanges\n" + + "var _CaseRanges = []CaseRange {\n") var startState *caseState // the start of a run; nil for not active var prevState = &caseState{} // the state of the previous character