internal/export/unicode: simplified generation
by using x/text’s infrastructure
Updates golang/go#27945
Change-Id: I070afd9cad4e367ac9085a66064f2bb47e5e225a
Reviewed-on: https://go-review.googlesource.com/c/text/+/154443
Run-TryBot: Marcel van Lohuizen <mpvl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
diff --git a/internal/export/unicode/doc.go b/internal/export/unicode/doc.go
index c49ab6e..1a31d18 100644
--- a/internal/export/unicode/doc.go
+++ b/internal/export/unicode/doc.go
@@ -9,5 +9,5 @@
// standard packages to have non-standard imports, even if imported in files
// with a build ignore tag.
-//go:generate go run gen.go -tables=all -output tables.go
+//go:generate go run gen.go -tables=all
//go:generate mv tables.go $GOROOT/src/unicode
diff --git a/internal/export/unicode/gen.go b/internal/export/unicode/gen.go
index c93e695..afdc94d 100644
--- a/internal/export/unicode/gen.go
+++ b/internal/export/unicode/gen.go
@@ -10,21 +10,17 @@
package main
import (
- "bufio"
"flag"
"fmt"
- "io"
"log"
- "net/http"
"os"
- "os/exec"
- "path/filepath"
"regexp"
"sort"
- "strconv"
"strings"
"unicode"
+ "golang.org/x/text/internal/gen"
+ "golang.org/x/text/internal/ucd"
"golang.org/x/text/unicode/rangetable"
)
@@ -50,11 +46,6 @@
return unicode.Version
}
-var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
-var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
-var url = flag.String("url",
- "http://www.unicode.org/Public/"+defaultVersion()+"/ucd/",
- "URL of Unicode database directory")
var tablelist = flag.String("tables",
"all",
"comma-separated list of which tables to generate; can be letter")
@@ -70,51 +61,18 @@
var test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data")
-var localFiles = flag.Bool("local",
- false,
- "data files have been copied to current directory; for debugging only")
-var outputFile = flag.String("output",
- "",
- "output file for generated tables; default stdout")
var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
var logger = log.New(os.Stderr, "", log.Lshortfile)
-var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile"
+var output *gen.CodeWriter
func setupOutput() {
- output = bufio.NewWriter(startGofmt())
-}
-
-// startGofmt connects output to a gofmt process if -output is set.
-func startGofmt() io.Writer {
- if *outputFile == "" {
- return os.Stdout
- }
- stdout, err := os.Create(*outputFile)
- if err != nil {
- logger.Fatal(err)
- }
- // Pipe output to gofmt.
- gofmt := exec.Command("gofmt")
- fd, err := gofmt.StdinPipe()
- if err != nil {
- logger.Fatal(err)
- }
- gofmt.Stdout = stdout
- gofmt.Stderr = os.Stderr
- err = gofmt.Start()
- if err != nil {
- logger.Fatal(err)
- }
- return fd
+ output = gen.NewCodeWriter()
}
func flushOutput() {
- err := output.Flush()
- if err != nil {
- logger.Fatal(err)
- }
+ output.WriteGoFile("tables.go", "unicode")
}
func printf(format string, args ...interface{}) {
@@ -129,40 +87,6 @@
fmt.Fprintln(output, args...)
}
-type reader struct {
- *bufio.Reader
- fd *os.File
- resp *http.Response
-}
-
-func open(url string) *reader {
- file := filepath.Base(url)
- if *localFiles {
- fd, err := os.Open(file)
- if err != nil {
- logger.Fatal(err)
- }
- return &reader{bufio.NewReader(fd), fd, nil}
- }
- resp, err := http.Get(url)
- if err != nil {
- logger.Fatal(err)
- }
- if resp.StatusCode != 200 {
- logger.Fatalf("bad GET status for %s: %d", file, resp.Status)
- }
- return &reader{bufio.NewReader(resp.Body), nil, resp}
-
-}
-
-func (r *reader) close() {
- if r.fd != nil {
- r.fd.Close()
- } else {
- r.resp.Body.Close()
- }
-}
-
var category = map[string]bool{
// Nd Lu etc.
// We use one-character names to identify merged categories
@@ -175,54 +99,9 @@
"C": true, // Cc Cf Cs Co Cn
}
-// UnicodeData.txt has form:
-// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
-// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
-// See https://www.unicode.org/reports/tr44/ for a full explanation
-// The fields:
-const (
- FCodePoint = iota
- FName
- FGeneralCategory
- FCanonicalCombiningClass
- FBidiClass
- FDecompositionTypeAndMapping
- FNumericType
- FNumericDigit // If a decimal digit.
- FNumericValue // Includes non-decimal, e.g. U+2155=1/5
- FBidiMirrored
- FUnicode1Name
- FISOComment
- FSimpleUppercaseMapping
- FSimpleLowercaseMapping
- FSimpleTitlecaseMapping
- NumField
-
- MaxChar = 0x10FFFF // anything above this shouldn't exist
-)
-
-var fieldName = []string{
- FCodePoint: "CodePoint",
- FName: "Name",
- FGeneralCategory: "GeneralCategory",
- FCanonicalCombiningClass: "CanonicalCombiningClass",
- FBidiClass: "BidiClass",
- FDecompositionTypeAndMapping: "DecompositionTypeAndMapping",
- FNumericType: "NumericType",
- FNumericDigit: "NumericDigit",
- FNumericValue: "NumericValue",
- FBidiMirrored: "BidiMirrored",
- FUnicode1Name: "Unicode1Name",
- FISOComment: "ISOComment",
- FSimpleUppercaseMapping: "SimpleUppercaseMapping",
- FSimpleLowercaseMapping: "SimpleLowercaseMapping",
- FSimpleTitlecaseMapping: "SimpleTitlecaseMapping",
-}
-
// This contains only the properties we're interested in.
type Char struct {
- field []string // debugging only; could be deleted if we take out char.dump()
- codePoint rune // if zero, this index is not a valid code point.
+ codePoint rune // if zero, this index is not a valid code point.
category string
upperCase rune
lowerCase rune
@@ -231,106 +110,11 @@
caseOrbit rune // next in simple case folding orbit
}
-// Scripts.txt has form:
-// A673 ; Cyrillic # Po SLAVONIC ASTERISK
-// A67C..A67D ; Cyrillic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
-// See https://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
-
-type Script struct {
- lo, hi uint32 // range of code points
- script string
-}
+const MaxChar = 0x10FFFF
var chars = make([]Char, MaxChar+1)
-var scripts = make(map[string][]Script)
-var props = make(map[string][]Script) // a property looks like a script; can share the format
-
-var lastChar rune = 0
-
-// In UnicodeData.txt, some ranges are marked like this:
-// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
-// parseCategory returns a state variable indicating the weirdness.
-type State int
-
-const (
- SNormal State = iota // known to be zero for the type
- SFirst
- SLast
- SMissing
-)
-
-func parseCategory(line string) (state State) {
- field := strings.Split(line, ";")
- if len(field) != NumField {
- logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
- }
- point, err := strconv.ParseUint(field[FCodePoint], 16, 64)
- if err != nil {
- logger.Fatalf("%.5s...: %s", line, err)
- }
- lastChar = rune(point)
- if point > MaxChar {
- return
- }
- char := &chars[point]
- char.field = field
- if char.codePoint != 0 {
- logger.Fatalf("point %U reused", point)
- }
- char.codePoint = lastChar
- char.category = field[FGeneralCategory]
- category[char.category] = true
- switch char.category {
- case "Nd":
- // Decimal digit
- _, err := strconv.Atoi(field[FNumericValue])
- if err != nil {
- logger.Fatalf("%U: bad numeric field: %s", point, err)
- }
- case "Lu":
- char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
- case "Ll":
- char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
- case "Lt":
- char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
- default:
- char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
- }
- switch {
- case strings.Index(field[FName], ", First>") > 0:
- state = SFirst
- case strings.Index(field[FName], ", Last>") > 0:
- state = SLast
- }
- return
-}
-
-func (char *Char) dump(s string) {
- print(s, " ")
- for i := 0; i < len(char.field); i++ {
- printf("%s:%q ", fieldName[i], char.field[i])
- }
- print("\n")
-}
-
-func (char *Char) letter(u, l, t string) {
- char.upperCase = char.letterValue(u, "U")
- char.lowerCase = char.letterValue(l, "L")
- char.titleCase = char.letterValue(t, "T")
-}
-
-func (char *Char) letterValue(s string, cas string) rune {
- if s == "" {
- return 0
- }
- v, err := strconv.ParseUint(s, 16, 64)
- if err != nil {
- char.dump(cas)
- logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err)
- }
- return rune(v)
-}
+var scripts = make(map[string][]rune)
+var props = make(map[string][]rune) // a property looks like a script; can share the format
func allCategories() []string {
a := make([]string, 0, len(category))
@@ -341,7 +125,7 @@
return a
}
-func all(scripts map[string][]Script) []string {
+func all(scripts map[string][]rune) []string {
a := make([]string, 0, len(scripts))
for k := range scripts {
a = append(a, k)
@@ -359,105 +143,63 @@
return a
}
-// Extract the version number from the URL
-func version() string {
- // Break on slashes and look for the first numeric field
- fields := strings.Split(*url, "/")
- for _, f := range fields {
- if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
- return f
- }
- }
- logger.Fatal("unknown version")
- return "Unknown"
-}
-
func categoryOp(code rune, class uint8) bool {
category := chars[code].category
return len(category) > 0 && category[0] == class
}
func loadChars() {
- if *dataURL == "" {
- flag.Set("data", *url+"UnicodeData.txt")
- }
- input := open(*dataURL)
- defer input.close()
- scanner := bufio.NewScanner(input)
- var first rune = 0
- for scanner.Scan() {
- switch parseCategory(scanner.Text()) {
- case SNormal:
- if first != 0 {
- logger.Fatalf("bad state normal at %U", lastChar)
+ ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
+ c := Char{codePoint: p.Rune(0)}
+
+ getRune := func(field int) rune {
+ if p.String(field) == "" {
+ return 0
}
- case SFirst:
- if first != 0 {
- logger.Fatalf("bad state first at %U", lastChar)
- }
- first = lastChar
- case SLast:
- if first == 0 {
- logger.Fatalf("bad state last at %U", lastChar)
- }
- for i := first + 1; i <= lastChar; i++ {
- chars[i] = chars[first]
- chars[i].codePoint = i
- }
- first = 0
+ return p.Rune(field)
}
- }
- if scanner.Err() != nil {
- logger.Fatal(scanner.Err())
- }
+
+ c.category = p.String(ucd.GeneralCategory)
+ category[c.category] = true
+ switch c.category {
+ case "Nd":
+ // Decimal digit
+ p.Int(ucd.NumericValue)
+ case "Lu":
+ c.upperCase = getRune(ucd.CodePoint)
+ c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
+ c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
+ case "Ll":
+ c.upperCase = getRune(ucd.SimpleUppercaseMapping)
+ c.lowerCase = getRune(ucd.CodePoint)
+ c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
+ case "Lt":
+ c.upperCase = getRune(ucd.SimpleUppercaseMapping)
+ c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
+ c.titleCase = getRune(ucd.CodePoint)
+ default:
+ c.upperCase = getRune(ucd.SimpleUppercaseMapping)
+ c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
+ c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
+ }
+
+ chars[c.codePoint] = c
+ })
}
func loadCasefold() {
- if *casefoldingURL == "" {
- flag.Set("casefolding", *url+"CaseFolding.txt")
- }
- input := open(*casefoldingURL)
- defer input.close()
- scanner := bufio.NewScanner(input)
- for scanner.Scan() {
- line := scanner.Text()
- if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 {
- continue
- }
- field := strings.Split(line, "; ")
- if len(field) != 4 {
- logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
- }
- kind := field[1]
+ ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) {
+ kind := p.String(1)
if kind != "C" && kind != "S" {
// Only care about 'common' and 'simple' foldings.
- continue
+ return
}
- p1, err := strconv.ParseUint(field[0], 16, 64)
- if err != nil {
- logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
- }
- p2, err := strconv.ParseUint(field[2], 16, 64)
- if err != nil {
- logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
- }
+ p1 := p.Rune(0)
+ p2 := p.Rune(2)
chars[p1].foldCase = rune(p2)
- }
- if scanner.Err() != nil {
- logger.Fatal(scanner.Err())
- }
+ })
}
-const progHeader = `// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Code generated by go generate; DO NOT EDIT.
-
-package unicode
-
-`
-
var categoryMapping = map[string]string{
"Lu": "Letter, uppercase",
"Ll": "Letter, lowercase",
@@ -504,10 +246,9 @@
fullCategoryTest(list)
return
}
- printf(progHeader)
println("// Version is the Unicode edition from which the tables are derived.")
- printf("const Version = %q\n\n", version())
+ printf("const Version = %q\n\n", gen.UnicodeVersion())
if *tablelist == "all" {
println("// Categories is the set of Unicode category tables.")
@@ -575,14 +316,12 @@
decl[ndecl] = varDecl
ndecl++
if len(name) == 1 { // unified categories
- decl := fmt.Sprintf("var _%s = &RangeTable{\n", name)
dumpRange(
- decl,
+ "_"+name,
func(code rune) bool { return categoryOp(code, name[0]) })
continue
}
- dumpRange(
- fmt.Sprintf("var _%s = &RangeTable{\n", name),
+ dumpRange("_"+name,
func(code rune) bool { return chars[code].category == name })
}
decl.Sort()
@@ -596,7 +335,7 @@
type Op func(code rune) bool
-func dumpRange(header string, inCategory Op) {
+func dumpRange(name string, inCategory Op) {
runes := []rune{}
for i := range chars {
r := rune(i)
@@ -604,12 +343,12 @@
runes = append(runes, r)
}
}
- printRangeTable(header, runes)
+ printRangeTable(name, runes)
}
-func printRangeTable(header string, runes []rune) {
+func printRangeTable(name string, runes []rune) {
rt := rangetable.New(runes...)
- print(header)
+ printf("var %s = &RangeTable{\n", name)
println("\tR16: []Range16{")
for _, r := range rt.R16 {
printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
@@ -666,39 +405,7 @@
}
}
-func parseScript(line string, scripts map[string][]Script) {
- comment := strings.Index(line, "#")
- if comment >= 0 {
- line = line[0:comment]
- }
- line = strings.TrimSpace(line)
- if len(line) == 0 {
- return
- }
- field := strings.Split(line, ";")
- if len(field) != 2 {
- logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
- }
- matches := scriptRe.FindStringSubmatch(line)
- if len(matches) != 4 {
- logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches))
- }
- lo, err := strconv.ParseUint(matches[1], 16, 64)
- if err != nil {
- logger.Fatalf("%.5s...: %s", line, err)
- }
- hi := lo
- if len(matches[2]) > 2 { // ignore leading ..
- hi, err = strconv.ParseUint(matches[2][2:], 16, 64)
- if err != nil {
- logger.Fatalf("%.5s...: %s", line, err)
- }
- }
- name := matches[3]
- scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
-}
-
-func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
+func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) {
for _, name := range list {
if _, ok := scripts[name]; !ok {
logger.Fatal("unknown script", name)
@@ -707,11 +414,9 @@
if !ok {
logger.Fatal("unknown table", name)
}
- for _, script := range scripts[name] {
- for r := script.lo; r <= script.hi; r++ {
- if !unicode.Is(installed[name], rune(r)) {
- fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
- }
+ for _, r := range scripts[name] {
+ if !unicode.Is(installed[name], rune(r)) {
+ fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
}
}
}
@@ -736,16 +441,10 @@
if flaglist == "" {
return
}
- input := open(*url + file)
- scanner := bufio.NewScanner(input)
- for scanner.Scan() {
- parseScript(scanner.Text(), table)
- }
- if scanner.Err() != nil {
- logger.Fatal(scanner.Err())
- }
- input.close()
-
+ ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
+ name := p.String(1)
+ table[name] = append(table[name], p.Rune(0))
+ })
// Find out which scripts to dump
list := strings.Split(flaglist, ",")
if flaglist == "all" {
@@ -792,14 +491,7 @@
alias, name)
ndecl++
}
- decl := fmt.Sprintf("var _%s = &RangeTable {\n", name)
- runes := []rune{}
- for _, scr := range table[name] {
- for r := scr.lo; r <= scr.hi; r++ {
- runes = append(runes, rune(r))
- }
- }
- printRangeTable(decl, runes)
+ printRangeTable("_"+name, table[name])
}
decl.Sort()
println("// These variables have type *RangeTable.")
@@ -951,9 +643,6 @@
}
func printCases() {
- if !*cases {
- return
- }
if *test {
fullCaseTest()
return
@@ -1144,7 +833,7 @@
scr := make(map[string]map[rune]bool)
for name := range scripts {
- if x := foldExceptions(inScript(name)); len(x) > 0 {
+ if x := foldExceptions(scripts[name]); len(x) > 0 {
scr[name] = x
}
}
@@ -1166,17 +855,6 @@
return x
}
-// inScript returns a list of all the runes in the script.
-func inScript(name string) []rune {
- var x []rune
- for _, s := range scripts[name] {
- for c := s.lo; c <= s.hi; c++ {
- x = append(x, rune(c))
- }
- }
- return x
-}
-
// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
func foldExceptions(class []rune) map[rune]bool {
@@ -1329,9 +1007,7 @@
printf("}\n\n")
for _, name := range allCatFold(m) {
class := m[name]
- dumpRange(
- fmt.Sprintf("var fold%s = &RangeTable{\n", name),
- func(code rune) bool { return class[code] })
+ dumpRange("fold"+name, func(code rune) bool { return class[code] })
}
}
diff --git a/internal/gen/code.go b/internal/gen/code.go
index 55dd1ed..75435c9 100644
--- a/internal/gen/code.go
+++ b/internal/gen/code.go
@@ -84,7 +84,9 @@
// writes it as a Go file to the given writer with the given package name.
func (w *CodeWriter) WriteGo(out io.Writer, pkg, tags string) (n int, err error) {
sz := w.Size
- w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
+ if sz > 0 {
+ w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
+ }
defer w.buf.Reset()
return WriteGo(out, pkg, tags, w.buf.Bytes())
}