blob: 9a92a0130a0b8a1c7f06bd6757fe4c50bb6ceba1 [file] [log] [blame]
Rob Pike396b47b2009-08-26 16:01:31 -07001// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
Russ Cox9f333172012-02-14 16:39:20 -05005// +build ignore
6
Rob Pike396b47b2009-08-26 16:01:31 -07007// Unicode table generator.
8// Data read from the web.
9
10package main
11
12import (
Robert Griesemer45ca9f72009-12-15 15:41:46 -080013 "bufio"
14 "flag"
15 "fmt"
Rob Pikedff17f42014-08-25 14:56:35 -070016 "io"
Robert Griesemer45ca9f72009-12-15 15:41:46 -080017 "log"
Rob Pike45e3bcb2011-11-08 15:41:54 -080018 "net/http"
Robert Griesemer45ca9f72009-12-15 15:41:46 -080019 "os"
Rob Pikedff17f42014-08-25 14:56:35 -070020 "os/exec"
Rob Pike5ea413e2011-07-27 15:54:23 -070021 "path/filepath"
Russ Cox965845a2011-11-02 15:54:16 -040022 "regexp"
Robert Griesemer45ca9f72009-12-15 15:41:46 -080023 "sort"
24 "strconv"
25 "strings"
Robert Griesemer45ca9f72009-12-15 15:41:46 -080026 "unicode"
Rob Pike396b47b2009-08-26 16:01:31 -070027)
28
Rob Pike22c2b472009-08-28 23:05:16 -070029func main() {
Robert Griesemer45ca9f72009-12-15 15:41:46 -080030 flag.Parse()
Rob Pikedff17f42014-08-25 14:56:35 -070031 setupOutput()
Robert Griesemer45ca9f72009-12-15 15:41:46 -080032 loadChars() // always needed
Russ Coxfc77e822011-06-16 17:56:25 -040033 loadCasefold()
Robert Griesemer45ca9f72009-12-15 15:41:46 -080034 printCategories()
35 printScriptOrProperty(false)
36 printScriptOrProperty(true)
37 printCases()
Rob Pike8d64e732011-06-04 07:46:22 +100038 printLatinProperties()
Russ Coxfc77e822011-06-16 17:56:25 -040039 printCasefold()
Rob Pike0de328e2011-05-31 09:58:07 +100040 printSizes()
Rob Pikedff17f42014-08-25 14:56:35 -070041 flushOutput()
Rob Pike22c2b472009-08-28 23:05:16 -070042}
43
// defaultVersion returns the Unicode version used to build the default data
// URL: the UNICODE_VERSION environment variable when it is set and
// non-empty, otherwise the version of the linked unicode package.
func defaultVersion() string {
	v := os.Getenv("UNICODE_VERSION")
	if v == "" {
		v = unicode.Version
	}
	return v
}
50
Russ Coxed6eb5b2009-11-08 21:46:20 -080051var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
Russ Coxfc77e822011-06-16 17:56:25 -040052var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
Rob Pike94e69152009-08-27 09:14:32 -070053var url = flag.String("url",
Marcel van Lohuizen3dd96e92017-09-13 17:45:51 +020054 "http://www.unicode.org/Public/"+defaultVersion()+"/ucd/",
Rob Pike8b6274e2009-08-27 17:04:23 -070055 "URL of Unicode database directory")
56var tablelist = flag.String("tables",
Rob Pike94e69152009-08-27 09:14:32 -070057 "all",
Rob Pikea8246512009-11-02 11:37:52 -080058 "comma-separated list of which tables to generate; can be letter")
Rob Pike8b6274e2009-08-27 17:04:23 -070059var scriptlist = flag.String("scripts",
60 "all",
Rob Pikea8246512009-11-02 11:37:52 -080061 "comma-separated list of which script tables to generate")
Rob Pike1e55e4a2009-08-31 16:43:17 -070062var proplist = flag.String("props",
63 "all",
Rob Pikea8246512009-11-02 11:37:52 -080064 "comma-separated list of which property tables to generate")
Rob Pike22c2b472009-08-28 23:05:16 -070065var cases = flag.Bool("cases",
66 true,
Rob Pikea8246512009-11-02 11:37:52 -080067 "generate case tables")
Rob Pike94e69152009-08-27 09:14:32 -070068var test = flag.Bool("test",
69 false,
Rob Pikea8246512009-11-02 11:37:52 -080070 "test existing tables; can be used to compare web data with package data")
Rob Pike5ea413e2011-07-27 15:54:23 -070071var localFiles = flag.Bool("local",
72 false,
73 "data files have been copied to current directory; for debugging only")
Rob Pikedff17f42014-08-25 14:56:35 -070074var outputFile = flag.String("output",
75 "",
76 "output file for generated tables; default stdout")
Rob Pike396b47b2009-08-26 16:01:31 -070077
Russ Cox0f0f34e2011-01-30 16:09:16 -050078var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
79var logger = log.New(os.Stderr, "", log.Lshortfile)
Rob Pike396b47b2009-08-26 16:01:31 -070080
Rob Pikedff17f42014-08-25 14:56:35 -070081var output *bufio.Writer // points to os.Stdout or to "gofmt > outputFile"
82
83func setupOutput() {
84 output = bufio.NewWriter(startGofmt())
85}
86
87// startGofmt connects output to a gofmt process if -output is set.
88func startGofmt() io.Writer {
89 if *outputFile == "" {
90 return os.Stdout
91 }
92 stdout, err := os.Create(*outputFile)
93 if err != nil {
94 logger.Fatal(err)
95 }
96 // Pipe output to gofmt.
97 gofmt := exec.Command("gofmt")
98 fd, err := gofmt.StdinPipe()
99 if err != nil {
100 logger.Fatal(err)
101 }
102 gofmt.Stdout = stdout
103 gofmt.Stderr = os.Stderr
104 err = gofmt.Start()
105 if err != nil {
106 logger.Fatal(err)
107 }
108 return fd
109}
110
111func flushOutput() {
112 err := output.Flush()
113 if err != nil {
114 logger.Fatal(err)
115 }
116}
117
118func printf(format string, args ...interface{}) {
119 fmt.Fprintf(output, format, args...)
120}
121
122func print(args ...interface{}) {
123 fmt.Fprint(output, args...)
124}
125
126func println(args ...interface{}) {
127 fmt.Fprintln(output, args...)
128}
129
// reader is a buffered input stream bundled with the resource it was opened
// from, so that close can release that resource. open fills in exactly one
// of fd and resp.
type reader struct {
	*bufio.Reader                // buffered view of the file or response body
	fd            *os.File       // non-nil when reading a local file
	resp          *http.Response // non-nil when reading over HTTP
}
135
136func open(url string) *reader {
137 file := filepath.Base(url)
138 if *localFiles {
139 fd, err := os.Open(file)
140 if err != nil {
141 logger.Fatal(err)
142 }
143 return &reader{bufio.NewReader(fd), fd, nil}
144 }
Russ Cox92703ff2011-09-26 13:10:16 -0400145 resp, err := http.Get(url)
Rob Pike5ea413e2011-07-27 15:54:23 -0700146 if err != nil {
147 logger.Fatal(err)
148 }
149 if resp.StatusCode != 200 {
150 logger.Fatalf("bad GET status for %s: %d", file, resp.Status)
151 }
152 return &reader{bufio.NewReader(resp.Body), nil, resp}
153
154}
155
156func (r *reader) close() {
157 if r.fd != nil {
158 r.fd.Close()
159 } else {
160 r.resp.Body.Close()
161 }
162}
163
// category is the set of known category names. It starts with the
// one-character names that identify merged categories; parseCategory adds
// the two-character names (Nd, Lu, etc.) as it meets them in the data.
var category = map[string]bool{
	"C": true, // Cc Cf Cs Co Cn
	"L": true, // Lu Ll Lt Lm Lo
	"M": true, // Mn Mc Me
	"N": true, // Nd Nl No
	"P": true, // Pc Pd Ps Pe Pi Pf Po
	"S": true, // Sm Sc Sk So
	"Z": true, // Zs Zl Zp
}
Rob Pike94e69152009-08-27 09:14:32 -0700175
// UnicodeData.txt has form:
//	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
//	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
// See http://www.unicode.org/reports/tr44/ for a full explanation.
// The constants below index the semicolon-separated fields of one line.
const (
	FCodePoint = iota
	FName
	FGeneralCategory
	FCanonicalCombiningClass
	FBidiClass
	FDecompositionTypeAndMapping
	FNumericType
	FNumericDigit // If a decimal digit.
	FNumericValue // Includes non-decimal, e.g. U+2155=1/5
	FBidiMirrored
	FUnicode1Name
	FISOComment
	FSimpleUppercaseMapping
	FSimpleLowercaseMapping
	FSimpleTitlecaseMapping
	NumField // number of fields on a line
)

// MaxChar is the largest code point; anything above this shouldn't exist.
const MaxChar = 0x10FFFF
201
202var fieldName = []string{
Rob Pikea8e5db92011-07-06 15:35:23 +1000203 FCodePoint: "CodePoint",
204 FName: "Name",
205 FGeneralCategory: "GeneralCategory",
206 FCanonicalCombiningClass: "CanonicalCombiningClass",
207 FBidiClass: "BidiClass",
208 FDecompositionTypeAndMapping: "DecompositionTypeAndMapping",
209 FNumericType: "NumericType",
210 FNumericDigit: "NumericDigit",
211 FNumericValue: "NumericValue",
212 FBidiMirrored: "BidiMirrored",
213 FUnicode1Name: "Unicode1Name",
214 FISOComment: "ISOComment",
215 FSimpleUppercaseMapping: "SimpleUppercaseMapping",
216 FSimpleLowercaseMapping: "SimpleLowercaseMapping",
217 FSimpleTitlecaseMapping: "SimpleTitlecaseMapping",
Rob Pike396b47b2009-08-26 16:01:31 -0700218}
219
// Char records, for one code point, only the properties we're interested in.
type Char struct {
	field     []string // raw fields of the data line; debugging only, could be deleted if we take out char.dump()
	codePoint rune     // if zero, this index is not a valid code point.
	category  string   // general category, e.g. "Lu"
	upperCase rune     // simple upper case mapping; zero if none given
	lowerCase rune     // simple lower case mapping; zero if none given
	titleCase rune     // simple title case mapping; zero if none given
	foldCase  rune     // simple case folding
	caseOrbit rune     // next in simple case folding orbit
}
231
// Scripts.txt has form:
//	A673          ; Cyrillic # Po       SLAVONIC ASTERISK
//	A67C..A67D    ; Cyrillic # Mn   [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation

// Script is one entry of that form: a range of code points and the name
// attached to it.
type Script struct {
	lo, hi uint32 // range of code points
	script string // name attached to the range
}
241
Rob Pike22c2b472009-08-28 23:05:16 -0700242var chars = make([]Char, MaxChar+1)
Robert Griesemer841c18a2009-11-04 21:39:55 -0800243var scripts = make(map[string][]Script)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800244var props = make(map[string][]Script) // a property looks like a script; can share the format
Rob Pike396b47b2009-08-26 16:01:31 -0700245
Russ Cox7630a102011-10-25 22:23:15 -0700246var lastChar rune = 0
Rob Pike1e55e4a2009-08-31 16:43:17 -0700247
// In UnicodeData.txt, some ranges are marked like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCategory returns a State value indicating the weirdness.
type State int

// States reported by parseCategory.
const (
	SNormal  State = iota // known to be zero for the type
	SFirst                // line marks the first code point of a range
	SLast                 // line marks the last code point of a range
	SMissing
)
260
261func parseCategory(line string) (state State) {
Rob Pikeebb15662011-06-28 09:43:14 +1000262 field := strings.Split(line, ";")
Rob Pike396b47b2009-08-26 16:01:31 -0700263 if len(field) != NumField {
Rob Pikeeea18d92011-02-01 12:47:35 -0800264 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
Rob Pike396b47b2009-08-26 16:01:31 -0700265 }
Russ Cox2666b812011-12-05 15:48:46 -0500266 point, err := strconv.ParseUint(field[FCodePoint], 16, 64)
Rob Pike396b47b2009-08-26 16:01:31 -0700267 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800268 logger.Fatalf("%.5s...: %s", line, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700269 }
Russ Cox7630a102011-10-25 22:23:15 -0700270 lastChar = rune(point)
Rob Pike22c2b472009-08-28 23:05:16 -0700271 if point > MaxChar {
Robert Griesemer40621d52009-11-09 12:07:39 -0800272 return
Rob Pike396b47b2009-08-26 16:01:31 -0700273 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800274 char := &chars[point]
275 char.field = field
Rob Pike396b47b2009-08-26 16:01:31 -0700276 if char.codePoint != 0 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800277 logger.Fatalf("point %U reused", point)
Rob Pike396b47b2009-08-26 16:01:31 -0700278 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800279 char.codePoint = lastChar
280 char.category = field[FGeneralCategory]
281 category[char.category] = true
Rob Pike396b47b2009-08-26 16:01:31 -0700282 switch char.category {
283 case "Nd":
284 // Decimal digit
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800285 _, err := strconv.Atoi(field[FNumericValue])
Rob Pike396b47b2009-08-26 16:01:31 -0700286 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800287 logger.Fatalf("%U: bad numeric field: %s", point, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700288 }
Rob Pike396b47b2009-08-26 16:01:31 -0700289 case "Lu":
Robert Griesemer40621d52009-11-09 12:07:39 -0800290 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700291 case "Ll":
Robert Griesemer40621d52009-11-09 12:07:39 -0800292 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700293 case "Lt":
Robert Griesemer40621d52009-11-09 12:07:39 -0800294 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
Rob Pike5ea413e2011-07-27 15:54:23 -0700295 default:
Robert Griesemer40621d52009-11-09 12:07:39 -0800296 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700297 }
Rob Pikef59ae062009-08-28 11:57:38 -0700298 switch {
299 case strings.Index(field[FName], ", First>") > 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800300 state = SFirst
Rob Pikef59ae062009-08-28 11:57:38 -0700301 case strings.Index(field[FName], ", Last>") > 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800302 state = SLast
Rob Pikef59ae062009-08-28 11:57:38 -0700303 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800304 return
Rob Pike396b47b2009-08-26 16:01:31 -0700305}
306
307func (char *Char) dump(s string) {
Rob Pikedff17f42014-08-25 14:56:35 -0700308 print(s, " ")
Robert Griesemer841c18a2009-11-04 21:39:55 -0800309 for i := 0; i < len(char.field); i++ {
Rob Pikedff17f42014-08-25 14:56:35 -0700310 printf("%s:%q ", fieldName[i], char.field[i])
Rob Pike396b47b2009-08-26 16:01:31 -0700311 }
Rob Pikedff17f42014-08-25 14:56:35 -0700312 print("\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700313}
314
315func (char *Char) letter(u, l, t string) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800316 char.upperCase = char.letterValue(u, "U")
317 char.lowerCase = char.letterValue(l, "L")
318 char.titleCase = char.letterValue(t, "T")
Rob Pike396b47b2009-08-26 16:01:31 -0700319}
320
Russ Cox7630a102011-10-25 22:23:15 -0700321func (char *Char) letterValue(s string, cas string) rune {
Rob Pike396b47b2009-08-26 16:01:31 -0700322 if s == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800323 return 0
Rob Pike396b47b2009-08-26 16:01:31 -0700324 }
Russ Cox2666b812011-12-05 15:48:46 -0500325 v, err := strconv.ParseUint(s, 16, 64)
Rob Pike396b47b2009-08-26 16:01:31 -0700326 if err != nil {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800327 char.dump(cas)
Rob Pikeeea18d92011-02-01 12:47:35 -0800328 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700329 }
Russ Cox7630a102011-10-25 22:23:15 -0700330 return rune(v)
Rob Pike396b47b2009-08-26 16:01:31 -0700331}
332
Rob Pike94e69152009-08-27 09:14:32 -0700333func allCategories() []string {
Russ Coxb4d6b712011-10-19 16:02:22 -0400334 a := make([]string, 0, len(category))
Rob Pike94e69152009-08-27 09:14:32 -0700335 for k := range category {
Russ Coxb4d6b712011-10-19 16:02:22 -0400336 a = append(a, k)
Rob Pike94e69152009-08-27 09:14:32 -0700337 }
Russ Coxb4d6b712011-10-19 16:02:22 -0400338 sort.Strings(a)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800339 return a
Rob Pike94e69152009-08-27 09:14:32 -0700340}
341
Robert Griesemer841c18a2009-11-04 21:39:55 -0800342func all(scripts map[string][]Script) []string {
Russ Coxb4d6b712011-10-19 16:02:22 -0400343 a := make([]string, 0, len(scripts))
Rob Pike8b6274e2009-08-27 17:04:23 -0700344 for k := range scripts {
Russ Coxb4d6b712011-10-19 16:02:22 -0400345 a = append(a, k)
Rob Pike8b6274e2009-08-27 17:04:23 -0700346 }
Russ Coxb4d6b712011-10-19 16:02:22 -0400347 sort.Strings(a)
348 return a
349}
350
// allCatFold returns the keys of m in sorted order.
func allCatFold(m map[string]map[rune]bool) []string {
	names := make([]string, len(m))
	i := 0
	for name := range m {
		names[i] = name
		i++
	}
	sort.Strings(names)
	return names
}
359
Rob Pike94e69152009-08-27 09:14:32 -0700360// Extract the version number from the URL
361func version() string {
362 // Break on slashes and look for the first numeric field
Rob Pikeebb15662011-06-28 09:43:14 +1000363 fields := strings.Split(*url, "/")
Rob Pike94e69152009-08-27 09:14:32 -0700364 for _, f := range fields {
365 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
Robert Griesemer40621d52009-11-09 12:07:39 -0800366 return f
Rob Pike94e69152009-08-27 09:14:32 -0700367 }
368 }
Rob Pikeeea18d92011-02-01 12:47:35 -0800369 logger.Fatal("unknown version")
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800370 return "Unknown"
Rob Pike94e69152009-08-27 09:14:32 -0700371}
372
Russ Cox7630a102011-10-25 22:23:15 -0700373func categoryOp(code rune, class uint8) bool {
Rob Pike8d64e732011-06-04 07:46:22 +1000374 category := chars[code].category
375 return len(category) > 0 && category[0] == class
Rob Pike94e69152009-08-27 09:14:32 -0700376}
377
Rob Pike22c2b472009-08-28 23:05:16 -0700378func loadChars() {
Russ Coxed6eb5b2009-11-08 21:46:20 -0800379 if *dataURL == "" {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800380 flag.Set("data", *url+"UnicodeData.txt")
Rob Pike8b6274e2009-08-27 17:04:23 -0700381 }
Rob Pike5ea413e2011-07-27 15:54:23 -0700382 input := open(*dataURL)
Rob Pike6f96a762013-02-21 10:47:31 -0800383 defer input.close()
384 scanner := bufio.NewScanner(input)
Russ Cox7630a102011-10-25 22:23:15 -0700385 var first rune = 0
Rob Pike6f96a762013-02-21 10:47:31 -0800386 for scanner.Scan() {
387 switch parseCategory(scanner.Text()) {
Rob Pikef59ae062009-08-28 11:57:38 -0700388 case SNormal:
389 if first != 0 {
Rob Pike0de328e2011-05-31 09:58:07 +1000390 logger.Fatalf("bad state normal at %U", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700391 }
392 case SFirst:
393 if first != 0 {
Rob Pike0de328e2011-05-31 09:58:07 +1000394 logger.Fatalf("bad state first at %U", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700395 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800396 first = lastChar
Rob Pikef59ae062009-08-28 11:57:38 -0700397 case SLast:
398 if first == 0 {
Rob Pike0de328e2011-05-31 09:58:07 +1000399 logger.Fatalf("bad state last at %U", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700400 }
Robert Griesemer3bb00322009-11-09 21:23:52 -0800401 for i := first + 1; i <= lastChar; i++ {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800402 chars[i] = chars[first]
403 chars[i].codePoint = i
Rob Pikef59ae062009-08-28 11:57:38 -0700404 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800405 first = 0
Rob Pikef59ae062009-08-28 11:57:38 -0700406 }
Rob Pike396b47b2009-08-26 16:01:31 -0700407 }
Rob Pike6f96a762013-02-21 10:47:31 -0800408 if scanner.Err() != nil {
409 logger.Fatal(scanner.Err())
410 }
Rob Pike22c2b472009-08-28 23:05:16 -0700411}
412
Russ Coxfc77e822011-06-16 17:56:25 -0400413func loadCasefold() {
414 if *casefoldingURL == "" {
415 flag.Set("casefolding", *url+"CaseFolding.txt")
416 }
Rob Pike5ea413e2011-07-27 15:54:23 -0700417 input := open(*casefoldingURL)
Rob Pike6f96a762013-02-21 10:47:31 -0800418 defer input.close()
419 scanner := bufio.NewScanner(input)
420 for scanner.Scan() {
421 line := scanner.Text()
422 if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 {
Russ Coxfc77e822011-06-16 17:56:25 -0400423 continue
424 }
Rob Pikeebb15662011-06-28 09:43:14 +1000425 field := strings.Split(line, "; ")
Russ Coxfc77e822011-06-16 17:56:25 -0400426 if len(field) != 4 {
427 logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
428 }
429 kind := field[1]
430 if kind != "C" && kind != "S" {
431 // Only care about 'common' and 'simple' foldings.
432 continue
433 }
Russ Cox2666b812011-12-05 15:48:46 -0500434 p1, err := strconv.ParseUint(field[0], 16, 64)
Russ Coxfc77e822011-06-16 17:56:25 -0400435 if err != nil {
436 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
437 }
Russ Cox2666b812011-12-05 15:48:46 -0500438 p2, err := strconv.ParseUint(field[2], 16, 64)
Russ Coxfc77e822011-06-16 17:56:25 -0400439 if err != nil {
440 logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
441 }
Russ Cox7630a102011-10-25 22:23:15 -0700442 chars[p1].foldCase = rune(p2)
Russ Coxfc77e822011-06-16 17:56:25 -0400443 }
Rob Pike6f96a762013-02-21 10:47:31 -0800444 if scanner.Err() != nil {
445 logger.Fatal(scanner.Err())
446 }
Russ Coxfc77e822011-06-16 17:56:25 -0400447}
448
ChaiShushan64379b82013-12-17 06:52:32 -0800449const progHeader = `// Copyright 2013 The Go Authors. All rights reserved.
450// Use of this source code is governed by a BSD-style
451// license that can be found in the LICENSE file.
452
Brad Fitzpatrick6914b0e2017-06-10 20:08:18 -0700453// Code generated by maketables; DO NOT EDIT.
454// To regenerate, run:
Russ Coxfc77e822011-06-16 17:56:25 -0400455// maketables --tables=%s --data=%s --casefolding=%s
Rob Pike0de328e2011-05-31 09:58:07 +1000456
457package unicode
458
459`
460
Rob Pike22c2b472009-08-28 23:05:16 -0700461func printCategories() {
462 if *tablelist == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800463 return
Rob Pike22c2b472009-08-28 23:05:16 -0700464 }
Rob Pike94e69152009-08-27 09:14:32 -0700465 // Find out which categories to dump
Rob Pikeebb15662011-06-28 09:43:14 +1000466 list := strings.Split(*tablelist, ",")
Rob Pike8b6274e2009-08-27 17:04:23 -0700467 if *tablelist == "all" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800468 list = allCategories()
Rob Pike94e69152009-08-27 09:14:32 -0700469 }
470 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800471 fullCategoryTest(list)
472 return
Rob Pike94e69152009-08-27 09:14:32 -0700473 }
Rob Pikedff17f42014-08-25 14:56:35 -0700474 printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
Rob Pike94e69152009-08-27 09:14:32 -0700475
Rob Pikedff17f42014-08-25 14:56:35 -0700476 println("// Version is the Unicode edition from which the tables are derived.")
477 printf("const Version = %q\n\n", version())
Rob Pike94e69152009-08-27 09:14:32 -0700478
Rob Pike8b6274e2009-08-27 17:04:23 -0700479 if *tablelist == "all" {
Rob Pikedff17f42014-08-25 14:56:35 -0700480 println("// Categories is the set of Unicode category tables.")
481 println("var Categories = map[string] *RangeTable {")
Russ Coxb4d6b712011-10-19 16:02:22 -0400482 for _, k := range allCategories() {
Rob Pikedff17f42014-08-25 14:56:35 -0700483 printf("\t%q: %s,\n", k, k)
Rob Pike94e69152009-08-27 09:14:32 -0700484 }
Rob Pikedff17f42014-08-25 14:56:35 -0700485 print("}\n\n")
Rob Pike94e69152009-08-27 09:14:32 -0700486 }
487
Rob Pike4b1170d2011-06-11 09:25:18 +1000488 decl := make(sort.StringSlice, len(list))
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800489 ndecl := 0
Rob Pike94e69152009-08-27 09:14:32 -0700490 for _, name := range list {
491 if _, ok := category[name]; !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800492 logger.Fatal("unknown category", name)
Rob Pike94e69152009-08-27 09:14:32 -0700493 }
494 // We generate an UpperCase name to serve as concise documentation and an _UnderScored
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +0000495 // name to store the data. This stops godoc dumping all the tables but keeps them
Rob Pike94e69152009-08-27 09:14:32 -0700496 // available to clients.
Rob Pike25caf182009-08-27 18:38:02 -0700497 // Cases deserving special comments
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800498 varDecl := ""
Rob Pike25caf182009-08-27 18:38:02 -0700499 switch name {
Rob Pike8d64e732011-06-04 07:46:22 +1000500 case "C":
501 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n"
502 varDecl += "\tC = _C\n"
503 case "L":
504 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n"
505 varDecl += "\tL = _L\n"
506 case "M":
Oling Cata88d8282013-03-19 13:48:07 -0400507 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n"
Rob Pike8d64e732011-06-04 07:46:22 +1000508 varDecl += "\tM = _M\n"
509 case "N":
510 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n"
511 varDecl += "\tN = _N\n"
512 case "P":
513 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n"
514 varDecl += "\tP = _P\n"
515 case "S":
516 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n"
517 varDecl += "\tS = _S\n"
518 case "Z":
519 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n"
520 varDecl += "\tZ = _Z\n"
Rob Pike25caf182009-08-27 18:38:02 -0700521 case "Nd":
Robert Griesemer40621d52009-11-09 12:07:39 -0800522 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700523 case "Lu":
Robert Griesemer40621d52009-11-09 12:07:39 -0800524 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700525 case "Ll":
Robert Griesemer40621d52009-11-09 12:07:39 -0800526 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700527 case "Lt":
Robert Griesemer40621d52009-11-09 12:07:39 -0800528 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700529 }
Rob Pike8d64e732011-06-04 07:46:22 +1000530 if len(name) > 1 {
Rob Pike25caf182009-08-27 18:38:02 -0700531 varDecl += fmt.Sprintf(
532 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800533 name, name, name, name)
Rob Pike25caf182009-08-27 18:38:02 -0700534 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800535 decl[ndecl] = varDecl
536 ndecl++
Rob Pike8d64e732011-06-04 07:46:22 +1000537 if len(name) == 1 { // unified categories
538 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name)
Rob Pike94e69152009-08-27 09:14:32 -0700539 dumpRange(
Rob Pike8d64e732011-06-04 07:46:22 +1000540 decl,
Russ Cox7630a102011-10-25 22:23:15 -0700541 func(code rune) bool { return categoryOp(code, name[0]) })
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800542 continue
Rob Pike94e69152009-08-27 09:14:32 -0700543 }
Rob Pike396b47b2009-08-26 16:01:31 -0700544 dumpRange(
Rob Pike0de328e2011-05-31 09:58:07 +1000545 fmt.Sprintf("var _%s = &RangeTable{\n", name),
Russ Cox7630a102011-10-25 22:23:15 -0700546 func(code rune) bool { return chars[code].category == name })
Rob Pike396b47b2009-08-26 16:01:31 -0700547 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800548 decl.Sort()
Rob Pikedff17f42014-08-25 14:56:35 -0700549 println("// These variables have type *RangeTable.")
550 println("var (")
Rob Pike25caf182009-08-27 18:38:02 -0700551 for _, d := range decl {
Rob Pikedff17f42014-08-25 14:56:35 -0700552 print(d)
Rob Pike25caf182009-08-27 18:38:02 -0700553 }
Rob Pikedff17f42014-08-25 14:56:35 -0700554 print(")\n\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700555}
556
// Op is a predicate over code points; dumpRange and verifyRange use it to
// decide which code points belong to a table.
type Op func(code rune) bool

// format is the layout of one range entry: lo, hi and stride.
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
Rob Pike396b47b2009-08-26 16:01:31 -0700560
Rob Pike25caf182009-08-27 18:38:02 -0700561func dumpRange(header string, inCategory Op) {
Rob Pikedff17f42014-08-25 14:56:35 -0700562 print(header)
Russ Cox7630a102011-10-25 22:23:15 -0700563 next := rune(0)
Russ Cox4591cd62012-09-21 00:35:25 -0400564 latinOffset := 0
Rob Pikedff17f42014-08-25 14:56:35 -0700565 print("\tR16: []Range16{\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700566 // one Range for each iteration
Rob Pike0de328e2011-05-31 09:58:07 +1000567 count := &range16Count
568 size := 16
Rob Pike396b47b2009-08-26 16:01:31 -0700569 for {
570 // look for start of range
Russ Cox7630a102011-10-25 22:23:15 -0700571 for next < rune(len(chars)) && !inCategory(next) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800572 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700573 }
Russ Cox7630a102011-10-25 22:23:15 -0700574 if next >= rune(len(chars)) {
Rob Pike396b47b2009-08-26 16:01:31 -0700575 // no characters remain
Robert Griesemer40621d52009-11-09 12:07:39 -0800576 break
Rob Pike396b47b2009-08-26 16:01:31 -0700577 }
578
579 // start of range
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800580 lo := next
581 hi := next
Russ Cox7630a102011-10-25 22:23:15 -0700582 stride := rune(1)
Rob Pike396b47b2009-08-26 16:01:31 -0700583 // accept lo
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800584 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700585 // look for another character to set the stride
Russ Cox7630a102011-10-25 22:23:15 -0700586 for next < rune(len(chars)) && !inCategory(next) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800587 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700588 }
Russ Cox7630a102011-10-25 22:23:15 -0700589 if next >= rune(len(chars)) {
Rob Pike396b47b2009-08-26 16:01:31 -0700590 // no more characters
Rob Pikedff17f42014-08-25 14:56:35 -0700591 printf(format, lo, hi, stride)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800592 break
Rob Pike396b47b2009-08-26 16:01:31 -0700593 }
594 // set stride
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800595 stride = next - lo
Rob Pike396b47b2009-08-26 16:01:31 -0700596 // check for length of run. next points to first jump in stride
Russ Cox7630a102011-10-25 22:23:15 -0700597 for i := next; i < rune(len(chars)); i++ {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800598 if inCategory(i) == (((i - lo) % stride) == 0) {
Rob Pike396b47b2009-08-26 16:01:31 -0700599 // accept
600 if inCategory(i) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800601 hi = i
Rob Pike396b47b2009-08-26 16:01:31 -0700602 }
603 } else {
604 // no more characters in this run
Robert Griesemer40621d52009-11-09 12:07:39 -0800605 break
Rob Pike396b47b2009-08-26 16:01:31 -0700606 }
607 }
Russ Cox4591cd62012-09-21 00:35:25 -0400608 if uint32(hi) <= unicode.MaxLatin1 {
609 latinOffset++
610 }
Rob Pike9ec0c012011-06-01 09:49:51 +1000611 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
Rob Pike396b47b2009-08-26 16:01:31 -0700612 // next range: start looking where this range ends
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800613 next = hi + 1
Rob Pike396b47b2009-08-26 16:01:31 -0700614 }
Rob Pikedff17f42014-08-25 14:56:35 -0700615 print("\t},\n")
Russ Cox4591cd62012-09-21 00:35:25 -0400616 if latinOffset > 0 {
Rob Pikedff17f42014-08-25 14:56:35 -0700617 printf("\tLatinOffset: %d,\n", latinOffset)
Russ Cox4591cd62012-09-21 00:35:25 -0400618 }
Rob Pikedff17f42014-08-25 14:56:35 -0700619 print("}\n\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700620}
Rob Pike94e69152009-08-27 09:14:32 -0700621
Rob Pike9ec0c012011-06-01 09:49:51 +1000622func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
623 if size == 16 && hi >= 1<<16 {
624 if lo < 1<<16 {
625 if lo+stride != hi {
Rob Pike8d64e732011-06-04 07:46:22 +1000626 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
Rob Pike9ec0c012011-06-01 09:49:51 +1000627 }
628 // No range contains U+FFFF as an instance, so split
629 // the range into two entries. That way we can maintain
630 // the invariant that R32 contains only >= 1<<16.
Rob Pikedff17f42014-08-25 14:56:35 -0700631 printf(format, lo, lo, 1)
Rob Pike9ec0c012011-06-01 09:49:51 +1000632 lo = hi
633 stride = 1
634 *count++
635 }
Rob Pikedff17f42014-08-25 14:56:35 -0700636 print("\t},\n")
637 print("\tR32: []Range32{\n")
Rob Pike9ec0c012011-06-01 09:49:51 +1000638 size = 32
639 count = &range32Count
640 }
Rob Pikedff17f42014-08-25 14:56:35 -0700641 printf(format, lo, hi, stride)
Rob Pike9ec0c012011-06-01 09:49:51 +1000642 *count++
643 return size, count
644}
645
Rob Pike8b6274e2009-08-27 17:04:23 -0700646func fullCategoryTest(list []string) {
Rob Pike94e69152009-08-27 09:14:32 -0700647 for _, name := range list {
648 if _, ok := category[name]; !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800649 logger.Fatal("unknown category", name)
Rob Pike94e69152009-08-27 09:14:32 -0700650 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800651 r, ok := unicode.Categories[name]
Rob Pike8d64e732011-06-04 07:46:22 +1000652 if !ok && len(name) > 1 {
653 logger.Fatalf("unknown table %q", name)
Rob Pike94e69152009-08-27 09:14:32 -0700654 }
Rob Pike8d64e732011-06-04 07:46:22 +1000655 if len(name) == 1 {
Russ Cox7630a102011-10-25 22:23:15 -0700656 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
Rob Pike94e69152009-08-27 09:14:32 -0700657 } else {
658 verifyRange(
659 name,
Russ Cox7630a102011-10-25 22:23:15 -0700660 func(code rune) bool { return chars[code].category == name },
Robert Griesemer40621d52009-11-09 12:07:39 -0800661 r)
Rob Pike94e69152009-08-27 09:14:32 -0700662 }
663 }
664}
665
Rob Pike0de328e2011-05-31 09:58:07 +1000666func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
Rob Pike8d64e732011-06-04 07:46:22 +1000667 count := 0
Russ Cox7630a102011-10-25 22:23:15 -0700668 for j := range chars {
669 i := rune(j)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800670 web := inCategory(i)
671 pkg := unicode.Is(table, i)
Rob Pike94e69152009-08-27 09:14:32 -0700672 if web != pkg {
Rob Pike0de328e2011-05-31 09:58:07 +1000673 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
Rob Pike8d64e732011-06-04 07:46:22 +1000674 count++
675 if count > 10 {
676 break
677 }
Rob Pike94e69152009-08-27 09:14:32 -0700678 }
679 }
680}
Rob Pike8b6274e2009-08-27 17:04:23 -0700681
Robert Griesemer841c18a2009-11-04 21:39:55 -0800682func parseScript(line string, scripts map[string][]Script) {
Marvin Stenger90d71fe2017-10-05 15:49:32 +0200683 comment := strings.Index(line, "#")
Rob Pike8b6274e2009-08-27 17:04:23 -0700684 if comment >= 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800685 line = line[0:comment]
Rob Pike8b6274e2009-08-27 17:04:23 -0700686 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800687 line = strings.TrimSpace(line)
Rob Pike8b6274e2009-08-27 17:04:23 -0700688 if len(line) == 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800689 return
Rob Pike8b6274e2009-08-27 17:04:23 -0700690 }
Rob Pikeebb15662011-06-28 09:43:14 +1000691 field := strings.Split(line, ";")
Rob Pike8b6274e2009-08-27 17:04:23 -0700692 if len(field) != 2 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800693 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
Rob Pike8b6274e2009-08-27 17:04:23 -0700694 }
Russ Cox0f0f34e2011-01-30 16:09:16 -0500695 matches := scriptRe.FindStringSubmatch(line)
Rob Pike8b6274e2009-08-27 17:04:23 -0700696 if len(matches) != 4 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800697 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches))
Rob Pike8b6274e2009-08-27 17:04:23 -0700698 }
Russ Cox2666b812011-12-05 15:48:46 -0500699 lo, err := strconv.ParseUint(matches[1], 16, 64)
Rob Pike8b6274e2009-08-27 17:04:23 -0700700 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800701 logger.Fatalf("%.5s...: %s", line, err)
Rob Pike8b6274e2009-08-27 17:04:23 -0700702 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800703 hi := lo
704 if len(matches[2]) > 2 { // ignore leading ..
Russ Cox2666b812011-12-05 15:48:46 -0500705 hi, err = strconv.ParseUint(matches[2][2:], 16, 64)
Rob Pike8b6274e2009-08-27 17:04:23 -0700706 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800707 logger.Fatalf("%.5s...: %s", line, err)
Rob Pike8b6274e2009-08-27 17:04:23 -0700708 }
709 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800710 name := matches[3]
Russ Cox69c4e932010-10-27 19:47:23 -0700711 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
Rob Pike8b6274e2009-08-27 17:04:23 -0700712}
713
Rob Pike8b6274e2009-08-27 17:04:23 -0700714// The script tables have a lot of adjacent elements. Fold them together.
Rob Pike0de328e2011-05-31 09:58:07 +1000715func foldAdjacent(r []Script) []unicode.Range32 {
716 s := make([]unicode.Range32, 0, len(r))
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800717 j := 0
Rob Pike8b6274e2009-08-27 17:04:23 -0700718 for i := 0; i < len(r); i++ {
Rob Pike0de328e2011-05-31 09:58:07 +1000719 if j > 0 && r[i].lo == s[j-1].Hi+1 {
720 s[j-1].Hi = r[i].hi
Rob Pike8b6274e2009-08-27 17:04:23 -0700721 } else {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800722 s = s[0 : j+1]
Nigel Tao102638c2012-02-03 10:12:25 +1100723 s[j] = unicode.Range32{
724 Lo: uint32(r[i].lo),
725 Hi: uint32(r[i].hi),
726 Stride: 1,
727 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800728 j++
Rob Pike8b6274e2009-08-27 17:04:23 -0700729 }
730 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800731 return s
Rob Pike8b6274e2009-08-27 17:04:23 -0700732}
733
Rob Pike0de328e2011-05-31 09:58:07 +1000734func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
Rob Pike8b6274e2009-08-27 17:04:23 -0700735 for _, name := range list {
736 if _, ok := scripts[name]; !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800737 logger.Fatal("unknown script", name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700738 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800739 _, ok := installed[name]
Rob Pike8b6274e2009-08-27 17:04:23 -0700740 if !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800741 logger.Fatal("unknown table", name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700742 }
743 for _, script := range scripts[name] {
744 for r := script.lo; r <= script.hi; r++ {
Russ Cox7630a102011-10-25 22:23:15 -0700745 if !unicode.Is(installed[name], rune(r)) {
Rob Pike0de328e2011-05-31 09:58:07 +1000746 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700747 }
748 }
749 }
750 }
751}
Rob Pike22c2b472009-08-28 23:05:16 -0700752
// deprecatedAliases maps a table name to an additional name under which
// printScriptOrProperty also emits that table.
var deprecatedAliases = map[string]string{"Sentence_Terminal": "STerm"}
756
Rob Pike1e55e4a2009-08-31 16:43:17 -0700757// PropList.txt has the same format as Scripts.txt so we can share its parser.
758func printScriptOrProperty(doProps bool) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800759 flag := "scripts"
760 flaglist := *scriptlist
761 file := "Scripts.txt"
762 table := scripts
763 installed := unicode.Scripts
Rob Pike1e55e4a2009-08-31 16:43:17 -0700764 if doProps {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800765 flag = "props"
766 flaglist = *proplist
767 file = "PropList.txt"
768 table = props
769 installed = unicode.Properties
Rob Pike1e55e4a2009-08-31 16:43:17 -0700770 }
771 if flaglist == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800772 return
Rob Pike1e55e4a2009-08-31 16:43:17 -0700773 }
Rob Pike5ea413e2011-07-27 15:54:23 -0700774 input := open(*url + file)
Rob Pike6f96a762013-02-21 10:47:31 -0800775 scanner := bufio.NewScanner(input)
776 for scanner.Scan() {
777 parseScript(scanner.Text(), table)
778 }
779 if scanner.Err() != nil {
780 logger.Fatal(scanner.Err())
Rob Pike1e55e4a2009-08-31 16:43:17 -0700781 }
Rob Pike5ea413e2011-07-27 15:54:23 -0700782 input.close()
Rob Pike1e55e4a2009-08-31 16:43:17 -0700783
784 // Find out which scripts to dump
Rob Pikeebb15662011-06-28 09:43:14 +1000785 list := strings.Split(flaglist, ",")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700786 if flaglist == "all" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800787 list = all(table)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700788 }
789 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800790 fullScriptTest(list, installed, table)
791 return
Rob Pike1e55e4a2009-08-31 16:43:17 -0700792 }
793
Rob Pikedff17f42014-08-25 14:56:35 -0700794 printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -0800795 "// Generated by running\n"+
796 "// maketables --%s=%s --url=%s\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -0800797 "// DO NOT EDIT\n\n",
Rob Pike1e55e4a2009-08-31 16:43:17 -0700798 flag,
799 flaglist,
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800800 *url)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700801 if flaglist == "all" {
802 if doProps {
Rob Pikedff17f42014-08-25 14:56:35 -0700803 println("// Properties is the set of Unicode property tables.")
804 println("var Properties = map[string] *RangeTable{")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700805 } else {
Rob Pikedff17f42014-08-25 14:56:35 -0700806 println("// Scripts is the set of Unicode script tables.")
807 println("var Scripts = map[string] *RangeTable{")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700808 }
Russ Coxb4d6b712011-10-19 16:02:22 -0400809 for _, k := range all(table) {
Rob Pikedff17f42014-08-25 14:56:35 -0700810 printf("\t%q: %s,\n", k, k)
Marcel van Lohuizena2a4db72016-06-28 12:31:02 +0200811 if alias, ok := deprecatedAliases[k]; ok {
812 printf("\t%q: %s,\n", alias, k)
813 }
Rob Pike1e55e4a2009-08-31 16:43:17 -0700814 }
Rob Pikedff17f42014-08-25 14:56:35 -0700815 print("}\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700816 }
817
Marcel van Lohuizena2a4db72016-06-28 12:31:02 +0200818 decl := make(sort.StringSlice, len(list)+len(deprecatedAliases))
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800819 ndecl := 0
Rob Pike1e55e4a2009-08-31 16:43:17 -0700820 for _, name := range list {
821 if doProps {
822 decl[ndecl] = fmt.Sprintf(
823 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800824 name, name, name, name)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700825 } else {
826 decl[ndecl] = fmt.Sprintf(
827 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800828 name, name, name, name)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700829 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800830 ndecl++
Marcel van Lohuizena2a4db72016-06-28 12:31:02 +0200831 if alias, ok := deprecatedAliases[name]; ok {
832 decl[ndecl] = fmt.Sprintf(
833 "\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
834 alias, name)
835 ndecl++
836 }
Rob Pikedff17f42014-08-25 14:56:35 -0700837 printf("var _%s = &RangeTable {\n", name)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800838 ranges := foldAdjacent(table[name])
Rob Pikedff17f42014-08-25 14:56:35 -0700839 print("\tR16: []Range16{\n")
Rob Pike0de328e2011-05-31 09:58:07 +1000840 size := 16
841 count := &range16Count
Rob Pike1e55e4a2009-08-31 16:43:17 -0700842 for _, s := range ranges {
Rob Pike9ec0c012011-06-01 09:49:51 +1000843 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700844 }
Rob Pikedff17f42014-08-25 14:56:35 -0700845 print("\t},\n")
Russ Cox4591cd62012-09-21 00:35:25 -0400846 if off := findLatinOffset(ranges); off > 0 {
Rob Pikedff17f42014-08-25 14:56:35 -0700847 printf("\tLatinOffset: %d,\n", off)
Russ Cox4591cd62012-09-21 00:35:25 -0400848 }
Rob Pikedff17f42014-08-25 14:56:35 -0700849 print("}\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700850 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800851 decl.Sort()
Rob Pikedff17f42014-08-25 14:56:35 -0700852 println("// These variables have type *RangeTable.")
853 println("var (")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700854 for _, d := range decl {
Rob Pikedff17f42014-08-25 14:56:35 -0700855 print(d)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700856 }
Rob Pikedff17f42014-08-25 14:56:35 -0700857 print(")\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700858}
859
// findLatinOffset returns the number of leading entries of ranges whose Hi
// does not exceed unicode.MaxLatin1. Counting stops at the first entry that
// reaches beyond Latin-1, whatever follows it.
func findLatinOffset(ranges []unicode.Range32) int {
	for n, r := range ranges {
		if r.Hi > unicode.MaxLatin1 {
			return n
		}
	}
	return len(ranges)
}
867
// Case classifications stored in caseState._case. The three real cases are
// distinct bits; CaseNone and CaseMissing are the two non-case values.
const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
	CaseNone    = 0  // must be zero
	CaseMissing = -1 // character not present; not a valid case state
)

// caseState describes the case behaviour of one code point: its
// classification and the distances to its upper, lower and title forms.
type caseState struct {
	point        rune
	_case        int
	deltaToUpper rune
	deltaToLower rune
	deltaToTitle rune
}

// Is d a continuation of the state of c?
//
// The two states are considered in code point order. They continue each
// other when their points are consecutive and either they form a matched
// upper/lower pair (see upperLowerAdjacent) or they share the same real
// case and all three deltas.
func (c *caseState) adjacent(d *caseState) bool {
	first, second := c, d
	if second.point < first.point {
		first, second = second, first
	}
	if second.point != first.point+1 {
		// code points not adjacent (shouldn't happen)
		return false
	}
	if second._case != first._case {
		// different cases
		return first.upperLowerAdjacent(second)
	}
	if first._case == CaseNone || first._case == CaseMissing {
		return false
	}
	return second.deltaToUpper == first.deltaToUpper &&
		second.deltaToLower == first.deltaToLower &&
		second.deltaToTitle == first.deltaToTitle
}

// Is d the same as c, but opposite in upper/lower case? this would make it
// an element of an UpperLower sequence.
func (c *caseState) upperLowerAdjacent(d *caseState) bool {
	// check they're a matched case pair. we know they have adjacent values
	if c._case == CaseUpper && d._case != CaseLower {
		return false
	}
	if c._case == CaseLower && d._case != CaseUpper {
		return false
	}
	// matched pair (at least in upper/lower). make the order Upper Lower
	upper, lower := c, d
	if c._case == CaseLower {
		upper, lower = d, c
	}
	// for an Upper Lower sequence the deltas have to be in order
	//	upper:  0  1  0
	//	lower: -1  0 -1
	return upper.isUpperLower() && lower.isLowerUpper()
}

// Does this character start an UpperLower sequence?
func (c *caseState) isUpperLower() bool {
	// for an Upper Lower sequence the deltas have to be in order
	//	c: 0 1 0
	return c.deltaToUpper == 0 && c.deltaToLower == 1 && c.deltaToTitle == 0
}

// Does this character start a LowerUpper sequence?
func (c *caseState) isLowerUpper() bool {
	// for a Lower Upper sequence the deltas have to be in order
	//	c: -1 0 -1
	return c.deltaToUpper == -1 && c.deltaToLower == 0 && c.deltaToTitle == -1
}
971
Russ Cox7630a102011-10-25 22:23:15 -0700972func getCaseState(i rune) (c *caseState) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800973 c = &caseState{point: i, _case: CaseNone}
974 ch := &chars[i]
Russ Cox7630a102011-10-25 22:23:15 -0700975 switch ch.codePoint {
Rob Pike22c2b472009-08-28 23:05:16 -0700976 case 0:
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800977 c._case = CaseMissing // Will get NUL wrong but that doesn't matter
978 return
Rob Pike22c2b472009-08-28 23:05:16 -0700979 case ch.upperCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800980 c._case = CaseUpper
Rob Pike22c2b472009-08-28 23:05:16 -0700981 case ch.lowerCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800982 c._case = CaseLower
Rob Pike22c2b472009-08-28 23:05:16 -0700983 case ch.titleCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800984 c._case = CaseTitle
Rob Pike22c2b472009-08-28 23:05:16 -0700985 }
Rob Pike5ea413e2011-07-27 15:54:23 -0700986 // Some things such as roman numeral U+2161 don't describe themselves
Brad Fitzpatrick5fea2cc2016-03-01 23:21:55 +0000987 // as upper case, but have a lower case. Second-guess them.
Rob Pike5ea413e2011-07-27 15:54:23 -0700988 if c._case == CaseNone && ch.lowerCase != 0 {
989 c._case = CaseUpper
990 }
991 // Same in the other direction.
992 if c._case == CaseNone && ch.upperCase != 0 {
993 c._case = CaseLower
994 }
995
Rob Pike22c2b472009-08-28 23:05:16 -0700996 if ch.upperCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800997 c.deltaToUpper = ch.upperCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700998 }
999 if ch.lowerCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -08001000 c.deltaToLower = ch.lowerCase - i
Rob Pike22c2b472009-08-28 23:05:16 -07001001 }
1002 if ch.titleCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -08001003 c.deltaToTitle = ch.titleCase - i
Rob Pike22c2b472009-08-28 23:05:16 -07001004 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001005 return
Rob Pike22c2b472009-08-28 23:05:16 -07001006}
1007
1008func printCases() {
1009 if !*cases {
Robert Griesemer40621d52009-11-09 12:07:39 -08001010 return
Rob Pike22c2b472009-08-28 23:05:16 -07001011 }
1012 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001013 fullCaseTest()
1014 return
Rob Pike22c2b472009-08-28 23:05:16 -07001015 }
Rob Pikedff17f42014-08-25 14:56:35 -07001016 printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -08001017 "// Generated by running\n"+
Russ Coxfc77e822011-06-16 17:56:25 -04001018 "// maketables --data=%s --casefolding=%s\n"+
Robert Griesemer295ceb42009-12-09 16:54:07 -08001019 "// DO NOT EDIT\n\n"+
1020 "// CaseRanges is the table describing case mappings for all letters with\n"+
1021 "// non-self mappings.\n"+
1022 "var CaseRanges = _CaseRanges\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -08001023 "var _CaseRanges = []CaseRange {\n",
Russ Coxfc77e822011-06-16 17:56:25 -04001024 *dataURL, *casefoldingURL)
Rob Pike22c2b472009-08-28 23:05:16 -07001025
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001026 var startState *caseState // the start of a run; nil for not active
1027 var prevState = &caseState{} // the state of the previous character
Rob Pikea8246512009-11-02 11:37:52 -08001028 for i := range chars {
Russ Cox7630a102011-10-25 22:23:15 -07001029 state := getCaseState(rune(i))
Rob Pike22c2b472009-08-28 23:05:16 -07001030 if state.adjacent(prevState) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001031 prevState = state
1032 continue
Rob Pike22c2b472009-08-28 23:05:16 -07001033 }
1034 // end of run (possibly)
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001035 printCaseRange(startState, prevState)
1036 startState = nil
Rob Pike22c2b472009-08-28 23:05:16 -07001037 if state._case != CaseMissing && state._case != CaseNone {
Robert Griesemer40621d52009-11-09 12:07:39 -08001038 startState = state
Rob Pike22c2b472009-08-28 23:05:16 -07001039 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001040 prevState = state
Rob Pike22c2b472009-08-28 23:05:16 -07001041 }
Rob Pikedff17f42014-08-25 14:56:35 -07001042 print("}\n")
Rob Pike22c2b472009-08-28 23:05:16 -07001043}
1044
1045func printCaseRange(lo, hi *caseState) {
1046 if lo == nil {
Robert Griesemer40621d52009-11-09 12:07:39 -08001047 return
Rob Pike22c2b472009-08-28 23:05:16 -07001048 }
1049 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
1050 // character represents itself in all cases - no need to mention it
Robert Griesemer40621d52009-11-09 12:07:39 -08001051 return
Rob Pike22c2b472009-08-28 23:05:16 -07001052 }
Rob Pike3c098e22009-08-30 14:02:42 -07001053 switch {
1054 case hi.point > lo.point && lo.isUpperLower():
Rob Pikedff17f42014-08-25 14:56:35 -07001055 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
Robert Griesemer40621d52009-11-09 12:07:39 -08001056 lo.point, hi.point)
Rob Pike3c098e22009-08-30 14:02:42 -07001057 case hi.point > lo.point && lo.isLowerUpper():
Rob Pike0de328e2011-05-31 09:58:07 +10001058 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
Rob Pikedff17f42014-08-25 14:56:35 -07001059 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001060 lo.point, hi.point)
Rob Pike3c098e22009-08-30 14:02:42 -07001061 default:
Rob Pikedff17f42014-08-25 14:56:35 -07001062 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
Rob Pike3c098e22009-08-30 14:02:42 -07001063 lo.point, hi.point,
Robert Griesemer40621d52009-11-09 12:07:39 -08001064 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
Rob Pike3c098e22009-08-30 14:02:42 -07001065 }
Rob Pike22c2b472009-08-28 23:05:16 -07001066}
1067
// If the cased value in the Char is 0, it means use the rune itself.
//
// caseIt returns cased unless it is zero, in which case it returns r.
func caseIt(r, cased rune) rune {
	if cased != 0 {
		return cased
	}
	return r
}
1075
1076func fullCaseTest() {
Russ Cox7630a102011-10-25 22:23:15 -07001077 for j, c := range chars {
1078 i := rune(j)
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001079 lower := unicode.ToLower(i)
1080 want := caseIt(i, c.lowerCase)
Rob Pike22c2b472009-08-28 23:05:16 -07001081 if lower != want {
Rob Pike0de328e2011-05-31 09:58:07 +10001082 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
Rob Pike22c2b472009-08-28 23:05:16 -07001083 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001084 upper := unicode.ToUpper(i)
1085 want = caseIt(i, c.upperCase)
Rob Pike22c2b472009-08-28 23:05:16 -07001086 if upper != want {
Rob Pike0de328e2011-05-31 09:58:07 +10001087 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
Rob Pike22c2b472009-08-28 23:05:16 -07001088 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -08001089 title := unicode.ToTitle(i)
1090 want = caseIt(i, c.titleCase)
Rob Pike22c2b472009-08-28 23:05:16 -07001091 if title != want {
Rob Pike0de328e2011-05-31 09:58:07 +10001092 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
Rob Pike22c2b472009-08-28 23:05:16 -07001093 }
1094 }
1095}
Rob Pike0de328e2011-05-31 09:58:07 +10001096
Rob Pike8d64e732011-06-04 07:46:22 +10001097func printLatinProperties() {
1098 if *test {
1099 return
1100 }
Rob Pikedff17f42014-08-25 14:56:35 -07001101 println("var properties = [MaxLatin1+1]uint8{")
Rob Pike7a922872011-06-04 09:28:27 +10001102 for code := 0; code <= unicode.MaxLatin1; code++ {
Rob Pike8d64e732011-06-04 07:46:22 +10001103 var property string
1104 switch chars[code].category {
1105 case "Cc", "": // NUL has no category.
1106 property = "pC"
1107 case "Cf": // soft hyphen, unique category, not printable.
1108 property = "0"
1109 case "Ll":
1110 property = "pLl | pp"
Marcel van Lohuizene14cf902012-10-31 17:32:16 +01001111 case "Lo":
1112 property = "pLo | pp"
Rob Pike8d64e732011-06-04 07:46:22 +10001113 case "Lu":
1114 property = "pLu | pp"
1115 case "Nd", "No":
1116 property = "pN | pp"
1117 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
1118 property = "pP | pp"
1119 case "Sc", "Sk", "Sm", "So":
1120 property = "pS | pp"
1121 case "Zs":
1122 property = "pZ"
1123 default:
1124 logger.Fatalf("%U has unknown category %q", code, chars[code].category)
1125 }
1126 // Special case
1127 if code == ' ' {
1128 property = "pZ | pp"
1129 }
Rob Pikedff17f42014-08-25 14:56:35 -07001130 printf("\t0x%02X: %s, // %q\n", code, property, code)
Rob Pike8d64e732011-06-04 07:46:22 +10001131 }
Rob Pikedff17f42014-08-25 14:56:35 -07001132 printf("}\n\n")
Rob Pike8d64e732011-06-04 07:46:22 +10001133}
1134
Russ Coxfc77e822011-06-16 17:56:25 -04001135func printCasefold() {
1136 // Build list of case-folding groups attached to each canonical folded char (typically lower case).
Russ Cox7630a102011-10-25 22:23:15 -07001137 var caseOrbit = make([][]rune, MaxChar+1)
1138 for j := range chars {
1139 i := rune(j)
Russ Coxfc77e822011-06-16 17:56:25 -04001140 c := &chars[i]
1141 if c.foldCase == 0 {
1142 continue
1143 }
1144 orb := caseOrbit[c.foldCase]
1145 if orb == nil {
1146 orb = append(orb, c.foldCase)
1147 }
1148 caseOrbit[c.foldCase] = append(orb, i)
1149 }
1150
1151 // Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
Russ Cox7630a102011-10-25 22:23:15 -07001152 for j := range chars {
1153 i := rune(j)
Russ Coxfc77e822011-06-16 17:56:25 -04001154 c := &chars[i]
1155 f := c.foldCase
1156 if f == 0 {
1157 f = i
1158 }
1159 orb := caseOrbit[f]
1160 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
1161 // Default assumption of [upper, lower] is wrong.
Russ Cox7630a102011-10-25 22:23:15 -07001162 caseOrbit[i] = []rune{i}
Russ Coxfc77e822011-06-16 17:56:25 -04001163 }
1164 }
1165
Marcel van Lohuizenfe15da62015-06-21 20:21:04 +02001166 // Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
Russ Coxfc77e822011-06-16 17:56:25 -04001167 for i, orb := range caseOrbit {
1168 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
1169 caseOrbit[i] = nil
1170 }
Marcel van Lohuizenfe15da62015-06-21 20:21:04 +02001171 if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
1172 caseOrbit[i] = nil
1173 }
Russ Coxfc77e822011-06-16 17:56:25 -04001174 }
1175
1176 // Record orbit information in chars.
1177 for _, orb := range caseOrbit {
1178 if orb == nil {
1179 continue
1180 }
Daniel Martí57e7d622017-09-12 17:09:46 +02001181 sort.Slice(orb, func(i, j int) bool {
1182 return orb[i] < orb[j]
1183 })
Russ Coxfc77e822011-06-16 17:56:25 -04001184 c := orb[len(orb)-1]
1185 for _, d := range orb {
1186 chars[c].caseOrbit = d
1187 c = d
1188 }
1189 }
1190
Egon Elbree607abb2015-11-17 16:51:23 +02001191 printAsciiFold()
Russ Coxfc77e822011-06-16 17:56:25 -04001192 printCaseOrbit()
1193
1194 // Tables of category and script folding exceptions: code points
1195 // that must be added when interpreting a particular category/script
1196 // in a case-folding context.
Russ Cox7630a102011-10-25 22:23:15 -07001197 cat := make(map[string]map[rune]bool)
Russ Coxfc77e822011-06-16 17:56:25 -04001198 for name := range category {
1199 if x := foldExceptions(inCategory(name)); len(x) > 0 {
1200 cat[name] = x
1201 }
1202 }
1203
Russ Cox7630a102011-10-25 22:23:15 -07001204 scr := make(map[string]map[rune]bool)
Russ Coxfc77e822011-06-16 17:56:25 -04001205 for name := range scripts {
1206 if x := foldExceptions(inScript(name)); len(x) > 0 {
Russ Cox82abd412017-06-14 20:29:42 -04001207 scr[name] = x
Russ Coxfc77e822011-06-16 17:56:25 -04001208 }
1209 }
1210
1211 printCatFold("FoldCategory", cat)
1212 printCatFold("FoldScript", scr)
1213}
1214
1215// inCategory returns a list of all the runes in the category.
Russ Cox7630a102011-10-25 22:23:15 -07001216func inCategory(name string) []rune {
1217 var x []rune
1218 for j := range chars {
1219 i := rune(j)
Russ Coxfc77e822011-06-16 17:56:25 -04001220 c := &chars[i]
1221 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
1222 x = append(x, i)
1223 }
1224 }
1225 return x
1226}
1227
1228// inScript returns a list of all the runes in the script.
Russ Cox7630a102011-10-25 22:23:15 -07001229func inScript(name string) []rune {
1230 var x []rune
Russ Coxfc77e822011-06-16 17:56:25 -04001231 for _, s := range scripts[name] {
1232 for c := s.lo; c <= s.hi; c++ {
Russ Cox7630a102011-10-25 22:23:15 -07001233 x = append(x, rune(c))
Russ Coxfc77e822011-06-16 17:56:25 -04001234 }
1235 }
1236 return x
1237}
1238
1239// foldExceptions returns a list of all the runes fold-equivalent
1240// to runes in class but not in class themselves.
Russ Cox7630a102011-10-25 22:23:15 -07001241func foldExceptions(class []rune) map[rune]bool {
Russ Coxfc77e822011-06-16 17:56:25 -04001242 // Create map containing class and all fold-equivalent chars.
Russ Cox7630a102011-10-25 22:23:15 -07001243 m := make(map[rune]bool)
Russ Coxfc77e822011-06-16 17:56:25 -04001244 for _, r := range class {
1245 c := &chars[r]
1246 if c.caseOrbit == 0 {
1247 // Just upper and lower.
1248 if u := c.upperCase; u != 0 {
1249 m[u] = true
1250 }
1251 if l := c.lowerCase; l != 0 {
1252 m[l] = true
1253 }
1254 m[r] = true
1255 continue
1256 }
1257 // Otherwise walk orbit.
1258 r0 := r
1259 for {
1260 m[r] = true
1261 r = chars[r].caseOrbit
1262 if r == r0 {
1263 break
1264 }
1265 }
1266 }
1267
1268 // Remove class itself.
1269 for _, r := range class {
Russ Cox313c8222011-10-18 09:56:34 -04001270 delete(m, r)
Russ Coxfc77e822011-06-16 17:56:25 -04001271 }
1272
1273 // What's left is the exceptions.
1274 return m
1275}
1276
// comment holds the comment text associated with each fold table, keyed
// by the table's name.
var comment = map[string]string{
	"FoldCategory": `// FoldCategory maps a category name to a table of
// code points outside the category that are equivalent under
// simple case folding to code points inside the category.
// If there is no entry for a category name, there are no such points.
`,

	"FoldScript": `// FoldScript maps a script name to a table of
// code points outside the script that are equivalent under
// simple case folding to code points inside the script.
// If there is no entry for a script name, there are no such points.
`,
}
1288
Egon Elbree607abb2015-11-17 16:51:23 +02001289func printAsciiFold() {
1290 printf("var asciiFold = [MaxASCII + 1]uint16{\n")
1291 for i := rune(0); i <= unicode.MaxASCII; i++ {
1292 c := chars[i]
1293 f := c.caseOrbit
1294 if f == 0 {
1295 if c.lowerCase != i && c.lowerCase != 0 {
1296 f = c.lowerCase
1297 } else if c.upperCase != i && c.upperCase != 0 {
1298 f = c.upperCase
1299 } else {
1300 f = i
1301 }
1302 }
1303 printf("\t0x%04X,\n", f)
1304 }
1305 printf("}\n\n")
1306}
1307
Russ Coxfc77e822011-06-16 17:56:25 -04001308func printCaseOrbit() {
1309 if *test {
Russ Cox7630a102011-10-25 22:23:15 -07001310 for j := range chars {
1311 i := rune(j)
Russ Coxfc77e822011-06-16 17:56:25 -04001312 c := &chars[i]
1313 f := c.caseOrbit
1314 if f == 0 {
1315 if c.lowerCase != i && c.lowerCase != 0 {
1316 f = c.lowerCase
1317 } else if c.upperCase != i && c.upperCase != 0 {
1318 f = c.upperCase
1319 } else {
1320 f = i
1321 }
1322 }
1323 if g := unicode.SimpleFold(i); g != f {
1324 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
1325 }
1326 }
1327 return
1328 }
1329
Rob Pikedff17f42014-08-25 14:56:35 -07001330 printf("var caseOrbit = []foldPair{\n")
Russ Coxfc77e822011-06-16 17:56:25 -04001331 for i := range chars {
1332 c := &chars[i]
1333 if c.caseOrbit != 0 {
Rob Pikedff17f42014-08-25 14:56:35 -07001334 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
Russ Coxfc77e822011-06-16 17:56:25 -04001335 foldPairCount++
1336 }
1337 }
Rob Pikedff17f42014-08-25 14:56:35 -07001338 printf("}\n\n")
Russ Coxfc77e822011-06-16 17:56:25 -04001339}
1340
Russ Cox7630a102011-10-25 22:23:15 -07001341func printCatFold(name string, m map[string]map[rune]bool) {
Russ Coxfc77e822011-06-16 17:56:25 -04001342 if *test {
1343 var pkgMap map[string]*unicode.RangeTable
1344 if name == "FoldCategory" {
1345 pkgMap = unicode.FoldCategory
1346 } else {
1347 pkgMap = unicode.FoldScript
1348 }
1349 if len(pkgMap) != len(m) {
1350 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
1351 return
1352 }
1353 for k, v := range m {
1354 t, ok := pkgMap[k]
1355 if !ok {
1356 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
1357 continue
1358 }
1359 n := 0
1360 for _, r := range t.R16 {
Russ Cox7630a102011-10-25 22:23:15 -07001361 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
Russ Coxfc77e822011-06-16 17:56:25 -04001362 if !v[c] {
1363 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
1364 }
1365 n++
1366 }
1367 }
1368 for _, r := range t.R32 {
Russ Cox7630a102011-10-25 22:23:15 -07001369 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
Russ Coxfc77e822011-06-16 17:56:25 -04001370 if !v[c] {
1371 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
1372 }
1373 n++
1374 }
1375 }
1376 if n != len(v) {
1377 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
1378 }
1379 }
1380 return
1381 }
1382
Rob Pikedff17f42014-08-25 14:56:35 -07001383 print(comment[name])
1384 printf("var %s = map[string]*RangeTable{\n", name)
Russ Coxb4d6b712011-10-19 16:02:22 -04001385 for _, name := range allCatFold(m) {
Rob Pikedff17f42014-08-25 14:56:35 -07001386 printf("\t%q: fold%s,\n", name, name)
Russ Coxfc77e822011-06-16 17:56:25 -04001387 }
Rob Pikedff17f42014-08-25 14:56:35 -07001388 printf("}\n\n")
Russ Coxb4d6b712011-10-19 16:02:22 -04001389 for _, name := range allCatFold(m) {
1390 class := m[name]
Russ Coxfc77e822011-06-16 17:56:25 -04001391 dumpRange(
1392 fmt.Sprintf("var fold%s = &RangeTable{\n", name),
Russ Cox7630a102011-10-25 22:23:15 -07001393 func(code rune) bool { return class[code] })
Russ Coxfc77e822011-06-16 17:56:25 -04001394 }
1395}
1396
// Counters reported by printSizes. All start at zero.
var (
	range16Count  int // Number of entries in the 16-bit range tables.
	range32Count  int // Number of entries in the 32-bit range tables.
	foldPairCount int // Number of fold pairs in the exception tables.
)
Rob Pike0de328e2011-05-31 09:58:07 +10001400
1401func printSizes() {
Rob Pike9ec0c012011-06-01 09:49:51 +10001402 if *test {
1403 return
1404 }
Rob Pikedff17f42014-08-25 14:56:35 -07001405 println()
1406 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
Rob Pike0de328e2011-05-31 09:58:07 +10001407 range16Bytes := range16Count * 3 * 2
1408 range32Bytes := range32Count * 3 * 4
Rob Pikedff17f42014-08-25 14:56:35 -07001409 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
1410 println()
1411 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
Rob Pike0de328e2011-05-31 09:58:07 +10001412}