blob: 39c7121a67dfe1cd0ad91f2374c5a7f743be7902 [file] [log] [blame]
Rob Pike396b47b2009-08-26 16:01:31 -07001// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Unicode table generator.
6// Data read from the web.
7
8package main
9
10import (
Robert Griesemer45ca9f72009-12-15 15:41:46 -080011 "bufio"
12 "flag"
13 "fmt"
14 "http"
15 "log"
16 "os"
17 "sort"
18 "strconv"
19 "strings"
20 "regexp"
21 "unicode"
Rob Pike396b47b2009-08-26 16:01:31 -070022)
23
Rob Pike22c2b472009-08-28 23:05:16 -070024func main() {
Robert Griesemer45ca9f72009-12-15 15:41:46 -080025 flag.Parse()
26 loadChars() // always needed
27 printCategories()
28 printScriptOrProperty(false)
29 printScriptOrProperty(true)
30 printCases()
Rob Pike8d64e732011-06-04 07:46:22 +100031 printLatinProperties()
Rob Pike0de328e2011-05-31 09:58:07 +100032 printSizes()
Rob Pike22c2b472009-08-28 23:05:16 -070033}
34
Russ Coxed6eb5b2009-11-08 21:46:20 -080035var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
Rob Pike94e69152009-08-27 09:14:32 -070036var url = flag.String("url",
Rob Pikefc52d702011-01-31 15:20:44 -080037 "http://www.unicode.org/Public/6.0.0/ucd/",
Rob Pike8b6274e2009-08-27 17:04:23 -070038 "URL of Unicode database directory")
39var tablelist = flag.String("tables",
Rob Pike94e69152009-08-27 09:14:32 -070040 "all",
Rob Pikea8246512009-11-02 11:37:52 -080041 "comma-separated list of which tables to generate; can be letter")
Rob Pike8b6274e2009-08-27 17:04:23 -070042var scriptlist = flag.String("scripts",
43 "all",
Rob Pikea8246512009-11-02 11:37:52 -080044 "comma-separated list of which script tables to generate")
Rob Pike1e55e4a2009-08-31 16:43:17 -070045var proplist = flag.String("props",
46 "all",
Rob Pikea8246512009-11-02 11:37:52 -080047 "comma-separated list of which property tables to generate")
Rob Pike22c2b472009-08-28 23:05:16 -070048var cases = flag.Bool("cases",
49 true,
Rob Pikea8246512009-11-02 11:37:52 -080050 "generate case tables")
Rob Pike94e69152009-08-27 09:14:32 -070051var test = flag.Bool("test",
52 false,
Rob Pikea8246512009-11-02 11:37:52 -080053 "test existing tables; can be used to compare web data with package data")
Rob Pike396b47b2009-08-26 16:01:31 -070054
Russ Cox0f0f34e2011-01-30 16:09:16 -050055var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
56var logger = log.New(os.Stderr, "", log.Lshortfile)
Rob Pike396b47b2009-08-26 16:01:31 -070057
// category is the set of known general-category names. It starts with the
// one-letter names used for merged categories; parseCategory adds every
// two-letter name (Nd, Lu, ...) it encounters in the data.
var category = map[string]bool{
	"C": true, // Cc Cf Cs Co Cn
	"L": true, // Lu Ll Lt Lm Lo
	"M": true, // Mn Mc Me
	"N": true, // Nd Nl No
	"P": true, // Pc Pd Ps Pe Pi Pf Po
	"S": true, // Sm Sc Sk So
	"Z": true, // Zs Zl Zp
}
Rob Pike94e69152009-08-27 09:14:32 -070069
// UnicodeData.txt has form:
//	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
//	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation

// Indexes of the semicolon-separated fields of a UnicodeData.txt line, in
// file order. NumField is the number of fields a well-formed line has.
const (
	FCodePoint = iota
	FName
	FGeneralCategory
	FCanonicalCombiningClass
	FBidiClass
	FDecompositionType
	FDecompositionMapping
	FNumericType
	FNumericValue
	FBidiMirrored
	FUnicode1Name
	FISOComment
	FSimpleUppercaseMapping
	FSimpleLowercaseMapping
	FSimpleTitlecaseMapping
	NumField
)

// MaxChar is the largest code point; anything above this shouldn't exist.
const MaxChar = 0x10FFFF
95
// fieldName holds a printable label for each UnicodeData.txt field, in
// field order; Char.dump indexes it by field position.
var fieldName = []string{
	"CodePoint",               // 0
	"Name",                    // 1
	"GeneralCategory",         // 2
	"CanonicalCombiningClass", // 3
	"BidiClass",               // 4
	"DecompositionType",       // 5
	"DecompositionMapping",    // 6
	"NumericType",             // 7
	"NumericValue",            // 8
	"BidiMirrored",            // 9
	"Unicode1Name",            // 10
	"ISOComment",              // 11
	"SimpleUppercaseMapping",  // 12
	"SimpleLowercaseMapping",  // 13
	"SimpleTitlecaseMapping",  // 14
}
113
// Char records what the generator keeps for one code point. It contains
// only the properties we're interested in.
type Char struct {
	// field is the raw split line; debugging only, could be deleted if
	// char.dump() is taken out.
	field []string
	// codePoint is the code point itself; if zero, this index is not a
	// valid code point.
	codePoint uint32
	// category is the general category string from the data (e.g. "Lu").
	category string
	// Simple case mappings as parsed by letterValue; zero when the data
	// field was empty or was never parsed.
	upperCase, lowerCase, titleCase int
}
123
// Scripts.txt has form:
//	A673          ; Cyrillic # Po       SLAVONIC ASTERISK
//	A67C..A67D    ; Cyrillic # Mn   [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation

// Script is one parsed data line: an inclusive range of code points and the
// name it was listed under.
type Script struct {
	lo     uint32 // first code point of the range
	hi     uint32 // last code point of the range (equal to lo for a single entry)
	script string // name from the third regexp group
}
133
Rob Pike22c2b472009-08-28 23:05:16 -0700134var chars = make([]Char, MaxChar+1)
Robert Griesemer841c18a2009-11-04 21:39:55 -0800135var scripts = make(map[string][]Script)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800136var props = make(map[string][]Script) // a property looks like a script; can share the format
Rob Pike396b47b2009-08-26 16:01:31 -0700137
Rob Pike1e55e4a2009-08-31 16:43:17 -0700138var lastChar uint32 = 0
139
// In UnicodeData.txt, some ranges are marked like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;

// State is what parseCategory reports about a line so that the caller can
// expand such First/Last pairs into full ranges.
type State int

const (
	SNormal  State = iota // ordinary line; known to be zero for the type
	SFirst                // name contains ", First>"
	SLast                 // name contains ", Last>"
	SMissing              // declared here; parseCategory as written never returns it
)
152
153func parseCategory(line string) (state State) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800154 field := strings.Split(line, ";", -1)
Rob Pike396b47b2009-08-26 16:01:31 -0700155 if len(field) != NumField {
Rob Pikeeea18d92011-02-01 12:47:35 -0800156 logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
Rob Pike396b47b2009-08-26 16:01:31 -0700157 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800158 point, err := strconv.Btoui64(field[FCodePoint], 16)
Rob Pike396b47b2009-08-26 16:01:31 -0700159 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800160 logger.Fatalf("%.5s...: %s", line, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700161 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800162 lastChar = uint32(point)
Rob Pike396b47b2009-08-26 16:01:31 -0700163 if point == 0 {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800164 return // not interesting and we use 0 as unset
Rob Pike396b47b2009-08-26 16:01:31 -0700165 }
Rob Pike22c2b472009-08-28 23:05:16 -0700166 if point > MaxChar {
Robert Griesemer40621d52009-11-09 12:07:39 -0800167 return
Rob Pike396b47b2009-08-26 16:01:31 -0700168 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800169 char := &chars[point]
170 char.field = field
Rob Pike396b47b2009-08-26 16:01:31 -0700171 if char.codePoint != 0 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800172 logger.Fatalf("point %U reused", point)
Rob Pike396b47b2009-08-26 16:01:31 -0700173 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800174 char.codePoint = lastChar
175 char.category = field[FGeneralCategory]
176 category[char.category] = true
Rob Pike396b47b2009-08-26 16:01:31 -0700177 switch char.category {
178 case "Nd":
179 // Decimal digit
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800180 _, err := strconv.Atoi(field[FNumericValue])
Rob Pike396b47b2009-08-26 16:01:31 -0700181 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800182 logger.Fatalf("%U: bad numeric field: %s", point, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700183 }
Rob Pike396b47b2009-08-26 16:01:31 -0700184 case "Lu":
Robert Griesemer40621d52009-11-09 12:07:39 -0800185 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700186 case "Ll":
Robert Griesemer40621d52009-11-09 12:07:39 -0800187 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700188 case "Lt":
Robert Griesemer40621d52009-11-09 12:07:39 -0800189 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
Rob Pike396b47b2009-08-26 16:01:31 -0700190 case "Lm", "Lo":
Robert Griesemer40621d52009-11-09 12:07:39 -0800191 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700192 }
Rob Pikef59ae062009-08-28 11:57:38 -0700193 switch {
194 case strings.Index(field[FName], ", First>") > 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800195 state = SFirst
Rob Pikef59ae062009-08-28 11:57:38 -0700196 case strings.Index(field[FName], ", Last>") > 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800197 state = SLast
Rob Pikef59ae062009-08-28 11:57:38 -0700198 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800199 return
Rob Pike396b47b2009-08-26 16:01:31 -0700200}
201
202func (char *Char) dump(s string) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800203 fmt.Print(s, " ")
Robert Griesemer841c18a2009-11-04 21:39:55 -0800204 for i := 0; i < len(char.field); i++ {
Robert Griesemer40621d52009-11-09 12:07:39 -0800205 fmt.Printf("%s:%q ", fieldName[i], char.field[i])
Rob Pike396b47b2009-08-26 16:01:31 -0700206 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800207 fmt.Print("\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700208}
209
210func (char *Char) letter(u, l, t string) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800211 char.upperCase = char.letterValue(u, "U")
212 char.lowerCase = char.letterValue(l, "L")
213 char.titleCase = char.letterValue(t, "T")
Rob Pike396b47b2009-08-26 16:01:31 -0700214}
215
Rob Pike22c2b472009-08-28 23:05:16 -0700216func (char *Char) letterValue(s string, cas string) int {
Rob Pike396b47b2009-08-26 16:01:31 -0700217 if s == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800218 return 0
Rob Pike396b47b2009-08-26 16:01:31 -0700219 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800220 v, err := strconv.Btoui64(s, 16)
Rob Pike396b47b2009-08-26 16:01:31 -0700221 if err != nil {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800222 char.dump(cas)
Rob Pikeeea18d92011-02-01 12:47:35 -0800223 logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700224 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800225 return int(v)
Rob Pike396b47b2009-08-26 16:01:31 -0700226}
227
Rob Pike94e69152009-08-27 09:14:32 -0700228func allCategories() []string {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800229 a := make([]string, len(category))
230 i := 0
Rob Pike94e69152009-08-27 09:14:32 -0700231 for k := range category {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800232 a[i] = k
233 i++
Rob Pike94e69152009-08-27 09:14:32 -0700234 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800235 return a
Rob Pike94e69152009-08-27 09:14:32 -0700236}
237
Robert Griesemer841c18a2009-11-04 21:39:55 -0800238func all(scripts map[string][]Script) []string {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800239 a := make([]string, len(scripts))
240 i := 0
Rob Pike8b6274e2009-08-27 17:04:23 -0700241 for k := range scripts {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800242 a[i] = k
243 i++
Rob Pike8b6274e2009-08-27 17:04:23 -0700244 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800245 return a
Rob Pike8b6274e2009-08-27 17:04:23 -0700246}
247
Rob Pike94e69152009-08-27 09:14:32 -0700248// Extract the version number from the URL
249func version() string {
250 // Break on slashes and look for the first numeric field
Rob Pike38f12312010-07-01 14:08:14 -0700251 fields := strings.Split(*url, "/", -1)
Rob Pike94e69152009-08-27 09:14:32 -0700252 for _, f := range fields {
253 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
Robert Griesemer40621d52009-11-09 12:07:39 -0800254 return f
Rob Pike94e69152009-08-27 09:14:32 -0700255 }
256 }
Rob Pikeeea18d92011-02-01 12:47:35 -0800257 logger.Fatal("unknown version")
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800258 return "Unknown"
Rob Pike94e69152009-08-27 09:14:32 -0700259}
260
Rob Pike8d64e732011-06-04 07:46:22 +1000261func categoryOp(code int, class uint8) bool {
262 category := chars[code].category
263 return len(category) > 0 && category[0] == class
Rob Pike94e69152009-08-27 09:14:32 -0700264}
265
Rob Pike22c2b472009-08-28 23:05:16 -0700266func loadChars() {
Russ Coxed6eb5b2009-11-08 21:46:20 -0800267 if *dataURL == "" {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800268 flag.Set("data", *url+"UnicodeData.txt")
Rob Pike8b6274e2009-08-27 17:04:23 -0700269 }
Brad Fitzpatrickb2400c22011-05-13 18:56:39 -0700270 resp, err := http.Get(*dataURL)
Rob Pike396b47b2009-08-26 16:01:31 -0700271 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800272 logger.Fatal(err)
Rob Pike396b47b2009-08-26 16:01:31 -0700273 }
Rob Pike94e69152009-08-27 09:14:32 -0700274 if resp.StatusCode != 200 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800275 logger.Fatal("bad GET status for UnicodeData.txt", resp.Status)
Rob Pike94e69152009-08-27 09:14:32 -0700276 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800277 input := bufio.NewReader(resp.Body)
278 var first uint32 = 0
Rob Pike396b47b2009-08-26 16:01:31 -0700279 for {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800280 line, err := input.ReadString('\n')
Rob Pike396b47b2009-08-26 16:01:31 -0700281 if err != nil {
282 if err == os.EOF {
Robert Griesemer40621d52009-11-09 12:07:39 -0800283 break
Rob Pike396b47b2009-08-26 16:01:31 -0700284 }
Rob Pikeeea18d92011-02-01 12:47:35 -0800285 logger.Fatal(err)
Rob Pike396b47b2009-08-26 16:01:31 -0700286 }
Robert Griesemer841c18a2009-11-04 21:39:55 -0800287 switch parseCategory(line[0 : len(line)-1]) {
Rob Pikef59ae062009-08-28 11:57:38 -0700288 case SNormal:
289 if first != 0 {
Rob Pike0de328e2011-05-31 09:58:07 +1000290 logger.Fatalf("bad state normal at %U", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700291 }
292 case SFirst:
293 if first != 0 {
Rob Pike0de328e2011-05-31 09:58:07 +1000294 logger.Fatalf("bad state first at %U", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700295 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800296 first = lastChar
Rob Pikef59ae062009-08-28 11:57:38 -0700297 case SLast:
298 if first == 0 {
Rob Pike0de328e2011-05-31 09:58:07 +1000299 logger.Fatalf("bad state last at %U", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700300 }
Robert Griesemer3bb00322009-11-09 21:23:52 -0800301 for i := first + 1; i <= lastChar; i++ {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800302 chars[i] = chars[first]
303 chars[i].codePoint = i
Rob Pikef59ae062009-08-28 11:57:38 -0700304 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800305 first = 0
Rob Pikef59ae062009-08-28 11:57:38 -0700306 }
Rob Pike396b47b2009-08-26 16:01:31 -0700307 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800308 resp.Body.Close()
Rob Pike22c2b472009-08-28 23:05:16 -0700309}
310
Rob Pike0de328e2011-05-31 09:58:07 +1000311const progHeader = `// Generated by running
312// maketables --tables=%s --data=%s
313// DO NOT EDIT
314
315package unicode
316
317`
318
319
Rob Pike22c2b472009-08-28 23:05:16 -0700320func printCategories() {
321 if *tablelist == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800322 return
Rob Pike22c2b472009-08-28 23:05:16 -0700323 }
Rob Pike94e69152009-08-27 09:14:32 -0700324 // Find out which categories to dump
Rob Pike38f12312010-07-01 14:08:14 -0700325 list := strings.Split(*tablelist, ",", -1)
Rob Pike8b6274e2009-08-27 17:04:23 -0700326 if *tablelist == "all" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800327 list = allCategories()
Rob Pike94e69152009-08-27 09:14:32 -0700328 }
329 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800330 fullCategoryTest(list)
331 return
Rob Pike94e69152009-08-27 09:14:32 -0700332 }
Rob Pike0de328e2011-05-31 09:58:07 +1000333 fmt.Printf(progHeader, *tablelist, *dataURL)
Rob Pike94e69152009-08-27 09:14:32 -0700334
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800335 fmt.Println("// Version is the Unicode edition from which the tables are derived.")
336 fmt.Printf("const Version = %q\n\n", version())
Rob Pike94e69152009-08-27 09:14:32 -0700337
Rob Pike8b6274e2009-08-27 17:04:23 -0700338 if *tablelist == "all" {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800339 fmt.Println("// Categories is the set of Unicode data tables.")
Rob Pike0de328e2011-05-31 09:58:07 +1000340 fmt.Println("var Categories = map[string] *RangeTable {")
Ryan Hitchman062406b2010-12-08 21:36:56 -0800341 for k := range category {
Robert Griesemer40621d52009-11-09 12:07:39 -0800342 fmt.Printf("\t%q: %s,\n", k, k)
Rob Pike94e69152009-08-27 09:14:32 -0700343 }
Rob Pike1ce62452010-12-07 16:42:54 -0500344 fmt.Print("}\n\n")
Rob Pike94e69152009-08-27 09:14:32 -0700345 }
346
Rob Pike4b1170d2011-06-11 09:25:18 +1000347 decl := make(sort.StringSlice, len(list))
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800348 ndecl := 0
Rob Pike94e69152009-08-27 09:14:32 -0700349 for _, name := range list {
350 if _, ok := category[name]; !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800351 logger.Fatal("unknown category", name)
Rob Pike94e69152009-08-27 09:14:32 -0700352 }
353 // We generate an UpperCase name to serve as concise documentation and an _UnderScored
354 // name to store the data. This stops godoc dumping all the tables but keeps them
355 // available to clients.
Rob Pike25caf182009-08-27 18:38:02 -0700356 // Cases deserving special comments
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800357 varDecl := ""
Rob Pike25caf182009-08-27 18:38:02 -0700358 switch name {
Rob Pike8d64e732011-06-04 07:46:22 +1000359 case "C":
360 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n"
361 varDecl += "\tC = _C\n"
362 case "L":
363 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n"
364 varDecl += "\tL = _L\n"
365 case "M":
366 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n"
367 varDecl += "\tM = _M\n"
368 case "N":
369 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n"
370 varDecl += "\tN = _N\n"
371 case "P":
372 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n"
373 varDecl += "\tP = _P\n"
374 case "S":
375 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n"
376 varDecl += "\tS = _S\n"
377 case "Z":
378 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n"
379 varDecl += "\tZ = _Z\n"
Rob Pike25caf182009-08-27 18:38:02 -0700380 case "Nd":
Robert Griesemer40621d52009-11-09 12:07:39 -0800381 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700382 case "Lu":
Robert Griesemer40621d52009-11-09 12:07:39 -0800383 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700384 case "Ll":
Robert Griesemer40621d52009-11-09 12:07:39 -0800385 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700386 case "Lt":
Robert Griesemer40621d52009-11-09 12:07:39 -0800387 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700388 }
Rob Pike8d64e732011-06-04 07:46:22 +1000389 if len(name) > 1 {
Rob Pike25caf182009-08-27 18:38:02 -0700390 varDecl += fmt.Sprintf(
391 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800392 name, name, name, name)
Rob Pike25caf182009-08-27 18:38:02 -0700393 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800394 decl[ndecl] = varDecl
395 ndecl++
Rob Pike8d64e732011-06-04 07:46:22 +1000396 if len(name) == 1 { // unified categories
397 decl := fmt.Sprintf("var _%s = &RangeTable{\n", name)
Rob Pike94e69152009-08-27 09:14:32 -0700398 dumpRange(
Rob Pike8d64e732011-06-04 07:46:22 +1000399 decl,
400 func(code int) bool { return categoryOp(code, name[0]) })
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800401 continue
Rob Pike94e69152009-08-27 09:14:32 -0700402 }
Rob Pike396b47b2009-08-26 16:01:31 -0700403 dumpRange(
Rob Pike0de328e2011-05-31 09:58:07 +1000404 fmt.Sprintf("var _%s = &RangeTable{\n", name),
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800405 func(code int) bool { return chars[code].category == name })
Rob Pike396b47b2009-08-26 16:01:31 -0700406 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800407 decl.Sort()
408 fmt.Println("var (")
Rob Pike25caf182009-08-27 18:38:02 -0700409 for _, d := range decl {
Robert Griesemer40621d52009-11-09 12:07:39 -0800410 fmt.Print(d)
Rob Pike25caf182009-08-27 18:38:02 -0700411 }
Rob Pike1ce62452010-12-07 16:42:54 -0500412 fmt.Print(")\n\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700413}
414
// Op is a predicate over code points; dumpRange and verifyRange use it to
// ask whether a code point belongs to the table being processed.
type Op func(code int) bool

// format is the Printf template for one generated range entry: low and
// high bounds in hex padded to at least four digits, then the stride.
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
Rob Pike396b47b2009-08-26 16:01:31 -0700418
Rob Pike25caf182009-08-27 18:38:02 -0700419func dumpRange(header string, inCategory Op) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800420 fmt.Print(header)
421 next := 0
Rob Pike0de328e2011-05-31 09:58:07 +1000422 fmt.Print("\tR16: []Range16{\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700423 // one Range for each iteration
Rob Pike0de328e2011-05-31 09:58:07 +1000424 count := &range16Count
425 size := 16
Rob Pike396b47b2009-08-26 16:01:31 -0700426 for {
427 // look for start of range
428 for next < len(chars) && !inCategory(next) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800429 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700430 }
431 if next >= len(chars) {
432 // no characters remain
Robert Griesemer40621d52009-11-09 12:07:39 -0800433 break
Rob Pike396b47b2009-08-26 16:01:31 -0700434 }
435
436 // start of range
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800437 lo := next
438 hi := next
439 stride := 1
Rob Pike396b47b2009-08-26 16:01:31 -0700440 // accept lo
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800441 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700442 // look for another character to set the stride
443 for next < len(chars) && !inCategory(next) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800444 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700445 }
446 if next >= len(chars) {
447 // no more characters
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800448 fmt.Printf(format, lo, hi, stride)
449 break
Rob Pike396b47b2009-08-26 16:01:31 -0700450 }
451 // set stride
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800452 stride = next - lo
Rob Pike396b47b2009-08-26 16:01:31 -0700453 // check for length of run. next points to first jump in stride
454 for i := next; i < len(chars); i++ {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800455 if inCategory(i) == (((i - lo) % stride) == 0) {
Rob Pike396b47b2009-08-26 16:01:31 -0700456 // accept
457 if inCategory(i) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800458 hi = i
Rob Pike396b47b2009-08-26 16:01:31 -0700459 }
460 } else {
461 // no more characters in this run
Robert Griesemer40621d52009-11-09 12:07:39 -0800462 break
Rob Pike396b47b2009-08-26 16:01:31 -0700463 }
464 }
Rob Pike9ec0c012011-06-01 09:49:51 +1000465 size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
Rob Pike396b47b2009-08-26 16:01:31 -0700466 // next range: start looking where this range ends
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800467 next = hi + 1
Rob Pike396b47b2009-08-26 16:01:31 -0700468 }
Rob Pike0de328e2011-05-31 09:58:07 +1000469 fmt.Print("\t},\n")
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800470 fmt.Print("}\n\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700471}
Rob Pike94e69152009-08-27 09:14:32 -0700472
Rob Pike9ec0c012011-06-01 09:49:51 +1000473func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
474 if size == 16 && hi >= 1<<16 {
475 if lo < 1<<16 {
476 if lo+stride != hi {
Rob Pike8d64e732011-06-04 07:46:22 +1000477 logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
Rob Pike9ec0c012011-06-01 09:49:51 +1000478 }
479 // No range contains U+FFFF as an instance, so split
480 // the range into two entries. That way we can maintain
481 // the invariant that R32 contains only >= 1<<16.
482 fmt.Printf(format, lo, lo, 1)
483 lo = hi
484 stride = 1
485 *count++
486 }
487 fmt.Print("\t},\n")
488 fmt.Print("\tR32: []Range32{\n")
489 size = 32
490 count = &range32Count
491 }
492 fmt.Printf(format, lo, hi, stride)
493 *count++
494 return size, count
495}
496
Rob Pike8b6274e2009-08-27 17:04:23 -0700497func fullCategoryTest(list []string) {
Rob Pike94e69152009-08-27 09:14:32 -0700498 for _, name := range list {
499 if _, ok := category[name]; !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800500 logger.Fatal("unknown category", name)
Rob Pike94e69152009-08-27 09:14:32 -0700501 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800502 r, ok := unicode.Categories[name]
Rob Pike8d64e732011-06-04 07:46:22 +1000503 if !ok && len(name) > 1 {
504 logger.Fatalf("unknown table %q", name)
Rob Pike94e69152009-08-27 09:14:32 -0700505 }
Rob Pike8d64e732011-06-04 07:46:22 +1000506 if len(name) == 1 {
507 verifyRange(name, func(code int) bool { return categoryOp(code, name[0]) }, r)
Rob Pike94e69152009-08-27 09:14:32 -0700508 } else {
509 verifyRange(
510 name,
511 func(code int) bool { return chars[code].category == name },
Robert Griesemer40621d52009-11-09 12:07:39 -0800512 r)
Rob Pike94e69152009-08-27 09:14:32 -0700513 }
514 }
515}
516
Rob Pike0de328e2011-05-31 09:58:07 +1000517func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
Rob Pike8d64e732011-06-04 07:46:22 +1000518 count := 0
Rob Pikea8246512009-11-02 11:37:52 -0800519 for i := range chars {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800520 web := inCategory(i)
521 pkg := unicode.Is(table, i)
Rob Pike94e69152009-08-27 09:14:32 -0700522 if web != pkg {
Rob Pike0de328e2011-05-31 09:58:07 +1000523 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
Rob Pike8d64e732011-06-04 07:46:22 +1000524 count++
525 if count > 10 {
526 break
527 }
Rob Pike94e69152009-08-27 09:14:32 -0700528 }
529 }
530}
Rob Pike8b6274e2009-08-27 17:04:23 -0700531
Robert Griesemer841c18a2009-11-04 21:39:55 -0800532func parseScript(line string, scripts map[string][]Script) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800533 comment := strings.Index(line, "#")
Rob Pike8b6274e2009-08-27 17:04:23 -0700534 if comment >= 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800535 line = line[0:comment]
Rob Pike8b6274e2009-08-27 17:04:23 -0700536 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800537 line = strings.TrimSpace(line)
Rob Pike8b6274e2009-08-27 17:04:23 -0700538 if len(line) == 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800539 return
Rob Pike8b6274e2009-08-27 17:04:23 -0700540 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800541 field := strings.Split(line, ";", -1)
Rob Pike8b6274e2009-08-27 17:04:23 -0700542 if len(field) != 2 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800543 logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
Rob Pike8b6274e2009-08-27 17:04:23 -0700544 }
Russ Cox0f0f34e2011-01-30 16:09:16 -0500545 matches := scriptRe.FindStringSubmatch(line)
Rob Pike8b6274e2009-08-27 17:04:23 -0700546 if len(matches) != 4 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800547 logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches))
Rob Pike8b6274e2009-08-27 17:04:23 -0700548 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800549 lo, err := strconv.Btoui64(matches[1], 16)
Rob Pike8b6274e2009-08-27 17:04:23 -0700550 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800551 logger.Fatalf("%.5s...: %s", line, err)
Rob Pike8b6274e2009-08-27 17:04:23 -0700552 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800553 hi := lo
554 if len(matches[2]) > 2 { // ignore leading ..
555 hi, err = strconv.Btoui64(matches[2][2:], 16)
Rob Pike8b6274e2009-08-27 17:04:23 -0700556 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800557 logger.Fatalf("%.5s...: %s", line, err)
Rob Pike8b6274e2009-08-27 17:04:23 -0700558 }
559 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800560 name := matches[3]
Russ Cox69c4e932010-10-27 19:47:23 -0700561 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
Rob Pike8b6274e2009-08-27 17:04:23 -0700562}
563
Rob Pike8b6274e2009-08-27 17:04:23 -0700564// The script tables have a lot of adjacent elements. Fold them together.
Rob Pike0de328e2011-05-31 09:58:07 +1000565func foldAdjacent(r []Script) []unicode.Range32 {
566 s := make([]unicode.Range32, 0, len(r))
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800567 j := 0
Rob Pike8b6274e2009-08-27 17:04:23 -0700568 for i := 0; i < len(r); i++ {
Rob Pike0de328e2011-05-31 09:58:07 +1000569 if j > 0 && r[i].lo == s[j-1].Hi+1 {
570 s[j-1].Hi = r[i].hi
Rob Pike8b6274e2009-08-27 17:04:23 -0700571 } else {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800572 s = s[0 : j+1]
Rob Pike0de328e2011-05-31 09:58:07 +1000573 s[j] = unicode.Range32{uint32(r[i].lo), uint32(r[i].hi), 1}
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800574 j++
Rob Pike8b6274e2009-08-27 17:04:23 -0700575 }
576 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800577 return s
Rob Pike8b6274e2009-08-27 17:04:23 -0700578}
579
Rob Pike0de328e2011-05-31 09:58:07 +1000580func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
Rob Pike8b6274e2009-08-27 17:04:23 -0700581 for _, name := range list {
582 if _, ok := scripts[name]; !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800583 logger.Fatal("unknown script", name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700584 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800585 _, ok := installed[name]
Rob Pike8b6274e2009-08-27 17:04:23 -0700586 if !ok {
Rob Pikeeea18d92011-02-01 12:47:35 -0800587 logger.Fatal("unknown table", name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700588 }
589 for _, script := range scripts[name] {
590 for r := script.lo; r <= script.hi; r++ {
Rob Pike1e55e4a2009-08-31 16:43:17 -0700591 if !unicode.Is(installed[name], int(r)) {
Rob Pike0de328e2011-05-31 09:58:07 +1000592 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700593 }
594 }
595 }
596 }
597}
Rob Pike22c2b472009-08-28 23:05:16 -0700598
Rob Pike1e55e4a2009-08-31 16:43:17 -0700599// PropList.txt has the same format as Scripts.txt so we can share its parser.
600func printScriptOrProperty(doProps bool) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800601 flag := "scripts"
602 flaglist := *scriptlist
603 file := "Scripts.txt"
604 table := scripts
605 installed := unicode.Scripts
Rob Pike1e55e4a2009-08-31 16:43:17 -0700606 if doProps {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800607 flag = "props"
608 flaglist = *proplist
609 file = "PropList.txt"
610 table = props
611 installed = unicode.Properties
Rob Pike1e55e4a2009-08-31 16:43:17 -0700612 }
613 if flaglist == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800614 return
Rob Pike1e55e4a2009-08-31 16:43:17 -0700615 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800616 var err os.Error
Brad Fitzpatrickb2400c22011-05-13 18:56:39 -0700617 resp, err := http.Get(*url + file)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700618 if err != nil {
Rob Pikeeea18d92011-02-01 12:47:35 -0800619 logger.Fatal(err)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700620 }
621 if resp.StatusCode != 200 {
Rob Pikeeea18d92011-02-01 12:47:35 -0800622 logger.Fatal("bad GET status for ", file, ":", resp.Status)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700623 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800624 input := bufio.NewReader(resp.Body)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700625 for {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800626 line, err := input.ReadString('\n')
Rob Pike1e55e4a2009-08-31 16:43:17 -0700627 if err != nil {
628 if err == os.EOF {
Robert Griesemer40621d52009-11-09 12:07:39 -0800629 break
Rob Pike1e55e4a2009-08-31 16:43:17 -0700630 }
Rob Pikeeea18d92011-02-01 12:47:35 -0800631 logger.Fatal(err)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700632 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800633 parseScript(line[0:len(line)-1], table)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700634 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800635 resp.Body.Close()
Rob Pike1e55e4a2009-08-31 16:43:17 -0700636
637 // Find out which scripts to dump
Rob Pike38f12312010-07-01 14:08:14 -0700638 list := strings.Split(flaglist, ",", -1)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700639 if flaglist == "all" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800640 list = all(table)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700641 }
642 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800643 fullScriptTest(list, installed, table)
644 return
Rob Pike1e55e4a2009-08-31 16:43:17 -0700645 }
646
647 fmt.Printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -0800648 "// Generated by running\n"+
649 "// maketables --%s=%s --url=%s\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -0800650 "// DO NOT EDIT\n\n",
Rob Pike1e55e4a2009-08-31 16:43:17 -0700651 flag,
652 flaglist,
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800653 *url)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700654 if flaglist == "all" {
655 if doProps {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800656 fmt.Println("// Properties is the set of Unicode property tables.")
Rob Pike0de328e2011-05-31 09:58:07 +1000657 fmt.Println("var Properties = map[string] *RangeTable{")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700658 } else {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800659 fmt.Println("// Scripts is the set of Unicode script tables.")
Rob Pike0de328e2011-05-31 09:58:07 +1000660 fmt.Println("var Scripts = map[string] *RangeTable{")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700661 }
Ryan Hitchman062406b2010-12-08 21:36:56 -0800662 for k := range table {
Robert Griesemer40621d52009-11-09 12:07:39 -0800663 fmt.Printf("\t%q: %s,\n", k, k)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700664 }
Rob Pike1ce62452010-12-07 16:42:54 -0500665 fmt.Print("}\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700666 }
667
Rob Pike4b1170d2011-06-11 09:25:18 +1000668 decl := make(sort.StringSlice, len(list))
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800669 ndecl := 0
Rob Pike1e55e4a2009-08-31 16:43:17 -0700670 for _, name := range list {
671 if doProps {
672 decl[ndecl] = fmt.Sprintf(
673 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800674 name, name, name, name)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700675 } else {
676 decl[ndecl] = fmt.Sprintf(
677 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800678 name, name, name, name)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700679 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800680 ndecl++
Rob Pike0de328e2011-05-31 09:58:07 +1000681 fmt.Printf("var _%s = &RangeTable {\n", name)
682 fmt.Print("\tR16: []Range16{\n")
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800683 ranges := foldAdjacent(table[name])
Rob Pike0de328e2011-05-31 09:58:07 +1000684 size := 16
685 count := &range16Count
Rob Pike1e55e4a2009-08-31 16:43:17 -0700686 for _, s := range ranges {
Rob Pike9ec0c012011-06-01 09:49:51 +1000687 size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700688 }
Rob Pike0de328e2011-05-31 09:58:07 +1000689 fmt.Print("\t},\n")
Rob Pike1ce62452010-12-07 16:42:54 -0500690 fmt.Print("}\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700691 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800692 decl.Sort()
693 fmt.Println("var (")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700694 for _, d := range decl {
Robert Griesemer40621d52009-11-09 12:07:39 -0800695 fmt.Print(d)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700696 }
Rob Pike1ce62452010-12-07 16:42:54 -0500697 fmt.Print(")\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700698}
699
// Values stored in caseState._case. The two sentinels are declared in their
// own group so that iota starts from zero for the three real cases, giving
// them the values 1, 2 and 4.
const (
	CaseMissing = -1 // character not present; not a valid case state
	CaseNone    = 0  // must be zero
)

const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
)
707
// caseState describes the case behaviour of one code point.
type caseState struct {
	point int // the code point being described
	_case int // one of the Case* constants

	// Distance from point to its upper, lower and title case mappings.
	// A delta stays zero when no mapping is recorded for that case.
	deltaToUpper, deltaToLower, deltaToTitle int
}
715
716// Is d a continuation of the state of c?
717func (c *caseState) adjacent(d *caseState) bool {
718 if d.point < c.point {
Robert Griesemer40621d52009-11-09 12:07:39 -0800719 c, d = d, c
Rob Pike22c2b472009-08-28 23:05:16 -0700720 }
721 switch {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800722 case d.point != c.point+1: // code points not adjacent (shouldn't happen)
Robert Griesemer40621d52009-11-09 12:07:39 -0800723 return false
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800724 case d._case != c._case: // different cases
Robert Griesemer40621d52009-11-09 12:07:39 -0800725 return c.upperLowerAdjacent(d)
Rob Pike22c2b472009-08-28 23:05:16 -0700726 case c._case == CaseNone:
Robert Griesemer40621d52009-11-09 12:07:39 -0800727 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700728 case c._case == CaseMissing:
Robert Griesemer40621d52009-11-09 12:07:39 -0800729 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700730 case d.deltaToUpper != c.deltaToUpper:
Robert Griesemer40621d52009-11-09 12:07:39 -0800731 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700732 case d.deltaToLower != c.deltaToLower:
Robert Griesemer40621d52009-11-09 12:07:39 -0800733 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700734 case d.deltaToTitle != c.deltaToTitle:
Robert Griesemer40621d52009-11-09 12:07:39 -0800735 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700736 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800737 return true
Rob Pike22c2b472009-08-28 23:05:16 -0700738}
739
Rob Pike3c098e22009-08-30 14:02:42 -0700740// Is d the same as c, but opposite in upper/lower case? this would make it
741// an element of an UpperLower sequence.
742func (c *caseState) upperLowerAdjacent(d *caseState) bool {
743 // check they're a matched case pair. we know they have adjacent values
744 switch {
745 case c._case == CaseUpper && d._case != CaseLower:
Robert Griesemer40621d52009-11-09 12:07:39 -0800746 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700747 case c._case == CaseLower && d._case != CaseUpper:
Robert Griesemer40621d52009-11-09 12:07:39 -0800748 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700749 }
750 // matched pair (at least in upper/lower). make the order Upper Lower
751 if c._case == CaseLower {
Robert Griesemer40621d52009-11-09 12:07:39 -0800752 c, d = d, c
Rob Pike3c098e22009-08-30 14:02:42 -0700753 }
754 // for an Upper Lower sequence the deltas have to be in order
755 // c: 0 1 0
756 // d: -1 0 -1
757 switch {
758 case c.deltaToUpper != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800759 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700760 case c.deltaToLower != 1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800761 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700762 case c.deltaToTitle != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800763 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700764 case d.deltaToUpper != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800765 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700766 case d.deltaToLower != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800767 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700768 case d.deltaToTitle != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800769 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700770 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800771 return true
Rob Pike3c098e22009-08-30 14:02:42 -0700772}
773
774// Does this character start an UpperLower sequence?
775func (c *caseState) isUpperLower() bool {
776 // for an Upper Lower sequence the deltas have to be in order
777 // c: 0 1 0
778 switch {
779 case c.deltaToUpper != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800780 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700781 case c.deltaToLower != 1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800782 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700783 case c.deltaToTitle != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800784 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700785 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800786 return true
Rob Pike3c098e22009-08-30 14:02:42 -0700787}
788
789// Does this character start a LowerUpper sequence?
790func (c *caseState) isLowerUpper() bool {
791 // for an Upper Lower sequence the deltas have to be in order
792 // c: -1 0 -1
793 switch {
794 case c.deltaToUpper != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800795 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700796 case c.deltaToLower != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800797 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700798 case c.deltaToTitle != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800799 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700800 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800801 return true
Rob Pike3c098e22009-08-30 14:02:42 -0700802}
803
Rob Pike22c2b472009-08-28 23:05:16 -0700804func getCaseState(i int) (c *caseState) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800805 c = &caseState{point: i, _case: CaseNone}
806 ch := &chars[i]
Rob Pike22c2b472009-08-28 23:05:16 -0700807 switch int(ch.codePoint) {
808 case 0:
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800809 c._case = CaseMissing // Will get NUL wrong but that doesn't matter
810 return
Rob Pike22c2b472009-08-28 23:05:16 -0700811 case ch.upperCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800812 c._case = CaseUpper
Rob Pike22c2b472009-08-28 23:05:16 -0700813 case ch.lowerCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800814 c._case = CaseLower
Rob Pike22c2b472009-08-28 23:05:16 -0700815 case ch.titleCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800816 c._case = CaseTitle
Rob Pike22c2b472009-08-28 23:05:16 -0700817 }
818 if ch.upperCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800819 c.deltaToUpper = ch.upperCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700820 }
821 if ch.lowerCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800822 c.deltaToLower = ch.lowerCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700823 }
824 if ch.titleCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800825 c.deltaToTitle = ch.titleCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700826 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800827 return
Rob Pike22c2b472009-08-28 23:05:16 -0700828}
829
830func printCases() {
831 if !*cases {
Robert Griesemer40621d52009-11-09 12:07:39 -0800832 return
Rob Pike22c2b472009-08-28 23:05:16 -0700833 }
834 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800835 fullCaseTest()
836 return
Rob Pike22c2b472009-08-28 23:05:16 -0700837 }
838 fmt.Printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -0800839 "// Generated by running\n"+
840 "// maketables --data=%s\n"+
841 "// DO NOT EDIT\n\n"+
842 "// CaseRanges is the table describing case mappings for all letters with\n"+
843 "// non-self mappings.\n"+
844 "var CaseRanges = _CaseRanges\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -0800845 "var _CaseRanges = []CaseRange {\n",
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800846 *dataURL)
Rob Pike22c2b472009-08-28 23:05:16 -0700847
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800848 var startState *caseState // the start of a run; nil for not active
849 var prevState = &caseState{} // the state of the previous character
Rob Pikea8246512009-11-02 11:37:52 -0800850 for i := range chars {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800851 state := getCaseState(i)
Rob Pike22c2b472009-08-28 23:05:16 -0700852 if state.adjacent(prevState) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800853 prevState = state
854 continue
Rob Pike22c2b472009-08-28 23:05:16 -0700855 }
856 // end of run (possibly)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800857 printCaseRange(startState, prevState)
858 startState = nil
Rob Pike22c2b472009-08-28 23:05:16 -0700859 if state._case != CaseMissing && state._case != CaseNone {
Robert Griesemer40621d52009-11-09 12:07:39 -0800860 startState = state
Rob Pike22c2b472009-08-28 23:05:16 -0700861 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800862 prevState = state
Rob Pike22c2b472009-08-28 23:05:16 -0700863 }
Rob Pike1ce62452010-12-07 16:42:54 -0500864 fmt.Print("}\n")
Rob Pike22c2b472009-08-28 23:05:16 -0700865}
866
867func printCaseRange(lo, hi *caseState) {
868 if lo == nil {
Robert Griesemer40621d52009-11-09 12:07:39 -0800869 return
Rob Pike22c2b472009-08-28 23:05:16 -0700870 }
871 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
872 // character represents itself in all cases - no need to mention it
Robert Griesemer40621d52009-11-09 12:07:39 -0800873 return
Rob Pike22c2b472009-08-28 23:05:16 -0700874 }
Rob Pike3c098e22009-08-30 14:02:42 -0700875 switch {
876 case hi.point > lo.point && lo.isUpperLower():
Rob Pikefc52d702011-01-31 15:20:44 -0800877 fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800878 lo.point, hi.point)
Rob Pike3c098e22009-08-30 14:02:42 -0700879 case hi.point > lo.point && lo.isLowerUpper():
Rob Pike0de328e2011-05-31 09:58:07 +1000880 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point)
Rob Pikefc52d702011-01-31 15:20:44 -0800881 fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800882 lo.point, hi.point)
Rob Pike3c098e22009-08-30 14:02:42 -0700883 default:
Rob Pikefc52d702011-01-31 15:20:44 -0800884 fmt.Printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
Rob Pike3c098e22009-08-30 14:02:42 -0700885 lo.point, hi.point,
Robert Griesemer40621d52009-11-09 12:07:39 -0800886 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
Rob Pike3c098e22009-08-30 14:02:42 -0700887 }
Rob Pike22c2b472009-08-28 23:05:16 -0700888}
889
// caseIt returns the case mapping to expect for rune: cased when a mapping
// is recorded, or rune itself when cased is 0, which means "no mapping".
func caseIt(rune, cased int) int {
	if cased != 0 {
		return cased
	}
	return rune
}
897
898func fullCaseTest() {
899 for i, c := range chars {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800900 lower := unicode.ToLower(i)
901 want := caseIt(i, c.lowerCase)
Rob Pike22c2b472009-08-28 23:05:16 -0700902 if lower != want {
Rob Pike0de328e2011-05-31 09:58:07 +1000903 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
Rob Pike22c2b472009-08-28 23:05:16 -0700904 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800905 upper := unicode.ToUpper(i)
906 want = caseIt(i, c.upperCase)
Rob Pike22c2b472009-08-28 23:05:16 -0700907 if upper != want {
Rob Pike0de328e2011-05-31 09:58:07 +1000908 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
Rob Pike22c2b472009-08-28 23:05:16 -0700909 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800910 title := unicode.ToTitle(i)
911 want = caseIt(i, c.titleCase)
Rob Pike22c2b472009-08-28 23:05:16 -0700912 if title != want {
Rob Pike0de328e2011-05-31 09:58:07 +1000913 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
Rob Pike22c2b472009-08-28 23:05:16 -0700914 }
915 }
916}
Rob Pike0de328e2011-05-31 09:58:07 +1000917
Rob Pike8d64e732011-06-04 07:46:22 +1000918func printLatinProperties() {
919 if *test {
920 return
921 }
Rob Pike7a922872011-06-04 09:28:27 +1000922 fmt.Println("var properties = [MaxLatin1+1]uint8{")
923 for code := 0; code <= unicode.MaxLatin1; code++ {
Rob Pike8d64e732011-06-04 07:46:22 +1000924 var property string
925 switch chars[code].category {
926 case "Cc", "": // NUL has no category.
927 property = "pC"
928 case "Cf": // soft hyphen, unique category, not printable.
929 property = "0"
930 case "Ll":
931 property = "pLl | pp"
932 case "Lu":
933 property = "pLu | pp"
934 case "Nd", "No":
935 property = "pN | pp"
936 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
937 property = "pP | pp"
938 case "Sc", "Sk", "Sm", "So":
939 property = "pS | pp"
940 case "Zs":
941 property = "pZ"
942 default:
943 logger.Fatalf("%U has unknown category %q", code, chars[code].category)
944 }
945 // Special case
946 if code == ' ' {
947 property = "pZ | pp"
948 }
949 fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
950 }
951 fmt.Println("}")
952}
953
// Counters for the range entries, reported by printSizes.
var (
	range16Count int // Number of entries in the 16-bit range tables.
	range32Count int // Number of entries in the 32-bit range tables.
)
956
957func printSizes() {
Rob Pike9ec0c012011-06-01 09:49:51 +1000958 if *test {
959 return
960 }
Rob Pike0de328e2011-05-31 09:58:07 +1000961 fmt.Println()
962 fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
963 range16Bytes := range16Count * 3 * 2
964 range32Bytes := range32Count * 3 * 4
965 fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
966}