blob: 28809b3939699702e593eded0d3798f233bfc647 [file] [log] [blame]
Rob Pike396b47b2009-08-26 16:01:31 -07001// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Unicode table generator.
6// Data read from the web.
7
8package main
9
10import (
Robert Griesemer45ca9f72009-12-15 15:41:46 -080011 "bufio"
12 "flag"
13 "fmt"
14 "http"
15 "log"
16 "os"
17 "sort"
18 "strconv"
19 "strings"
20 "regexp"
21 "unicode"
Rob Pike396b47b2009-08-26 16:01:31 -070022)
23
Rob Pike22c2b472009-08-28 23:05:16 -070024func main() {
Robert Griesemer45ca9f72009-12-15 15:41:46 -080025 flag.Parse()
26 loadChars() // always needed
27 printCategories()
28 printScriptOrProperty(false)
29 printScriptOrProperty(true)
30 printCases()
Rob Pike22c2b472009-08-28 23:05:16 -070031}
32
// Command-line flags.
var (
	// dataURL is the full location of UnicodeData.txt. loadChars derives it
	// from url when it is left empty.
	dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")

	// url is the directory the Unicode data files are fetched from; version
	// also extracts the Unicode version number from it.
	url = flag.String("url", "http://www.unicode.org/Public/5.2.0/ucd/", "URL of Unicode database directory")

	// tablelist, scriptlist and proplist select which tables to process.
	// An empty value skips that step entirely; "all" selects every table.
	tablelist  = flag.String("tables", "all", "comma-separated list of which tables to generate; can be letter")
	scriptlist = flag.String("scripts", "all", "comma-separated list of which script tables to generate")
	proplist   = flag.String("props", "all", "comma-separated list of which property tables to generate")

	cases = flag.Bool("cases", true, "generate case tables")

	// test switches from generating tables to checking the installed ones.
	test = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data")
)
Rob Pike396b47b2009-08-26 16:01:31 -070052
var (
	// scriptRe matches one comment-stripped line of Scripts.txt or
	// PropList.txt: a hex code point, an optional "..hex" upper bound, and
	// the script or property name.
	scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)

	// logger reports fatal problems on standard error, tagged with file:line.
	logger = log.New(os.Stderr, "", log.Lshortfile)

	// category is the set of general categories seen so far (Nd, Lu, etc.).
	// "letter" is a special case and is always present.
	category = map[string]bool{"letter": true}
)
Rob Pike94e69152009-08-27 09:14:32 -070057
// UnicodeData.txt has form:
//	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
//	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
// The constants below are the indexes of the semicolon-separated fields.
const (
	FCodePoint = iota
	FName
	FGeneralCategory
	FCanonicalCombiningClass
	FBidiClass
	FDecompositionType
	FDecompositionMapping
	FNumericType
	FNumericValue
	FBidiMirrored
	FUnicode1Name
	FISOComment
	FSimpleUppercaseMapping
	FSimpleLowercaseMapping
	FSimpleTitlecaseMapping
	NumField // number of fields in one record
)

// MaxChar is the largest code point; anything above this shouldn't exist.
const MaxChar = 0x10FFFF

// fieldName holds the printable name of each field, indexed by the F
// constants above.
var fieldName = []string{
	FCodePoint:               "CodePoint",
	FName:                    "Name",
	FGeneralCategory:         "GeneralCategory",
	FCanonicalCombiningClass: "CanonicalCombiningClass",
	FBidiClass:               "BidiClass",
	FDecompositionType:       "DecompositionType",
	FDecompositionMapping:    "DecompositionMapping",
	FNumericType:             "NumericType",
	FNumericValue:            "NumericValue",
	FBidiMirrored:            "BidiMirrored",
	FUnicode1Name:            "Unicode1Name",
	FISOComment:              "ISOComment",
	FSimpleUppercaseMapping:  "SimpleUppercaseMapping",
	FSimpleLowercaseMapping:  "SimpleLowercaseMapping",
	FSimpleTitlecaseMapping:  "SimpleTitlecaseMapping",
}
101
// Char records what the generator keeps from one UnicodeData.txt entry.
// It contains only the properties we're interested in.
type Char struct {
	// field is the raw, split record. It is used for debugging only and
	// could be deleted if char.dump() were taken out.
	field []string
	// codePoint is zero when this slot does not hold a valid code point.
	codePoint uint32
	// category is the general category field of the record.
	category string
	// upperCase, lowerCase and titleCase are filled in by letter; a value is
	// zero when the string it was parsed from was empty.
	upperCase int
	lowerCase int
	titleCase int
}
111
// Scripts.txt has form:
//	A673          ; Cyrillic # Po       SLAVONIC ASTERISK
//	A67C..A67D    ; Cyrillic # Mn   [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation

// Script is one parsed line of that form: an inclusive range of code points
// together with the name it is assigned to.
type Script struct {
	lo     uint32 // first code point of the range
	hi     uint32 // last code point of the range
	script string // name from the line
}
121
Rob Pike22c2b472009-08-28 23:05:16 -0700122var chars = make([]Char, MaxChar+1)
Robert Griesemer841c18a2009-11-04 21:39:55 -0800123var scripts = make(map[string][]Script)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800124var props = make(map[string][]Script) // a property looks like a script; can share the format
Rob Pike396b47b2009-08-26 16:01:31 -0700125
Rob Pike1e55e4a2009-08-31 16:43:17 -0700126var lastChar uint32 = 0
127
// In UnicodeData.txt, some ranges are marked like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCategory returns a State telling its caller whether the record it
// just parsed carries one of those markers.
type State int

const (
	SNormal  State = 0 // ordinary record; must be the zero value of the type
	SFirst   State = 1 // record named "<..., First>"
	SLast    State = 2 // record named "<..., Last>"
	SMissing State = 3
)
140
141func parseCategory(line string) (state State) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800142 field := strings.Split(line, ";", -1)
Rob Pike396b47b2009-08-26 16:01:31 -0700143 if len(field) != NumField {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500144 logger.Exitf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
Rob Pike396b47b2009-08-26 16:01:31 -0700145 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800146 point, err := strconv.Btoui64(field[FCodePoint], 16)
Rob Pike396b47b2009-08-26 16:01:31 -0700147 if err != nil {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500148 logger.Exitf("%.5s...: %s", line, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700149 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800150 lastChar = uint32(point)
Rob Pike396b47b2009-08-26 16:01:31 -0700151 if point == 0 {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800152 return // not interesting and we use 0 as unset
Rob Pike396b47b2009-08-26 16:01:31 -0700153 }
Rob Pike22c2b472009-08-28 23:05:16 -0700154 if point > MaxChar {
Robert Griesemer40621d52009-11-09 12:07:39 -0800155 return
Rob Pike396b47b2009-08-26 16:01:31 -0700156 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800157 char := &chars[point]
158 char.field = field
Rob Pike396b47b2009-08-26 16:01:31 -0700159 if char.codePoint != 0 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500160 logger.Exitf("point %U reused", point)
Rob Pike396b47b2009-08-26 16:01:31 -0700161 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800162 char.codePoint = lastChar
163 char.category = field[FGeneralCategory]
164 category[char.category] = true
Rob Pike396b47b2009-08-26 16:01:31 -0700165 switch char.category {
166 case "Nd":
167 // Decimal digit
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800168 _, err := strconv.Atoi(field[FNumericValue])
Rob Pike396b47b2009-08-26 16:01:31 -0700169 if err != nil {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500170 logger.Exitf("%U: bad numeric field: %s", point, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700171 }
Rob Pike396b47b2009-08-26 16:01:31 -0700172 case "Lu":
Robert Griesemer40621d52009-11-09 12:07:39 -0800173 char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700174 case "Ll":
Robert Griesemer40621d52009-11-09 12:07:39 -0800175 char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700176 case "Lt":
Robert Griesemer40621d52009-11-09 12:07:39 -0800177 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
Rob Pike396b47b2009-08-26 16:01:31 -0700178 case "Lm", "Lo":
Robert Griesemer40621d52009-11-09 12:07:39 -0800179 char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
Rob Pike396b47b2009-08-26 16:01:31 -0700180 }
Rob Pikef59ae062009-08-28 11:57:38 -0700181 switch {
182 case strings.Index(field[FName], ", First>") > 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800183 state = SFirst
Rob Pikef59ae062009-08-28 11:57:38 -0700184 case strings.Index(field[FName], ", Last>") > 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800185 state = SLast
Rob Pikef59ae062009-08-28 11:57:38 -0700186 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800187 return
Rob Pike396b47b2009-08-26 16:01:31 -0700188}
189
190func (char *Char) dump(s string) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800191 fmt.Print(s, " ")
Robert Griesemer841c18a2009-11-04 21:39:55 -0800192 for i := 0; i < len(char.field); i++ {
Robert Griesemer40621d52009-11-09 12:07:39 -0800193 fmt.Printf("%s:%q ", fieldName[i], char.field[i])
Rob Pike396b47b2009-08-26 16:01:31 -0700194 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800195 fmt.Print("\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700196}
197
198func (char *Char) letter(u, l, t string) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800199 char.upperCase = char.letterValue(u, "U")
200 char.lowerCase = char.letterValue(l, "L")
201 char.titleCase = char.letterValue(t, "T")
Rob Pike396b47b2009-08-26 16:01:31 -0700202}
203
Rob Pike22c2b472009-08-28 23:05:16 -0700204func (char *Char) letterValue(s string, cas string) int {
Rob Pike396b47b2009-08-26 16:01:31 -0700205 if s == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800206 return 0
Rob Pike396b47b2009-08-26 16:01:31 -0700207 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800208 v, err := strconv.Btoui64(s, 16)
Rob Pike396b47b2009-08-26 16:01:31 -0700209 if err != nil {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800210 char.dump(cas)
Russ Cox0f0f34e2011-01-30 16:09:16 -0500211 logger.Exitf("%U: bad letter(%s): %s", char.codePoint, s, err)
Rob Pike396b47b2009-08-26 16:01:31 -0700212 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800213 return int(v)
Rob Pike396b47b2009-08-26 16:01:31 -0700214}
215
Rob Pike94e69152009-08-27 09:14:32 -0700216func allCategories() []string {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800217 a := make([]string, len(category))
218 i := 0
Rob Pike94e69152009-08-27 09:14:32 -0700219 for k := range category {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800220 a[i] = k
221 i++
Rob Pike94e69152009-08-27 09:14:32 -0700222 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800223 return a
Rob Pike94e69152009-08-27 09:14:32 -0700224}
225
Robert Griesemer841c18a2009-11-04 21:39:55 -0800226func all(scripts map[string][]Script) []string {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800227 a := make([]string, len(scripts))
228 i := 0
Rob Pike8b6274e2009-08-27 17:04:23 -0700229 for k := range scripts {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800230 a[i] = k
231 i++
Rob Pike8b6274e2009-08-27 17:04:23 -0700232 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800233 return a
Rob Pike8b6274e2009-08-27 17:04:23 -0700234}
235
Rob Pike94e69152009-08-27 09:14:32 -0700236// Extract the version number from the URL
237func version() string {
238 // Break on slashes and look for the first numeric field
Rob Pike38f12312010-07-01 14:08:14 -0700239 fields := strings.Split(*url, "/", -1)
Rob Pike94e69152009-08-27 09:14:32 -0700240 for _, f := range fields {
241 if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
Robert Griesemer40621d52009-11-09 12:07:39 -0800242 return f
Rob Pike94e69152009-08-27 09:14:32 -0700243 }
244 }
Russ Cox0f0f34e2011-01-30 16:09:16 -0500245 logger.Exit("unknown version")
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800246 return "Unknown"
Rob Pike94e69152009-08-27 09:14:32 -0700247}
248
249func letterOp(code int) bool {
250 switch chars[code].category {
251 case "Lu", "Ll", "Lt", "Lm", "Lo":
Robert Griesemer40621d52009-11-09 12:07:39 -0800252 return true
Rob Pike94e69152009-08-27 09:14:32 -0700253 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800254 return false
Rob Pike94e69152009-08-27 09:14:32 -0700255}
256
Rob Pike22c2b472009-08-28 23:05:16 -0700257func loadChars() {
Russ Coxed6eb5b2009-11-08 21:46:20 -0800258 if *dataURL == "" {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800259 flag.Set("data", *url+"UnicodeData.txt")
Rob Pike8b6274e2009-08-27 17:04:23 -0700260 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800261 resp, _, err := http.Get(*dataURL)
Rob Pike396b47b2009-08-26 16:01:31 -0700262 if err != nil {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500263 logger.Exit(err)
Rob Pike396b47b2009-08-26 16:01:31 -0700264 }
Rob Pike94e69152009-08-27 09:14:32 -0700265 if resp.StatusCode != 200 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500266 logger.Exit("bad GET status for UnicodeData.txt", resp.Status)
Rob Pike94e69152009-08-27 09:14:32 -0700267 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800268 input := bufio.NewReader(resp.Body)
269 var first uint32 = 0
Rob Pike396b47b2009-08-26 16:01:31 -0700270 for {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800271 line, err := input.ReadString('\n')
Rob Pike396b47b2009-08-26 16:01:31 -0700272 if err != nil {
273 if err == os.EOF {
Robert Griesemer40621d52009-11-09 12:07:39 -0800274 break
Rob Pike396b47b2009-08-26 16:01:31 -0700275 }
Russ Cox0f0f34e2011-01-30 16:09:16 -0500276 logger.Exit(err)
Rob Pike396b47b2009-08-26 16:01:31 -0700277 }
Robert Griesemer841c18a2009-11-04 21:39:55 -0800278 switch parseCategory(line[0 : len(line)-1]) {
Rob Pikef59ae062009-08-28 11:57:38 -0700279 case SNormal:
280 if first != 0 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500281 logger.Exitf("bad state normal at U+%04X", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700282 }
283 case SFirst:
284 if first != 0 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500285 logger.Exitf("bad state first at U+%04X", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700286 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800287 first = lastChar
Rob Pikef59ae062009-08-28 11:57:38 -0700288 case SLast:
289 if first == 0 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500290 logger.Exitf("bad state last at U+%04X", lastChar)
Rob Pikef59ae062009-08-28 11:57:38 -0700291 }
Robert Griesemer3bb00322009-11-09 21:23:52 -0800292 for i := first + 1; i <= lastChar; i++ {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800293 chars[i] = chars[first]
294 chars[i].codePoint = i
Rob Pikef59ae062009-08-28 11:57:38 -0700295 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800296 first = 0
Rob Pikef59ae062009-08-28 11:57:38 -0700297 }
Rob Pike396b47b2009-08-26 16:01:31 -0700298 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800299 resp.Body.Close()
Rob Pike22c2b472009-08-28 23:05:16 -0700300}
301
302func printCategories() {
303 if *tablelist == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800304 return
Rob Pike22c2b472009-08-28 23:05:16 -0700305 }
Rob Pike94e69152009-08-27 09:14:32 -0700306 // Find out which categories to dump
Rob Pike38f12312010-07-01 14:08:14 -0700307 list := strings.Split(*tablelist, ",", -1)
Rob Pike8b6274e2009-08-27 17:04:23 -0700308 if *tablelist == "all" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800309 list = allCategories()
Rob Pike94e69152009-08-27 09:14:32 -0700310 }
311 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800312 fullCategoryTest(list)
313 return
Rob Pike94e69152009-08-27 09:14:32 -0700314 }
Rob Pike396b47b2009-08-26 16:01:31 -0700315 fmt.Printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -0800316 "// Generated by running\n"+
317 "// maketables --tables=%s --data=%s\n"+
318 "// DO NOT EDIT\n\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -0800319 "package unicode\n\n",
Rob Pike8b6274e2009-08-27 17:04:23 -0700320 *tablelist,
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800321 *dataURL)
Rob Pike94e69152009-08-27 09:14:32 -0700322
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800323 fmt.Println("// Version is the Unicode edition from which the tables are derived.")
324 fmt.Printf("const Version = %q\n\n", version())
Rob Pike94e69152009-08-27 09:14:32 -0700325
Rob Pike8b6274e2009-08-27 17:04:23 -0700326 if *tablelist == "all" {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800327 fmt.Println("// Categories is the set of Unicode data tables.")
328 fmt.Println("var Categories = map[string] []Range {")
Ryan Hitchman062406b2010-12-08 21:36:56 -0800329 for k := range category {
Robert Griesemer40621d52009-11-09 12:07:39 -0800330 fmt.Printf("\t%q: %s,\n", k, k)
Rob Pike94e69152009-08-27 09:14:32 -0700331 }
Rob Pike1ce62452010-12-07 16:42:54 -0500332 fmt.Print("}\n\n")
Rob Pike94e69152009-08-27 09:14:32 -0700333 }
334
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800335 decl := make(sort.StringArray, len(list))
336 ndecl := 0
Rob Pike94e69152009-08-27 09:14:32 -0700337 for _, name := range list {
338 if _, ok := category[name]; !ok {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500339 logger.Exit("unknown category", name)
Rob Pike94e69152009-08-27 09:14:32 -0700340 }
341 // We generate an UpperCase name to serve as concise documentation and an _UnderScored
342 // name to store the data. This stops godoc dumping all the tables but keeps them
343 // available to clients.
Rob Pike25caf182009-08-27 18:38:02 -0700344 // Cases deserving special comments
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800345 varDecl := ""
Rob Pike25caf182009-08-27 18:38:02 -0700346 switch name {
347 case "letter":
Robert Griesemer40621d52009-11-09 12:07:39 -0800348 varDecl = "\tLetter = letter; // Letter is the set of Unicode letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700349 case "Nd":
Robert Griesemer40621d52009-11-09 12:07:39 -0800350 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700351 case "Lu":
Robert Griesemer40621d52009-11-09 12:07:39 -0800352 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700353 case "Ll":
Robert Griesemer40621d52009-11-09 12:07:39 -0800354 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700355 case "Lt":
Robert Griesemer40621d52009-11-09 12:07:39 -0800356 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n"
Rob Pike25caf182009-08-27 18:38:02 -0700357 }
358 if name != "letter" {
359 varDecl += fmt.Sprintf(
360 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800361 name, name, name, name)
Rob Pike25caf182009-08-27 18:38:02 -0700362 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800363 decl[ndecl] = varDecl
364 ndecl++
365 if name == "letter" { // special case
Rob Pike94e69152009-08-27 09:14:32 -0700366 dumpRange(
Rob Pike94e69152009-08-27 09:14:32 -0700367 "var letter = []Range {\n",
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800368 letterOp)
369 continue
Rob Pike94e69152009-08-27 09:14:32 -0700370 }
Rob Pike396b47b2009-08-26 16:01:31 -0700371 dumpRange(
Rob Pike25caf182009-08-27 18:38:02 -0700372 fmt.Sprintf("var _%s = []Range {\n", name),
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800373 func(code int) bool { return chars[code].category == name })
Rob Pike396b47b2009-08-26 16:01:31 -0700374 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800375 decl.Sort()
376 fmt.Println("var (")
Rob Pike25caf182009-08-27 18:38:02 -0700377 for _, d := range decl {
Robert Griesemer40621d52009-11-09 12:07:39 -0800378 fmt.Print(d)
Rob Pike25caf182009-08-27 18:38:02 -0700379 }
Rob Pike1ce62452010-12-07 16:42:54 -0500380 fmt.Print(")\n\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700381}
382
// Op reports whether the code point at index code of chars belongs to the
// set being dumped or verified.
type Op func(code int) bool

// format is the Printf template for one generated Range literal; its three
// arguments are the low bound, the high bound and the stride.
const format = "\tRange{0x%04x, 0x%04x, %d},\n"
Rob Pike396b47b2009-08-26 16:01:31 -0700386
Rob Pike25caf182009-08-27 18:38:02 -0700387func dumpRange(header string, inCategory Op) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800388 fmt.Print(header)
389 next := 0
Rob Pike396b47b2009-08-26 16:01:31 -0700390 // one Range for each iteration
391 for {
392 // look for start of range
393 for next < len(chars) && !inCategory(next) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800394 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700395 }
396 if next >= len(chars) {
397 // no characters remain
Robert Griesemer40621d52009-11-09 12:07:39 -0800398 break
Rob Pike396b47b2009-08-26 16:01:31 -0700399 }
400
401 // start of range
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800402 lo := next
403 hi := next
404 stride := 1
Rob Pike396b47b2009-08-26 16:01:31 -0700405 // accept lo
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800406 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700407 // look for another character to set the stride
408 for next < len(chars) && !inCategory(next) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800409 next++
Rob Pike396b47b2009-08-26 16:01:31 -0700410 }
411 if next >= len(chars) {
412 // no more characters
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800413 fmt.Printf(format, lo, hi, stride)
414 break
Rob Pike396b47b2009-08-26 16:01:31 -0700415 }
416 // set stride
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800417 stride = next - lo
Rob Pike396b47b2009-08-26 16:01:31 -0700418 // check for length of run. next points to first jump in stride
419 for i := next; i < len(chars); i++ {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800420 if inCategory(i) == (((i - lo) % stride) == 0) {
Rob Pike396b47b2009-08-26 16:01:31 -0700421 // accept
422 if inCategory(i) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800423 hi = i
Rob Pike396b47b2009-08-26 16:01:31 -0700424 }
425 } else {
426 // no more characters in this run
Robert Griesemer40621d52009-11-09 12:07:39 -0800427 break
Rob Pike396b47b2009-08-26 16:01:31 -0700428 }
429 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800430 fmt.Printf(format, lo, hi, stride)
Rob Pike396b47b2009-08-26 16:01:31 -0700431 // next range: start looking where this range ends
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800432 next = hi + 1
Rob Pike396b47b2009-08-26 16:01:31 -0700433 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800434 fmt.Print("}\n\n")
Rob Pike396b47b2009-08-26 16:01:31 -0700435}
Rob Pike94e69152009-08-27 09:14:32 -0700436
Rob Pike8b6274e2009-08-27 17:04:23 -0700437func fullCategoryTest(list []string) {
Rob Pike94e69152009-08-27 09:14:32 -0700438 for _, name := range list {
439 if _, ok := category[name]; !ok {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500440 logger.Exit("unknown category", name)
Rob Pike94e69152009-08-27 09:14:32 -0700441 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800442 r, ok := unicode.Categories[name]
Rob Pike94e69152009-08-27 09:14:32 -0700443 if !ok {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500444 logger.Exit("unknown table", name)
Rob Pike94e69152009-08-27 09:14:32 -0700445 }
446 if name == "letter" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800447 verifyRange(name, letterOp, r)
Rob Pike94e69152009-08-27 09:14:32 -0700448 } else {
449 verifyRange(
450 name,
451 func(code int) bool { return chars[code].category == name },
Robert Griesemer40621d52009-11-09 12:07:39 -0800452 r)
Rob Pike94e69152009-08-27 09:14:32 -0700453 }
454 }
455}
456
457func verifyRange(name string, inCategory Op, table []unicode.Range) {
Rob Pikea8246512009-11-02 11:37:52 -0800458 for i := range chars {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800459 web := inCategory(i)
460 pkg := unicode.Is(table, i)
Rob Pike94e69152009-08-27 09:14:32 -0700461 if web != pkg {
Robert Griesemer40621d52009-11-09 12:07:39 -0800462 fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
Rob Pike94e69152009-08-27 09:14:32 -0700463 }
464 }
465}
Rob Pike8b6274e2009-08-27 17:04:23 -0700466
Robert Griesemer841c18a2009-11-04 21:39:55 -0800467func parseScript(line string, scripts map[string][]Script) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800468 comment := strings.Index(line, "#")
Rob Pike8b6274e2009-08-27 17:04:23 -0700469 if comment >= 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800470 line = line[0:comment]
Rob Pike8b6274e2009-08-27 17:04:23 -0700471 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800472 line = strings.TrimSpace(line)
Rob Pike8b6274e2009-08-27 17:04:23 -0700473 if len(line) == 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800474 return
Rob Pike8b6274e2009-08-27 17:04:23 -0700475 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800476 field := strings.Split(line, ";", -1)
Rob Pike8b6274e2009-08-27 17:04:23 -0700477 if len(field) != 2 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500478 logger.Exitf("%s: %d fields (expected 2)\n", line, len(field))
Rob Pike8b6274e2009-08-27 17:04:23 -0700479 }
Russ Cox0f0f34e2011-01-30 16:09:16 -0500480 matches := scriptRe.FindStringSubmatch(line)
Rob Pike8b6274e2009-08-27 17:04:23 -0700481 if len(matches) != 4 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500482 logger.Exitf("%s: %d matches (expected 3)\n", line, len(matches))
Rob Pike8b6274e2009-08-27 17:04:23 -0700483 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800484 lo, err := strconv.Btoui64(matches[1], 16)
Rob Pike8b6274e2009-08-27 17:04:23 -0700485 if err != nil {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500486 logger.Exitf("%.5s...: %s", line, err)
Rob Pike8b6274e2009-08-27 17:04:23 -0700487 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800488 hi := lo
489 if len(matches[2]) > 2 { // ignore leading ..
490 hi, err = strconv.Btoui64(matches[2][2:], 16)
Rob Pike8b6274e2009-08-27 17:04:23 -0700491 if err != nil {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500492 logger.Exitf("%.5s...: %s", line, err)
Rob Pike8b6274e2009-08-27 17:04:23 -0700493 }
494 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800495 name := matches[3]
Russ Cox69c4e932010-10-27 19:47:23 -0700496 scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
Rob Pike8b6274e2009-08-27 17:04:23 -0700497}
498
Rob Pike8b6274e2009-08-27 17:04:23 -0700499// The script tables have a lot of adjacent elements. Fold them together.
500func foldAdjacent(r []Script) []unicode.Range {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800501 s := make([]unicode.Range, 0, len(r))
502 j := 0
Rob Pike8b6274e2009-08-27 17:04:23 -0700503 for i := 0; i < len(r); i++ {
Robert Griesemer3bb00322009-11-09 21:23:52 -0800504 if j > 0 && int(r[i].lo) == s[j-1].Hi+1 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800505 s[j-1].Hi = int(r[i].hi)
Rob Pike8b6274e2009-08-27 17:04:23 -0700506 } else {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800507 s = s[0 : j+1]
508 s[j] = unicode.Range{int(r[i].lo), int(r[i].hi), 1}
509 j++
Rob Pike8b6274e2009-08-27 17:04:23 -0700510 }
511 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800512 return s
Rob Pike8b6274e2009-08-27 17:04:23 -0700513}
514
Robert Griesemer841c18a2009-11-04 21:39:55 -0800515func fullScriptTest(list []string, installed map[string][]unicode.Range, scripts map[string][]Script) {
Rob Pike8b6274e2009-08-27 17:04:23 -0700516 for _, name := range list {
517 if _, ok := scripts[name]; !ok {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500518 logger.Exit("unknown script", name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700519 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800520 _, ok := installed[name]
Rob Pike8b6274e2009-08-27 17:04:23 -0700521 if !ok {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500522 logger.Exit("unknown table", name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700523 }
524 for _, script := range scripts[name] {
525 for r := script.lo; r <= script.hi; r++ {
Rob Pike1e55e4a2009-08-31 16:43:17 -0700526 if !unicode.Is(installed[name], int(r)) {
Robert Griesemer40621d52009-11-09 12:07:39 -0800527 fmt.Fprintf(os.Stderr, "U+%04X: not in script %s\n", r, name)
Rob Pike8b6274e2009-08-27 17:04:23 -0700528 }
529 }
530 }
531 }
532}
Rob Pike22c2b472009-08-28 23:05:16 -0700533
Rob Pike1e55e4a2009-08-31 16:43:17 -0700534// PropList.txt has the same format as Scripts.txt so we can share its parser.
535func printScriptOrProperty(doProps bool) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800536 flag := "scripts"
537 flaglist := *scriptlist
538 file := "Scripts.txt"
539 table := scripts
540 installed := unicode.Scripts
Rob Pike1e55e4a2009-08-31 16:43:17 -0700541 if doProps {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800542 flag = "props"
543 flaglist = *proplist
544 file = "PropList.txt"
545 table = props
546 installed = unicode.Properties
Rob Pike1e55e4a2009-08-31 16:43:17 -0700547 }
548 if flaglist == "" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800549 return
Rob Pike1e55e4a2009-08-31 16:43:17 -0700550 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800551 var err os.Error
552 resp, _, err := http.Get(*url + file)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700553 if err != nil {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500554 logger.Exit(err)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700555 }
556 if resp.StatusCode != 200 {
Russ Cox0f0f34e2011-01-30 16:09:16 -0500557 logger.Exit("bad GET status for ", file, ":", resp.Status)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700558 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800559 input := bufio.NewReader(resp.Body)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700560 for {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800561 line, err := input.ReadString('\n')
Rob Pike1e55e4a2009-08-31 16:43:17 -0700562 if err != nil {
563 if err == os.EOF {
Robert Griesemer40621d52009-11-09 12:07:39 -0800564 break
Rob Pike1e55e4a2009-08-31 16:43:17 -0700565 }
Russ Cox0f0f34e2011-01-30 16:09:16 -0500566 logger.Exit(err)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700567 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800568 parseScript(line[0:len(line)-1], table)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700569 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800570 resp.Body.Close()
Rob Pike1e55e4a2009-08-31 16:43:17 -0700571
572 // Find out which scripts to dump
Rob Pike38f12312010-07-01 14:08:14 -0700573 list := strings.Split(flaglist, ",", -1)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700574 if flaglist == "all" {
Robert Griesemer40621d52009-11-09 12:07:39 -0800575 list = all(table)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700576 }
577 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800578 fullScriptTest(list, installed, table)
579 return
Rob Pike1e55e4a2009-08-31 16:43:17 -0700580 }
581
582 fmt.Printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -0800583 "// Generated by running\n"+
584 "// maketables --%s=%s --url=%s\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -0800585 "// DO NOT EDIT\n\n",
Rob Pike1e55e4a2009-08-31 16:43:17 -0700586 flag,
587 flaglist,
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800588 *url)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700589 if flaglist == "all" {
590 if doProps {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800591 fmt.Println("// Properties is the set of Unicode property tables.")
592 fmt.Println("var Properties = map[string] []Range {")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700593 } else {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800594 fmt.Println("// Scripts is the set of Unicode script tables.")
595 fmt.Println("var Scripts = map[string] []Range {")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700596 }
Ryan Hitchman062406b2010-12-08 21:36:56 -0800597 for k := range table {
Robert Griesemer40621d52009-11-09 12:07:39 -0800598 fmt.Printf("\t%q: %s,\n", k, k)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700599 }
Rob Pike1ce62452010-12-07 16:42:54 -0500600 fmt.Print("}\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700601 }
602
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800603 decl := make(sort.StringArray, len(list))
604 ndecl := 0
Rob Pike1e55e4a2009-08-31 16:43:17 -0700605 for _, name := range list {
606 if doProps {
607 decl[ndecl] = fmt.Sprintf(
608 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800609 name, name, name, name)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700610 } else {
611 decl[ndecl] = fmt.Sprintf(
612 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800613 name, name, name, name)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700614 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800615 ndecl++
616 fmt.Printf("var _%s = []Range {\n", name)
617 ranges := foldAdjacent(table[name])
Rob Pike1e55e4a2009-08-31 16:43:17 -0700618 for _, s := range ranges {
Robert Griesemer40621d52009-11-09 12:07:39 -0800619 fmt.Printf(format, s.Lo, s.Hi, s.Stride)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700620 }
Rob Pike1ce62452010-12-07 16:42:54 -0500621 fmt.Print("}\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700622 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800623 decl.Sort()
624 fmt.Println("var (")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700625 for _, d := range decl {
Robert Griesemer40621d52009-11-09 12:07:39 -0800626 fmt.Print(d)
Rob Pike1e55e4a2009-08-31 16:43:17 -0700627 }
Rob Pike1ce62452010-12-07 16:42:54 -0500628 fmt.Print(")\n\n")
Rob Pike1e55e4a2009-08-31 16:43:17 -0700629}
630
// Case classification values stored in caseState._case.
//
// CaseUpper, CaseLower and CaseTitle are distinct single-bit values
// (1, 2 and 4). CaseNone and CaseMissing are given explicit values and do
// not take part in the iota sequence.
const (
	CaseUpper = 1 << iota // code point is its own upper-case form
	CaseLower             // code point is its own lower-case form
	CaseTitle             // code point is its own title-case form

	CaseNone    = 0  // must be zero: it is the _case of a zero-valued caseState
	CaseMissing = -1 // character not present; not a valid case state
)
638
// caseState records the case information of a single code point in the form
// used while building the case-range table.
type caseState struct {
	// point is the code point (the index into chars) being described.
	point int
	// _case is one of CaseUpper, CaseLower, CaseTitle, CaseNone or CaseMissing.
	_case int
	// The deltas are the signed distances from point to its upper-, lower-
	// and title-case forms; zero means no mapping was recorded.
	deltaToUpper int
	deltaToLower int
	deltaToTitle int
}
646
647// Is d a continuation of the state of c?
648func (c *caseState) adjacent(d *caseState) bool {
649 if d.point < c.point {
Robert Griesemer40621d52009-11-09 12:07:39 -0800650 c, d = d, c
Rob Pike22c2b472009-08-28 23:05:16 -0700651 }
652 switch {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800653 case d.point != c.point+1: // code points not adjacent (shouldn't happen)
Robert Griesemer40621d52009-11-09 12:07:39 -0800654 return false
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800655 case d._case != c._case: // different cases
Robert Griesemer40621d52009-11-09 12:07:39 -0800656 return c.upperLowerAdjacent(d)
Rob Pike22c2b472009-08-28 23:05:16 -0700657 case c._case == CaseNone:
Robert Griesemer40621d52009-11-09 12:07:39 -0800658 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700659 case c._case == CaseMissing:
Robert Griesemer40621d52009-11-09 12:07:39 -0800660 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700661 case d.deltaToUpper != c.deltaToUpper:
Robert Griesemer40621d52009-11-09 12:07:39 -0800662 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700663 case d.deltaToLower != c.deltaToLower:
Robert Griesemer40621d52009-11-09 12:07:39 -0800664 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700665 case d.deltaToTitle != c.deltaToTitle:
Robert Griesemer40621d52009-11-09 12:07:39 -0800666 return false
Rob Pike22c2b472009-08-28 23:05:16 -0700667 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800668 return true
Rob Pike22c2b472009-08-28 23:05:16 -0700669}
670
Rob Pike3c098e22009-08-30 14:02:42 -0700671// Is d the same as c, but opposite in upper/lower case? this would make it
672// an element of an UpperLower sequence.
673func (c *caseState) upperLowerAdjacent(d *caseState) bool {
674 // check they're a matched case pair. we know they have adjacent values
675 switch {
676 case c._case == CaseUpper && d._case != CaseLower:
Robert Griesemer40621d52009-11-09 12:07:39 -0800677 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700678 case c._case == CaseLower && d._case != CaseUpper:
Robert Griesemer40621d52009-11-09 12:07:39 -0800679 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700680 }
681 // matched pair (at least in upper/lower). make the order Upper Lower
682 if c._case == CaseLower {
Robert Griesemer40621d52009-11-09 12:07:39 -0800683 c, d = d, c
Rob Pike3c098e22009-08-30 14:02:42 -0700684 }
685 // for an Upper Lower sequence the deltas have to be in order
686 // c: 0 1 0
687 // d: -1 0 -1
688 switch {
689 case c.deltaToUpper != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800690 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700691 case c.deltaToLower != 1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800692 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700693 case c.deltaToTitle != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800694 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700695 case d.deltaToUpper != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800696 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700697 case d.deltaToLower != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800698 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700699 case d.deltaToTitle != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800700 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700701 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800702 return true
Rob Pike3c098e22009-08-30 14:02:42 -0700703}
704
705// Does this character start an UpperLower sequence?
706func (c *caseState) isUpperLower() bool {
707 // for an Upper Lower sequence the deltas have to be in order
708 // c: 0 1 0
709 switch {
710 case c.deltaToUpper != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800711 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700712 case c.deltaToLower != 1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800713 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700714 case c.deltaToTitle != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800715 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700716 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800717 return true
Rob Pike3c098e22009-08-30 14:02:42 -0700718}
719
720// Does this character start a LowerUpper sequence?
721func (c *caseState) isLowerUpper() bool {
722 // for an Upper Lower sequence the deltas have to be in order
723 // c: -1 0 -1
724 switch {
725 case c.deltaToUpper != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800726 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700727 case c.deltaToLower != 0:
Robert Griesemer40621d52009-11-09 12:07:39 -0800728 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700729 case c.deltaToTitle != -1:
Robert Griesemer40621d52009-11-09 12:07:39 -0800730 return false
Rob Pike3c098e22009-08-30 14:02:42 -0700731 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800732 return true
Rob Pike3c098e22009-08-30 14:02:42 -0700733}
734
Rob Pike22c2b472009-08-28 23:05:16 -0700735func getCaseState(i int) (c *caseState) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800736 c = &caseState{point: i, _case: CaseNone}
737 ch := &chars[i]
Rob Pike22c2b472009-08-28 23:05:16 -0700738 switch int(ch.codePoint) {
739 case 0:
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800740 c._case = CaseMissing // Will get NUL wrong but that doesn't matter
741 return
Rob Pike22c2b472009-08-28 23:05:16 -0700742 case ch.upperCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800743 c._case = CaseUpper
Rob Pike22c2b472009-08-28 23:05:16 -0700744 case ch.lowerCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800745 c._case = CaseLower
Rob Pike22c2b472009-08-28 23:05:16 -0700746 case ch.titleCase:
Robert Griesemer40621d52009-11-09 12:07:39 -0800747 c._case = CaseTitle
Rob Pike22c2b472009-08-28 23:05:16 -0700748 }
749 if ch.upperCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800750 c.deltaToUpper = ch.upperCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700751 }
752 if ch.lowerCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800753 c.deltaToLower = ch.lowerCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700754 }
755 if ch.titleCase != 0 {
Robert Griesemer40621d52009-11-09 12:07:39 -0800756 c.deltaToTitle = ch.titleCase - i
Rob Pike22c2b472009-08-28 23:05:16 -0700757 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800758 return
Rob Pike22c2b472009-08-28 23:05:16 -0700759}
760
761func printCases() {
762 if !*cases {
Robert Griesemer40621d52009-11-09 12:07:39 -0800763 return
Rob Pike22c2b472009-08-28 23:05:16 -0700764 }
765 if *test {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800766 fullCaseTest()
767 return
Rob Pike22c2b472009-08-28 23:05:16 -0700768 }
769 fmt.Printf(
Robert Griesemer295ceb42009-12-09 16:54:07 -0800770 "// Generated by running\n"+
771 "// maketables --data=%s\n"+
772 "// DO NOT EDIT\n\n"+
773 "// CaseRanges is the table describing case mappings for all letters with\n"+
774 "// non-self mappings.\n"+
775 "var CaseRanges = _CaseRanges\n"+
Robert Griesemer45cba572009-11-05 18:26:16 -0800776 "var _CaseRanges = []CaseRange {\n",
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800777 *dataURL)
Rob Pike22c2b472009-08-28 23:05:16 -0700778
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800779 var startState *caseState // the start of a run; nil for not active
780 var prevState = &caseState{} // the state of the previous character
Rob Pikea8246512009-11-02 11:37:52 -0800781 for i := range chars {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800782 state := getCaseState(i)
Rob Pike22c2b472009-08-28 23:05:16 -0700783 if state.adjacent(prevState) {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800784 prevState = state
785 continue
Rob Pike22c2b472009-08-28 23:05:16 -0700786 }
787 // end of run (possibly)
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800788 printCaseRange(startState, prevState)
789 startState = nil
Rob Pike22c2b472009-08-28 23:05:16 -0700790 if state._case != CaseMissing && state._case != CaseNone {
Robert Griesemer40621d52009-11-09 12:07:39 -0800791 startState = state
Rob Pike22c2b472009-08-28 23:05:16 -0700792 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800793 prevState = state
Rob Pike22c2b472009-08-28 23:05:16 -0700794 }
Rob Pike1ce62452010-12-07 16:42:54 -0500795 fmt.Print("}\n")
Rob Pike22c2b472009-08-28 23:05:16 -0700796}
797
798func printCaseRange(lo, hi *caseState) {
799 if lo == nil {
Robert Griesemer40621d52009-11-09 12:07:39 -0800800 return
Rob Pike22c2b472009-08-28 23:05:16 -0700801 }
802 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
803 // character represents itself in all cases - no need to mention it
Robert Griesemer40621d52009-11-09 12:07:39 -0800804 return
Rob Pike22c2b472009-08-28 23:05:16 -0700805 }
Rob Pike3c098e22009-08-30 14:02:42 -0700806 switch {
807 case hi.point > lo.point && lo.isUpperLower():
808 fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
Robert Griesemer40621d52009-11-09 12:07:39 -0800809 lo.point, hi.point)
Rob Pike3c098e22009-08-30 14:02:42 -0700810 case hi.point > lo.point && lo.isLowerUpper():
Russ Cox0f0f34e2011-01-30 16:09:16 -0500811 logger.Exitf("LowerUpper sequence: should not happen: U+%04X. If it's real, need to fix To()", lo.point)
Rob Pike3c098e22009-08-30 14:02:42 -0700812 fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800813 lo.point, hi.point)
Rob Pike3c098e22009-08-30 14:02:42 -0700814 default:
815 fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
816 lo.point, hi.point,
Robert Griesemer40621d52009-11-09 12:07:39 -0800817 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
Rob Pike3c098e22009-08-30 14:02:42 -0700818 }
Rob Pike22c2b472009-08-28 23:05:16 -0700819}
820
// caseIt returns the cased form to expect for rune: cased itself when it is
// non-zero, otherwise rune, because a zero cased value in the Char means
// "use the rune itself".
func caseIt(rune, cased int) int {
	if cased != 0 {
		return cased
	}
	return rune
}
828
829func fullCaseTest() {
830 for i, c := range chars {
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800831 lower := unicode.ToLower(i)
832 want := caseIt(i, c.lowerCase)
Rob Pike22c2b472009-08-28 23:05:16 -0700833 if lower != want {
Robert Griesemer40621d52009-11-09 12:07:39 -0800834 fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower)
Rob Pike22c2b472009-08-28 23:05:16 -0700835 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800836 upper := unicode.ToUpper(i)
837 want = caseIt(i, c.upperCase)
Rob Pike22c2b472009-08-28 23:05:16 -0700838 if upper != want {
Robert Griesemer40621d52009-11-09 12:07:39 -0800839 fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper)
Rob Pike22c2b472009-08-28 23:05:16 -0700840 }
Robert Griesemer45ca9f72009-12-15 15:41:46 -0800841 title := unicode.ToTitle(i)
842 want = caseIt(i, c.titleCase)
Rob Pike22c2b472009-08-28 23:05:16 -0700843 if title != want {
Robert Griesemer40621d52009-11-09 12:07:39 -0800844 fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title)
Rob Pike22c2b472009-08-28 23:05:16 -0700845 }
846 }
847}