unicode: add case folding tables

R=r, r
CC=golang-dev
https://golang.org/cl/4571074
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index 39c7121..421d294 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -24,15 +24,18 @@
 func main() {
 	flag.Parse()
 	loadChars() // always needed
+	loadCasefold()
 	printCategories()
 	printScriptOrProperty(false)
 	printScriptOrProperty(true)
 	printCases()
 	printLatinProperties()
+	printCasefold()
 	printSizes()
 }
 
 var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
+var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
 var url = flag.String("url",
 	"http://www.unicode.org/Public/6.0.0/ucd/",
 	"URL of Unicode database directory")
@@ -119,6 +122,8 @@
 	upperCase int
 	lowerCase int
 	titleCase int
+	foldCase  int // simple case folding
+	caseOrbit int // next in simple case folding orbit
 }
 
 // Scripts.txt has form:
@@ -308,8 +313,53 @@
 	resp.Body.Close()
 }
 
+func loadCasefold() {
+	if *casefoldingURL == "" {
+		flag.Set("casefolding", *url+"CaseFolding.txt")
+	}
+	resp, err := http.Get(*casefoldingURL)
+	if err != nil {
+		logger.Fatal(err)
+	}
+	if resp.StatusCode != 200 {
+		logger.Fatal("bad GET status for CaseFolding.txt", resp.Status)
+	}
+	input := bufio.NewReader(resp.Body)
+	for {
+		line, err := input.ReadString('\n')
+		if err != nil {
+			if err == os.EOF {
+				break
+			}
+			logger.Fatal(err)
+		}
+		if line[0] == '#' {
+			continue
+		}
+		field := strings.Split(line, "; ", -1)
+		if len(field) != 4 {
+			logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
+		}
+		kind := field[1]
+		if kind != "C" && kind != "S" {
+			// Only care about 'common' and 'simple' foldings.
+			continue
+		}
+		p1, err := strconv.Btoui64(field[0], 16)
+		if err != nil {
+			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+		}
+		p2, err := strconv.Btoui64(field[2], 16)
+		if err != nil {
+			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
+		}
+		chars[p1].foldCase = int(p2)
+	}
+	resp.Body.Close()
+}
+
 const progHeader = `// Generated by running
-//	maketables --tables=%s --data=%s
+//	maketables --tables=%s --data=%s --casefolding=%s
 // DO NOT EDIT
 
 package unicode
@@ -330,7 +380,7 @@
 		fullCategoryTest(list)
 		return
 	}
-	fmt.Printf(progHeader, *tablelist, *dataURL)
+	fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
 
 	fmt.Println("// Version is the Unicode edition from which the tables are derived.")
 	fmt.Printf("const Version = %q\n\n", version())
@@ -837,13 +887,13 @@
 	}
 	fmt.Printf(
 		"// Generated by running\n"+
-			"//	maketables --data=%s\n"+
+			"//	maketables --data=%s --casefolding=%s\n"+
 			"// DO NOT EDIT\n\n"+
 			"// CaseRanges is the table describing case mappings for all letters with\n"+
 			"// non-self mappings.\n"+
 			"var CaseRanges = _CaseRanges\n"+
 			"var _CaseRanges = []CaseRange {\n",
-		*dataURL)
+		*dataURL, *casefoldingURL)
 
 	var startState *caseState    // the start of a run; nil for not active
 	var prevState = &caseState{} // the state of the previous character
@@ -946,13 +996,246 @@
 		if code == ' ' {
 			property = "pZ | pp"
 		}
-		fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code)
+		fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
 	}
-	fmt.Println("}")
+	fmt.Printf("}\n\n")
 }
 
-var range16Count = 0 // Number of entries in the 16-bit range tables.
-var range32Count = 0 // Number of entries in the 32-bit range tables.
+func printCasefold() {
+	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
+	var caseOrbit = make([][]int, MaxChar+1)
+	for i := range chars {
+		c := &chars[i]
+		if c.foldCase == 0 {
+			continue
+		}
+		orb := caseOrbit[c.foldCase]
+		if orb == nil {
+			orb = append(orb, c.foldCase)
+		}
+		caseOrbit[c.foldCase] = append(orb, i)
+	}
+
+	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
+	for i := range chars {
+		c := &chars[i]
+		f := c.foldCase
+		if f == 0 {
+			f = i
+		}
+		orb := caseOrbit[f]
+		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
+			// Default assumption of [upper, lower] is wrong.
+			caseOrbit[i] = []int{i}
+		}
+	}
+
+	// Delete the groups for which assuming [lower, upper] is right.
+	for i, orb := range caseOrbit {
+		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
+			caseOrbit[i] = nil
+		}
+	}
+
+	// Record orbit information in chars.
+	for _, orb := range caseOrbit {
+		if orb == nil {
+			continue
+		}
+		sort.SortInts(orb)
+		c := orb[len(orb)-1]
+		for _, d := range orb {
+			chars[c].caseOrbit = d
+			c = d
+		}
+	}
+
+	printCaseOrbit()
+
+	// Tables of category and script folding exceptions: code points
+	// that must be added when interpreting a particular category/script
+	// in a case-folding context.
+	cat := make(map[string]map[int]bool)
+	for name := range category {
+		if x := foldExceptions(inCategory(name)); len(x) > 0 {
+			cat[name] = x
+		}
+	}
+
+	scr := make(map[string]map[int]bool)
+	for name := range scripts {
+		if x := foldExceptions(inScript(name)); len(x) > 0 {
+			cat[name] = x
+		}
+	}
+
+	printCatFold("FoldCategory", cat)
+	printCatFold("FoldScript", scr)
+}
+
+// inCategory returns a list of all the runes in the category.
+func inCategory(name string) []int {
+	var x []int
+	for i := range chars {
+		c := &chars[i]
+		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
+			x = append(x, i)
+		}
+	}
+	return x
+}
+
+// inScript returns a list of all the runes in the script.
+func inScript(name string) []int {
+	var x []int
+	for _, s := range scripts[name] {
+		for c := s.lo; c <= s.hi; c++ {
+			x = append(x, int(c))
+		}
+	}
+	return x
+}
+
+// foldExceptions returns a list of all the runes fold-equivalent
+// to runes in class but not in class themselves.
+func foldExceptions(class []int) map[int]bool {
+	// Create map containing class and all fold-equivalent chars.
+	m := make(map[int]bool)
+	for _, r := range class {
+		c := &chars[r]
+		if c.caseOrbit == 0 {
+			// Just upper and lower.
+			if u := c.upperCase; u != 0 {
+				m[u] = true
+			}
+			if l := c.lowerCase; l != 0 {
+				m[l] = true
+			}
+			m[r] = true
+			continue
+		}
+		// Otherwise walk orbit.
+		r0 := r
+		for {
+			m[r] = true
+			r = chars[r].caseOrbit
+			if r == r0 {
+				break
+			}
+		}
+	}
+
+	// Remove class itself.
+	for _, r := range class {
+		m[r] = false, false
+	}
+
+	// What's left is the exceptions.
+	return m
+}
+
+var comment = map[string]string{
+	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
+		"// code points outside the category that are equivalent under\n" +
+		"// simple case folding to code points inside the category.\n" +
+		"// If there is no entry for a category name, there are no such points.\n",
+
+	"FoldScript": "// FoldScript maps a script name to a table of\n" +
+		"// code points outside the script that are equivalent under\n" +
+		"// simple case folding to code points inside the script.\n" +
+		"// If there is no entry for a script name, there are no such points.\n",
+}
+
+func printCaseOrbit() {
+	if *test {
+		for i := range chars {
+			c := &chars[i]
+			f := c.caseOrbit
+			if f == 0 {
+				if c.lowerCase != i && c.lowerCase != 0 {
+					f = c.lowerCase
+				} else if c.upperCase != i && c.upperCase != 0 {
+					f = c.upperCase
+				} else {
+					f = i
+				}
+			}
+			if g := unicode.SimpleFold(i); g != f {
+				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
+			}
+		}
+		return
+	}
+
+	fmt.Printf("var caseOrbit = []foldPair{\n")
+	for i := range chars {
+		c := &chars[i]
+		if c.caseOrbit != 0 {
+			fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
+			foldPairCount++
+		}
+	}
+	fmt.Printf("}\n\n")
+}
+
+func printCatFold(name string, m map[string]map[int]bool) {
+	if *test {
+		var pkgMap map[string]*unicode.RangeTable
+		if name == "FoldCategory" {
+			pkgMap = unicode.FoldCategory
+		} else {
+			pkgMap = unicode.FoldScript
+		}
+		if len(pkgMap) != len(m) {
+			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
+			return
+		}
+		for k, v := range m {
+			t, ok := pkgMap[k]
+			if !ok {
+				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
+				continue
+			}
+			n := 0
+			for _, r := range t.R16 {
+				for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+					if !v[c] {
+						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+					}
+					n++
+				}
+			}
+			for _, r := range t.R32 {
+				for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
+					if !v[c] {
+						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
+					}
+					n++
+				}
+			}
+			if n != len(v) {
+				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
+			}
+		}
+		return
+	}
+
+	fmt.Print(comment[name])
+	fmt.Printf("var %s = map[string]*RangeTable{\n", name)
+	for name := range m {
+		fmt.Printf("\t%q: fold%s,\n", name, name)
+	}
+	fmt.Printf("}\n\n")
+	for name, class := range m {
+		dumpRange(
+			fmt.Sprintf("var fold%s = &RangeTable{\n", name),
+			func(code int) bool { return class[code] })
+	}
+}
+
+var range16Count = 0  // Number of entries in the 16-bit range tables.
+var range32Count = 0  // Number of entries in the 32-bit range tables.
+var foldPairCount = 0 // Number of fold pairs in the exception tables.
 
 func printSizes() {
 	if *test {
@@ -963,4 +1246,6 @@
 	range16Bytes := range16Count * 3 * 2
 	range32Bytes := range32Count * 3 * 4
 	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
+	fmt.Println()
+	fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
 }