first cut at case mapping tables and library.
next cut will do the optimization for alternating sequences.

R=rsc
DELTA=1658 (1620 added, 9 deleted, 29 changed)
OCL=34072
CL=34075

commit: 22c2b476a84f541a6a6818f4d3dbf30cb0802867 [log] [tgz]
author: Rob Pike <r@golang.org> Fri Aug 28 23:05:16 2009 -0700
committer: Rob Pike <r@golang.org> Fri Aug 28 23:05:16 2009 -0700
tree: e8cf0917b412c1a14cb1b13d1f90f13c179ea886
parent: 30dcb13420342880ac8b9b4d34ad25e9d6d86d65 [diff] [blame]
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index b5ae373..a26f9bd 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go

@@ -21,6 +21,14 @@
 	"unicode";
 )
 
+func main() {
+	flag.Parse();
+	loadChars();	// always needed
+	printCategories();
+	printScripts();
+	printCases();
+}
+
 var dataUrl = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt");
 var url = flag.String("url",
 	"http://www.unicode.org/Public/5.1.0/ucd/",
@@ -31,6 +39,9 @@
 var scriptlist = flag.String("scripts",
 	"all",
 	"comma-separated list of which script tables to generate");
+var cases = flag.Bool("cases",
+	true,
+	"generate case tables");
 var test = flag.Bool("test",
 	false,
 	"test existing tables; can be used to compare web data with package data");
@@ -44,7 +55,7 @@
 //	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
 //	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
 // See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
-// The fields
+// The fields:
 const (
 	FCodePoint = iota;
 	FName;
@@ -87,11 +98,11 @@
 // This contains only the properties we're interested in.
 type Char struct {
 	field	[]string; 	// debugging only; could be deleted if we take out char.dump()
-	codePoint	uint32;	// redundant (it's the index in the chars table) but useful
+	codePoint	uint32;	// if zero, this index is not a valid code point.
 	category	string;
-	upperCase	uint32;
-	lowerCase	uint32;
-	titleCase	uint32;
+	upperCase	int;
+	lowerCase	int;
+	titleCase	int;
 }
 
 // Scripts.txt has form:
@@ -104,26 +115,21 @@
 	script	string;
 }
 
-func main() {
-	flag.Parse();
-	printCategories();
-	printScripts();
-}
-
-var chars = make([]Char, MaxChar)
+var chars = make([]Char, MaxChar+1)
 var scripts = make(map[string] []Script)
 
 var lastChar uint32 = 0;
 
 // In UnicodeData.txt, some ranges are marked like this:
-// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 // parseCategory returns a state variable indicating the weirdness.
 type State int
 const (
 	SNormal State = iota;	// known to be zero for the type
 	SFirst;
 	SLast;
+	SMissing;
 )
 
 func parseCategory(line string) (state State) {
@@ -139,7 +145,7 @@
 	if point == 0 {
 		return	// not interesting and we use 0 as unset
 	}
-	if point >= MaxChar {
+	if point > MaxChar {
 		return;
 	}
 	char := &chars[point];
@@ -189,7 +195,7 @@
 	char.titleCase = char.letterValue(t, "T");
 }
 
-func (char *Char) letterValue(s string, cas string) uint32 {
+func (char *Char) letterValue(s string, cas string) int {
 	if s == "" {
 		return 0
 	}
@@ -198,7 +204,7 @@
 		char.dump(cas);
 		die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err)
 	}
-	return uint32(v)
+	return int(v)
 }
 
 func allCategories() []string {
@@ -242,10 +248,7 @@
 	return false
 }
 
-func printCategories() {
-	if *tablelist == "" {
-		return
-	}
+func loadChars() {
 	if *dataUrl == "" {
 		flag.Set("data", *url + "UnicodeData.txt");
 	}
@@ -288,6 +291,12 @@
 		}
 	}
 	resp.Body.Close();
+}
+
+func printCategories() {
+	if *tablelist == "" {
+		return
+	}
 	// Find out which categories to dump
 	list := strings.Split(*tablelist, ",", 0);
 	if *tablelist == "all" {
@@ -299,11 +308,11 @@
 	}
 	fmt.Printf(
 		"// Generated by running\n"
-		"//	maketables --tables=%s --url=%s\n"
+		"//	maketables --tables=%s --data=%s\n"
 		"// DO NOT EDIT\n\n"
 		"package unicode\n\n",
 		*tablelist,
-		*url
+		*dataUrl
 	);
 
 	fmt.Println("// Version is the Unicode edition from which the tables are derived.");
@@ -496,6 +505,9 @@
 }
 
 func printScripts() {
+	if *scriptlist == "" {
+		return
+	}
 	var err os.Error;
 	scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`);
 	if err != nil {
@@ -604,3 +616,148 @@
 		}
 	}
 }
+
+const (
+	CaseUpper = 1 << iota;
+	CaseLower;
+	CaseTitle;
+	CaseNone = 0;	// must be zero
+	CaseMissing = -1;	// character not present; not a valid case state
+)
+
+type caseState struct {
+	point	int;
+	_case	int;
+	deltaToUpper	int;
+	deltaToLower	int;
+	deltaToTitle	int;
+}
+
+// Is d a continuation of the state of c?
+func (c *caseState) adjacent(d *caseState) bool {
+	if d.point < c.point {
+		return d.adjacent(c)
+	}
+	switch {
+	case d.point != c.point+1:
+		return false
+	case d._case != c._case:
+		return false
+	case c._case == CaseNone:
+		return false
+	case c._case == CaseMissing:
+		return false
+	case d.deltaToUpper != c.deltaToUpper:
+		return false
+	case d.deltaToLower != c.deltaToLower:
+		return false
+	case d.deltaToTitle != c.deltaToTitle:
+		return false
+	}
+	return true; 
+}
+
+func getCaseState(i int) (c *caseState) {
+	c = &caseState{ point: i, _case: CaseNone };
+	ch := &chars[i];
+	switch int(ch.codePoint) {
+	case 0:
+		c._case = CaseMissing;	// Will get NUL wrong but that doesn't matter
+		return;
+	case ch.upperCase:
+		c._case = CaseUpper;
+	case ch.lowerCase:
+		c._case = CaseLower;
+	case ch.titleCase:
+		c._case = CaseTitle;
+	}
+	if ch.upperCase != 0 {
+		c.deltaToUpper = ch.upperCase - i
+	}
+	if ch.lowerCase != 0 {
+		c.deltaToLower = ch.lowerCase - i
+	}
+	if ch.titleCase != 0 {
+		c.deltaToTitle = ch.titleCase - i
+	}
+	return;
+}
+
+func printCases() {
+	if !*cases {
+		return
+	}
+	if *test {
+		fullCaseTest();
+		return
+	}
+	fmt.Printf(
+		"// Generated by running\n"
+		"//	maketables --data=%s\n"
+		"// DO NOT EDIT\n\n"
+		"// CaseRanges is the table describing case mappings for all letters with\n"
+		"// non-self mappings.\n"
+		"var CaseRanges = _CaseRanges\n"
+		"var _CaseRanges = []CaseRange {\n",
+		*dataUrl
+	);
+
+	var startState *caseState;	// the start of a run; nil for not active
+	var prevState = &caseState{};	// the state of the previous character
+	for i, c := range chars {
+		state := getCaseState(i);
+		if state.adjacent(prevState) {
+			prevState = state;
+			continue;
+		}
+		// end of run (possibly)
+		printCaseRange(startState, prevState);
+		startState = nil;
+		if state._case != CaseMissing && state._case != CaseNone {
+			startState = state;
+		}
+		prevState = state;
+	}
+	fmt.Printf("}\n");
+}
+
+func printCaseRange(lo, hi *caseState) {
+	if lo == nil {
+		return
+	}
+	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
+		// character represents itself in all cases - no need to mention it
+		return
+	}
+	fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
+		lo.point, hi.point,
+		lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
+}
+
+// If the cased value in the Char is 0, it means use the rune itself.
+func caseIt(rune, cased int) int {
+	if cased == 0 {
+		return rune
+	}
+	return cased
+}
+
+func fullCaseTest() {
+	for i, c := range chars {
+		lower := unicode.ToLower(i);
+		want := caseIt(i, c.lowerCase);
+		if lower != want {
+			fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, lower);
+		}
+		upper := unicode.ToUpper(i);
+		want = caseIt(i, c.upperCase);
+		if upper != want {
+			fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, upper);
+		}
+		title := unicode.ToTitle(i);
+		want = caseIt(i, c.titleCase);
+		if title != want {
+			fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, title);
+		}
+	}
+}
commit	22c2b476a84f541a6a6818f4d3dbf30cb0802867	[log] [tgz]
author	Rob Pike <r@golang.org>	Fri Aug 28 23:05:16 2009 -0700
committer	Rob Pike <r@golang.org>	Fri Aug 28 23:05:16 2009 -0700
tree	e8cf0917b412c1a14cb1b13d1f90f13c179ea886
parent	30dcb13420342880ac8b9b4d34ad25e9d6d86d65 [diff] [blame]