first cut at case mapping tables and library.
next cut will do the optimization for alternating sequences.
R=rsc
DELTA=1658 (1620 added, 9 deleted, 29 changed)
OCL=34072
CL=34075
diff --git a/src/pkg/unicode/maketables.go b/src/pkg/unicode/maketables.go
index b5ae373..a26f9bd 100644
--- a/src/pkg/unicode/maketables.go
+++ b/src/pkg/unicode/maketables.go
@@ -21,6 +21,14 @@
"unicode";
)
+// main parses the command-line flags, loads the character data, and then
+// calls each table generator in turn. Each generator consults its own flag
+// and returns immediately when it has nothing to do.
+func main() {
+ flag.Parse();
+ loadChars(); // always needed
+ printCategories();
+ printScripts();
+ printCases();
+}
+
var dataUrl = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt");
var url = flag.String("url",
"http://www.unicode.org/Public/5.1.0/ucd/",
@@ -31,6 +39,9 @@
var scriptlist = flag.String("scripts",
"all",
"comma-separated list of which script tables to generate");
+// cases gates printCases: when false, case tables are neither generated
+// nor tested.
+var cases = flag.Bool("cases",
+ true,
+ "generate case tables");
var test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data");
@@ -44,7 +55,7 @@
// 0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
// 007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation
-// The fields
+// The fields:
const (
FCodePoint = iota;
FName;
@@ -87,11 +98,11 @@
// This contains only the properties we're interested in.
type Char struct {
field []string; // debugging only; could be deleted if we take out char.dump()
- codePoint uint32; // redundant (it's the index in the chars table) but useful
+ codePoint uint32; // if zero, this index is not a valid code point.
category string;
- upperCase uint32;
- lowerCase uint32;
- titleCase uint32;
+ upperCase int; // upper-case form; 0 means use the rune itself (see caseIt)
+ lowerCase int; // lower-case form; 0 means use the rune itself (see caseIt)
+ titleCase int; // title-case form; 0 means use the rune itself (see caseIt)
}
// Scripts.txt has form:
@@ -104,26 +115,21 @@
script string;
}
-func main() {
- flag.Parse();
- printCategories();
- printScripts();
-}
-
-var chars = make([]Char, MaxChar)
+var chars = make([]Char, MaxChar+1)
var scripts = make(map[string] []Script)
var lastChar uint32 = 0;
// In UnicodeData.txt, some ranges are marked like this:
-// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
-// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCategory returns a state variable indicating the weirdness.
type State int
const (
SNormal State = iota; // known to be zero for the type
SFirst;
SLast;
+ SMissing; // NOTE(review): no use is visible in this patch; presumably marks a code point with no data line - confirm
)
func parseCategory(line string) (state State) {
@@ -139,7 +145,7 @@
if point == 0 {
return // not interesting and we use 0 as unset
}
- if point >= MaxChar {
+ if point > MaxChar {
return;
}
char := &chars[point];
@@ -189,7 +195,7 @@
char.titleCase = char.letterValue(t, "T");
}
-func (char *Char) letterValue(s string, cas string) uint32 {
+func (char *Char) letterValue(s string, cas string) int {
if s == "" {
return 0
}
@@ -198,7 +204,7 @@
char.dump(cas);
die.Logf("U+%04x: bad letter(%s): %s", char.codePoint, s, err)
}
- return uint32(v)
+ return int(v)
}
func allCategories() []string {
@@ -242,10 +248,7 @@
return false
}
-func printCategories() {
- if *tablelist == "" {
- return
- }
+func loadChars() {
if *dataUrl == "" {
flag.Set("data", *url + "UnicodeData.txt");
}
@@ -288,6 +291,12 @@
}
}
resp.Body.Close();
+}
+
+func printCategories() {
+ if *tablelist == "" {
+ return
+ }
// Find out which categories to dump
list := strings.Split(*tablelist, ",", 0);
if *tablelist == "all" {
@@ -299,11 +308,11 @@
}
fmt.Printf(
"// Generated by running\n"
- "// maketables --tables=%s --url=%s\n"
+ "// maketables --tables=%s --data=%s\n"
"// DO NOT EDIT\n\n"
"package unicode\n\n",
*tablelist,
- *url
+ *dataUrl
);
fmt.Println("// Version is the Unicode edition from which the tables are derived.");
@@ -496,6 +505,9 @@
}
func printScripts() {
+ if *scriptlist == "" {
+ return
+ }
var err os.Error;
scriptRe, err = regexp.Compile(`([0-9A-F]+)(\.\.[0-9A-F]+)? +; ([A-Za-z_]+)`);
if err != nil {
@@ -604,3 +616,148 @@
}
}
}
+
+// Values stored in caseState._case. The three real cases are distinct bits
+// (1, 2, 4); CaseNone and CaseMissing are sentinels that never form runs
+// (see caseState.adjacent).
+const (
+ CaseUpper = 1 << iota;
+ CaseLower;
+ CaseTitle;
+ CaseNone = 0; // must be zero
+ CaseMissing = -1; // character not present; not a valid case state
+)
+
+// caseState records the case behavior of one code point, as computed by
+// getCaseState. Runs of adjacent, identical states are merged into a single
+// CaseRange by printCases.
+type caseState struct {
+ point int; // the code point (its index in chars)
+ _case int; // one of CaseUpper, CaseLower, CaseTitle, CaseNone, CaseMissing
+ deltaToUpper int; // upper-case form minus point; 0 when no upper-case form is recorded
+ deltaToLower int; // lower-case form minus point; 0 when no lower-case form is recorded
+ deltaToTitle int; // title-case form minus point; 0 when no title-case form is recorded
+}
+
+// adjacent reports whether c and d are neighboring code points that can
+// share one table entry. The arguments may be given in either order. Two
+// states are adjacent when their points differ by exactly one, they have the
+// same real case (not CaseNone or CaseMissing), and all three deltas match.
+func (c *caseState) adjacent(d *caseState) bool {
+	// Arrange the pair so that first does not come after second.
+	first, second := c, d;
+	if second.point < first.point {
+		first, second = second, first;
+	}
+	if second.point != first.point+1 || second._case != first._case {
+		return false
+	}
+	// Uncased and missing characters never extend a run.
+	if first._case == CaseNone || first._case == CaseMissing {
+		return false
+	}
+	return second.deltaToUpper == first.deltaToUpper &&
+		second.deltaToLower == first.deltaToLower &&
+		second.deltaToTitle == first.deltaToTitle;
+}
+
+// getCaseState builds the caseState for code point i from the chars table.
+// An entry whose codePoint is zero is reported as CaseMissing with no deltas.
+// Otherwise the case is that of the first mapping (upper, lower, title, in
+// that order) that equals the code point itself, or CaseNone if none does.
+// Each delta is the recorded mapping minus i, left at zero when the mapping
+// is zero.
+func getCaseState(i int) (c *caseState) {
+	c = &caseState{ point: i, _case: CaseNone };
+	ch := &chars[i];
+	self := int(ch.codePoint);
+	if self == 0 {
+		c._case = CaseMissing; // Will get NUL wrong but that doesn't matter
+		return c;
+	}
+	if self == ch.upperCase {
+		c._case = CaseUpper
+	} else if self == ch.lowerCase {
+		c._case = CaseLower
+	} else if self == ch.titleCase {
+		c._case = CaseTitle
+	}
+	if up := ch.upperCase; up != 0 {
+		c.deltaToUpper = up - i
+	}
+	if low := ch.lowerCase; low != 0 {
+		c.deltaToLower = low - i
+	}
+	if title := ch.titleCase; title != 0 {
+		c.deltaToTitle = title - i
+	}
+	return c;
+}
+
+// printCases writes the CaseRanges table to standard output. It does
+// nothing when the cases flag is false, and when the test flag is set it
+// runs fullCaseTest instead of generating output.
+//
+// The table is built by visiting every index of chars in order and merging
+// neighbors whose case states are adjacent into one CaseRange entry.
+func printCases() {
+	if !*cases {
+		return
+	}
+	if *test {
+		fullCaseTest();
+		return
+	}
+	fmt.Printf(
+		"// Generated by running\n"
+		"//	maketables --data=%s\n"
+		"// DO NOT EDIT\n\n"
+		"// CaseRanges is the table describing case mappings for all letters with\n"
+		"// non-self mappings.\n"
+		"var CaseRanges = _CaseRanges\n"
+		"var _CaseRanges = []CaseRange {\n",
+		*dataUrl
+	);
+
+	var startState *caseState;	// the start of a run; nil for not active
+	var prevState = &caseState{};	// the state of the previous character
+	// Only the index is needed; getCaseState reads chars itself.
+	for i := range chars {
+		state := getCaseState(i);
+		if state.adjacent(prevState) {
+			prevState = state;
+			continue;
+		}
+		// end of run (possibly)
+		printCaseRange(startState, prevState);
+		startState = nil;
+		if state._case != CaseMissing && state._case != CaseNone {
+			startState = state;
+		}
+		prevState = state;
+	}
+	// The loop emits a run only on reaching the character that breaks it,
+	// so a run extending through the last entry of chars must be flushed
+	// here. printCaseRange ignores a nil start, so this is a no-op when no
+	// run is active.
+	printCaseRange(startState, prevState);
+	fmt.Printf("}\n");
+}
+
+// printCaseRange prints one CaseRange table entry covering the run from lo
+// through hi, using lo's deltas for the whole run. It prints nothing when
+// lo is nil (no run active) or when every delta is zero, since such
+// characters represent themselves in all cases and need no entry.
+func printCaseRange(lo, hi *caseState) {
+	if lo == nil || (lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0) {
+		return
+	}
+	first, last := lo.point, hi.point;
+	fmt.Printf("\tCaseRange{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
+		first, last,
+		lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
+}
+
+// caseIt returns cased, the mapping recorded in a Char, unless it is zero;
+// a zero mapping means the rune stands for itself, so rune is returned.
+func caseIt(rune, cased int) int {
+	if cased != 0 {
+		return cased
+	}
+	return rune
+}
+
+// fullCaseTest compares the unicode package's ToLower, ToUpper and ToTitle
+// against the loaded character data for every index of chars, and reports
+// each disagreement on standard error. A zero mapping in the data means
+// the character is expected to map to itself (see caseIt).
+func fullCaseTest() {
+	for i := range chars {
+		ch := &chars[i];
+		if got, want := unicode.ToLower(i), caseIt(i, ch.lowerCase); got != want {
+			fmt.Fprintf(os.Stderr, "lower U+%04X should be U+%04X is U+%04X\n", i, want, got);
+		}
+		if got, want := unicode.ToUpper(i), caseIt(i, ch.upperCase); got != want {
+			fmt.Fprintf(os.Stderr, "upper U+%04X should be U+%04X is U+%04X\n", i, want, got);
+		}
+		if got, want := unicode.ToTitle(i), caseIt(i, ch.titleCase); got != want {
+			fmt.Fprintf(os.Stderr, "title U+%04X should be U+%04X is U+%04X\n", i, want, got);
+		}
+	}
+}