| // Copyright 2013 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package cldr |
| |
| import ( |
| "bufio" |
| "encoding/xml" |
| "errors" |
| "fmt" |
| "strconv" |
| "strings" |
| "unicode" |
| "unicode/utf8" |
| ) |
| |
// RuleProcessor receives the collation rules found while parsing a tailoring.
// An implementation is handed to Collation's Process method, which invokes
// the method that corresponds to each rule it encounters.
type RuleProcessor interface {
	// Reset is called for a reset rule with the anchor string and the
	// "before" level that was parsed for it (0 when none was given).
	Reset(anchor string, before int) error

	// Insert is called for a relation rule with its level, the string that
	// is being ordered, and the context and extension strings, which are
	// empty when the rule does not specify them.
	Insert(level int, str string, context string, extend string) error

	// Index is called for a rule value that starts with the index sentinel;
	// id is the value with that sentinel removed.
	Index(id string)
}
| |
const (
	// cldrIndex is the Unicode-reserved sentinel that marks the start of a
	// grouping within an index. A rule whose value begins with this rune is
	// not inserted; it is reported through RuleProcessor.Index instead.
	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the fmt template used to represent logical reset
	// positions, such as "first tertiary ignorable", as an anchor string.
	specialAnchor = "<%s/>"
)
| |
| // Process parses the rules for the tailorings of this collation |
| // and calls the respective methods of p for each rule found. |
| func (c Collation) Process(p RuleProcessor) (err error) { |
| if len(c.Cr) > 0 { |
| if len(c.Cr) > 1 { |
| return fmt.Errorf("multiple cr elements, want 0 or 1") |
| } |
| return processRules(p, c.Cr[0].Data()) |
| } |
| if c.Rules.Any != nil { |
| return c.processXML(p) |
| } |
| return errors.New("no tailoring data") |
| } |
| |
| // processRules parses rules in the Collation Rule Syntax defined in |
| // https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings. |
| func processRules(p RuleProcessor, s string) (err error) { |
| chk := func(s string, e error) string { |
| if err == nil { |
| err = e |
| } |
| return s |
| } |
| i := 0 // Save the line number for use after the loop. |
| scanner := bufio.NewScanner(strings.NewReader(s)) |
| for ; scanner.Scan() && err == nil; i++ { |
| for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) { |
| level := 5 |
| var ch byte |
| switch ch, s = s[0], s[1:]; ch { |
| case '&': // followed by <anchor> or '[' <key> ']' |
| if s = skipSpace(s); consume(&s, '[') { |
| s = chk(parseSpecialAnchor(p, s)) |
| } else { |
| s = chk(parseAnchor(p, 0, s)) |
| } |
| case '<': // sort relation '<'{1,4}, optionally followed by '*'. |
| for level = 1; consume(&s, '<'); level++ { |
| } |
| if level > 4 { |
| err = fmt.Errorf("level %d > 4", level) |
| } |
| fallthrough |
| case '=': // identity relation, optionally followed by *. |
| if consume(&s, '*') { |
| s = chk(parseSequence(p, level, s)) |
| } else { |
| s = chk(parseOrder(p, level, s)) |
| } |
| default: |
| chk("", fmt.Errorf("illegal operator %q", ch)) |
| break |
| } |
| } |
| } |
| if chk("", scanner.Err()); err != nil { |
| return fmt.Errorf("%d: %v", i, err) |
| } |
| return nil |
| } |
| |
| // parseSpecialAnchor parses the anchor syntax which is either of the form |
| // |
| // ['before' <level>] <anchor> |
| // |
| // or |
| // |
| // [<label>] |
| // |
| // The starting should already be consumed. |
| func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) { |
| i := strings.IndexByte(s, ']') |
| if i == -1 { |
| return "", errors.New("unmatched bracket") |
| } |
| a := strings.TrimSpace(s[:i]) |
| s = s[i+1:] |
| if strings.HasPrefix(a, "before ") { |
| l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3) |
| if err != nil { |
| return s, err |
| } |
| return parseAnchor(p, int(l), s) |
| } |
| return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0) |
| } |
| |
| func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) { |
| anchor, s, err := scanString(s) |
| if err != nil { |
| return s, err |
| } |
| return s, p.Reset(anchor, level) |
| } |
| |
| func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) { |
| var value, context, extend string |
| if value, s, err = scanString(s); err != nil { |
| return s, err |
| } |
| if strings.HasPrefix(value, cldrIndex) { |
| p.Index(value[len(cldrIndex):]) |
| return |
| } |
| if consume(&s, '|') { |
| if context, s, err = scanString(s); err != nil { |
| return s, errors.New("missing string after context") |
| } |
| } |
| if consume(&s, '/') { |
| if extend, s, err = scanString(s); err != nil { |
| return s, errors.New("missing string after extension") |
| } |
| } |
| return s, p.Insert(level, value, context, extend) |
| } |
| |
| // scanString scans a single input string. |
| func scanString(s string) (str, tail string, err error) { |
| if s = skipSpace(s); s == "" { |
| return s, s, errors.New("missing string") |
| } |
| buf := [16]byte{} // small but enough to hold most cases. |
| value := buf[:0] |
| for s != "" { |
| if consume(&s, '\'') { |
| i := strings.IndexByte(s, '\'') |
| if i == -1 { |
| return "", "", errors.New(`unmatched single quote`) |
| } |
| if i == 0 { |
| value = append(value, '\'') |
| } else { |
| value = append(value, s[:i]...) |
| } |
| s = s[i+1:] |
| continue |
| } |
| r, sz := utf8.DecodeRuneInString(s) |
| if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) { |
| break |
| } |
| value = append(value, s[:sz]...) |
| s = s[sz:] |
| } |
| return string(value), skipSpace(s), nil |
| } |
| |
| func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) { |
| if s = skipSpace(s); s == "" { |
| return s, errors.New("empty sequence") |
| } |
| last := rune(0) |
| for s != "" { |
| r, sz := utf8.DecodeRuneInString(s) |
| s = s[sz:] |
| |
| if r == '-' { |
| // We have a range. The first element was already written. |
| if last == 0 { |
| return s, errors.New("range without starter value") |
| } |
| r, sz = utf8.DecodeRuneInString(s) |
| s = s[sz:] |
| if r == utf8.RuneError || r < last { |
| return s, fmt.Errorf("invalid range %q-%q", last, r) |
| } |
| for i := last + 1; i <= r; i++ { |
| if err := p.Insert(level, string(i), "", ""); err != nil { |
| return s, err |
| } |
| } |
| last = 0 |
| continue |
| } |
| |
| if unicode.IsSpace(r) || unicode.IsPunct(r) { |
| break |
| } |
| |
| // normal case |
| if err := p.Insert(level, string(r), "", ""); err != nil { |
| return s, err |
| } |
| last = r |
| } |
| return s, nil |
| } |
| |
// skipSpace returns s without its leading Unicode white space. The result is
// empty when s consists of white space only.
func skipSpace(s string) string {
	start := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
	if start < 0 {
		return ""
	}
	return s[start:]
}
| |
// consume reports whether the first byte of *s is ch. If it is, that byte is
// removed from *s; otherwise *s is left untouched.
func consume(s *string, ch byte) (ok bool) {
	if str := *s; len(str) > 0 && str[0] == ch {
		*s = str[1:]
		return true
	}
	return false
}
| |
| // The following code parses Collation rules of CLDR version 24 and before. |
| |
// lmap maps the first letter of a legacy XML rule tag ("p", "s", "t", "i"
// and their "pc", "sc", "tc", "ic" variants) to the level that is passed to
// RuleProcessor.Insert for that tag.
var lmap = map[byte]int{
	'p': 1,
	's': 2,
	't': 3,
	'i': 5,
}
| |
| type rulesElem struct { |
| Rules struct { |
| Common |
| Any []*struct { |
| XMLName xml.Name |
| rule |
| } `xml:",any"` |
| } `xml:"rules"` |
| } |
| |
// rule is the XML representation of a single legacy collation rule element.
type rule struct {
	// Value is the character data of the element.
	Value string `xml:",chardata"`
	// Before is the value of the element's "before" attribute.
	Before string `xml:"before,attr"`
	// Any holds the nested child elements, whatever their tag; XMLName
	// records the tag of each one.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
| |
// emptyValueError is returned by (*rule).value when a rule has no character
// data and does not contain exactly one child element to use in its place.
var emptyValueError = errors.New("cldr: empty rule value")
| |
| func (r *rule) value() (string, error) { |
| // Convert hexadecimal Unicode codepoint notation to a string. |
| s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode) |
| r.Value = s |
| if s == "" { |
| if len(r.Any) != 1 { |
| return "", emptyValueError |
| } |
| r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local) |
| r.Any = nil |
| } else if len(r.Any) != 0 { |
| return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any) |
| } |
| return r.Value, nil |
| } |
| |
| func (r rule) process(p RuleProcessor, name, context, extend string) error { |
| v, err := r.value() |
| if err != nil { |
| return err |
| } |
| switch name { |
| case "p", "s", "t", "i": |
| if strings.HasPrefix(v, cldrIndex) { |
| p.Index(v[len(cldrIndex):]) |
| return nil |
| } |
| if err := p.Insert(lmap[name[0]], v, context, extend); err != nil { |
| return err |
| } |
| case "pc", "sc", "tc", "ic": |
| level := lmap[name[0]] |
| for _, s := range v { |
| if err := p.Insert(level, string(s), context, extend); err != nil { |
| return err |
| } |
| } |
| default: |
| return fmt.Errorf("cldr: unsupported tag: %q", name) |
| } |
| return nil |
| } |
| |
| // processXML parses the format of CLDR versions 24 and older. |
| func (c Collation) processXML(p RuleProcessor) (err error) { |
| // Collation is generated and defined in xml.go. |
| var v string |
| for _, r := range c.Rules.Any { |
| switch r.XMLName.Local { |
| case "reset": |
| level := 0 |
| switch r.Before { |
| case "primary", "1": |
| level = 1 |
| case "secondary", "2": |
| level = 2 |
| case "tertiary", "3": |
| level = 3 |
| case "": |
| default: |
| return fmt.Errorf("cldr: unknown level %q", r.Before) |
| } |
| v, err = r.value() |
| if err == nil { |
| err = p.Reset(v, level) |
| } |
| case "x": |
| var context, extend string |
| for _, r1 := range r.Any { |
| v, err = r1.value() |
| switch r1.XMLName.Local { |
| case "context": |
| context = v |
| case "extend": |
| extend = v |
| } |
| } |
| for _, r1 := range r.Any { |
| if t := r1.XMLName.Local; t == "context" || t == "extend" { |
| continue |
| } |
| r1.rule.process(p, r1.XMLName.Local, context, extend) |
| } |
| default: |
| err = r.rule.process(p, r.XMLName.Local, "", "") |
| } |
| if err != nil { |
| return err |
| } |
| } |
| return nil |
| } |