unicode/cldr/collate.go - text - Git at Google

 // Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package cldr

 import (
 	"bufio"
 	"encoding/xml"
 	"errors"
 	"fmt"
 	"strconv"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 )

 // RuleProcessor can be passed to Collation's Process method, which
 // parses the rules and calls the respective method for each rule found.
 type RuleProcessor interface {
 	// Reset is called for each reset rule with the anchor string and the
 	// "before" level; the parsers pass 0 when no level was specified.
 	Reset(anchor string, before int) error
 	// Insert is called for each ordering rule with its level, the string
 	// being ordered and the optional context and extension strings.
 	Insert(level int, str, context, extend string) error
 	// Index is called with the remainder of a rule value that starts with
 	// the cldrIndex sentinel.
 	Index(id string)
 }

 const (
 	// cldrIndex is a Unicode-reserved sentinel value used to mark the start
 	// of a grouping within an index.
 	// The parsers report a value that starts with this rune through
 	// RuleProcessor.Index, with the sentinel stripped, instead of inserting it.
 	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
 	cldrIndex = "\uFDD0"

 	// specialAnchor is the format in which to represent logical reset positions,
 	// such as "first tertiary ignorable". The label is wrapped so that it
 	// reads like an empty XML element, e.g. "<first tertiary ignorable/>".
 	specialAnchor = "<%s/>"
 )

 // Process reads the tailoring rules of this collation and reports every
 // rule it finds to p.
 //
 // A single cr element is parsed with the textual rule syntax; when there is
 // none, the elements collected under Rules are processed instead. An error
 // is returned when there is more than one cr element or when the collation
 // holds no tailoring data at all.
 func (c Collation) Process(p RuleProcessor) (err error) {
 	switch n := len(c.Cr); {
 	case n > 1:
 		return errors.New("multiple cr elements, want 0 or 1")
 	case n == 1:
 		return processRules(p, c.Cr[0].Data())
 	case c.Rules.Any != nil:
 		return c.processXML(p)
 	}
 	return errors.New("no tailoring data")
 }

 // processRules parses rules in the Collation Rule Syntax defined in
 // https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
 //
 // The input is handled line by line. Everything from a '#' that starts a
 // rule position to the end of the line is skipped. Each rule starts with an
 // operator:
 //
 //	&        reset, followed by an anchor or a bracketed special anchor
 //	< .. <<<< sort relation of level 1 to 4, optionally followed by '*'
 //	=        identity relation (level 5), optionally followed by '*'
 //
 // Parsing stops at the first error: no further rules, not even the ones on
 // the same line, are reported to p once an error has been recorded. The
 // returned error is prefixed with the 1-based number of the offending line.
 func processRules(p RuleProcessor, s string) (err error) {
 	// chk records the first error and passes the remaining input through.
 	chk := func(s string, e error) string {
 		if err == nil {
 			err = e
 		}
 		return s
 	}
 	i := 0 // Save the line number for use after the loop.
 	scanner := bufio.NewScanner(strings.NewReader(s))
 	for ; scanner.Scan() && err == nil; i++ {
 		// The err check keeps a failed rule from being followed by more
 		// callbacks for the rest of the line.
 		for s := skipSpace(scanner.Text()); err == nil && s != "" && s[0] != '#'; s = skipSpace(s) {
 			var ch byte
 			switch ch, s = s[0], s[1:]; ch {
 			case '&': // followed by <anchor> or '[' <key> ']'
 				if s = skipSpace(s); consume(&s, '[') {
 					s = chk(parseSpecialAnchor(p, s))
 				} else {
 					s = chk(parseAnchor(p, 0, s))
 				}
 			case '<', '=':
 				level := 5 // identity relation
 				if ch == '<' { // sort relation '<'{1,4}
 					for level = 1; consume(&s, '<'); level++ {
 					}
 					if level > 4 {
 						err = fmt.Errorf("level %d > 4", level)
 						break // do not report a rule with an invalid level
 					}
 				}
 				// Both relations are optionally followed by '*'.
 				if consume(&s, '*') {
 					s = chk(parseSequence(p, level, s))
 				} else {
 					s = chk(parseOrder(p, level, s))
 				}
 			default:
 				err = fmt.Errorf("illegal operator %q", ch)
 			}
 		}
 	}
 	if chk("", scanner.Err()); err != nil {
 		return fmt.Errorf("%d: %v", i, err)
 	}
 	return nil
 }

 // parseSpecialAnchor handles a bracketed reset position, which is either
 //    ['before' <level>] <anchor>
 // or
 //    [<label>]
 // The opening '[' must already have been consumed by the caller.
 //
 // For the first form the level is parsed and the anchor that follows the
 // bracket is handed to parseAnchor. For the second form the label is
 // formatted with specialAnchor and reported through p.Reset with level 0.
 // It returns the input that follows the parsed part.
 func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
 	end := strings.IndexByte(s, ']')
 	if end < 0 {
 		return "", errors.New("unmatched bracket")
 	}
 	label, rest := strings.TrimSpace(s[:end]), s[end+1:]
 	if !strings.HasPrefix(label, "before ") {
 		return rest, p.Reset(fmt.Sprintf(specialAnchor, label), 0)
 	}
 	digits := skipSpace(strings.TrimPrefix(label, "before "))
 	// A bit size of 3 makes ParseUint reject anything above 7.
 	n, perr := strconv.ParseUint(digits, 10, 3)
 	if perr != nil {
 		return rest, perr
 	}
 	return parseAnchor(p, int(n), rest)
 }

 // parseAnchor scans one string from s and reports it to p.Reset as the
 // anchor for the given before level. It returns the unparsed remainder.
 func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
 	var anchor string
 	if anchor, tail, err = scanString(s); err != nil {
 		return tail, err
 	}
 	return tail, p.Reset(anchor, level)
 }

 // parseOrder parses a single ordering rule of the form
 //    <value> ['|' <context>] ['/' <extension>]
 // and reports it to p.Insert at the given level. It returns the input that
 // follows the rule.
 //
 // A value that starts with cldrIndex is reported through p.Index, with the
 // sentinel stripped, instead of being inserted. The remainder of the line is
 // still returned so that rules following the index entry are not lost.
 func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
 	var value, context, extend string
 	if value, s, err = scanString(s); err != nil {
 		return s, err
 	}
 	if strings.HasPrefix(value, cldrIndex) {
 		p.Index(value[len(cldrIndex):])
 		// Return s explicitly: a naked return would hand back the empty
 		// named result and drop whatever follows on the line.
 		return s, nil
 	}
 	if consume(&s, '|') {
 		if context, s, err = scanString(s); err != nil {
 			return s, errors.New("missing string after context")
 		}
 	}
 	if consume(&s, '/') {
 		if extend, s, err = scanString(s); err != nil {
 			return s, errors.New("missing string after extension")
 		}
 	}
 	return s, p.Insert(level, value, context, extend)
 }

 // scanString reads one string from the front of s, after skipping leading
 // white space, and returns it together with the remaining input (also with
 // leading white space removed).
 //
 // Text between single quotes is copied verbatim; an empty pair of quotes
 // yields a single quote character. Outside quotes the string ends at white
 // space or at one of the characters '&', '<', '=' and '#'. An error is
 // returned for empty input and for a quote that is never closed.
 func scanString(s string) (str, tail string, err error) {
 	s = skipSpace(s)
 	if s == "" {
 		return s, s, errors.New("missing string")
 	}
 	var scratch [16]byte // small but enough to hold most cases.
 	out := scratch[:0]
 	for len(s) > 0 {
 		if consume(&s, '\'') {
 			end := strings.IndexByte(s, '\'')
 			switch {
 			case end < 0:
 				return "", "", errors.New(`unmatched single quote`)
 			case end == 0:
 				out = append(out, '\'')
 			default:
 				out = append(out, s[:end]...)
 			}
 			s = s[end+1:]
 			continue
 		}
 		r, width := utf8.DecodeRuneInString(s)
 		if unicode.IsSpace(r) || r == '&' || r == '<' || r == '=' || r == '#' {
 			break
 		}
 		out = append(out, s[:width]...)
 		s = s[width:]
 	}
 	return string(out), skipSpace(s), nil
 }

 // parseSequence parses the operand of a starred relation: a run of
 // characters that are each reported to p.Insert at the given level, with
 // empty context and extension. Within the run, "a-c" denotes the range of
 // code points from a to c inclusive.
 //
 // The run ends at the first white space or punctuation rune other than '-'.
 // That rune is left in the returned tail, so that an operator such as '&'
 // or a '#' comment that directly follows the run is still seen by the
 // caller. An error is returned for an empty sequence, for a range without a
 // starting character and for a range whose end is invalid or precedes its
 // start.
 func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
 	if s = skipSpace(s); s == "" {
 		return s, errors.New("empty sequence")
 	}
 	last := rune(0)
 	for s != "" {
 		r, sz := utf8.DecodeRuneInString(s)

 		if r == '-' {
 			s = s[sz:]
 			// We have a range. The first element was already written.
 			if last == 0 {
 				return s, errors.New("range without starter value")
 			}
 			r, sz = utf8.DecodeRuneInString(s)
 			s = s[sz:]
 			if r == utf8.RuneError || r < last {
 				return s, fmt.Errorf("invalid range %q-%q", last, r)
 			}
 			for i := last + 1; i <= r; i++ {
 				if err := p.Insert(level, string(i), "", ""); err != nil {
 					return s, err
 				}
 			}
 			// A range end cannot start another range.
 			last = 0
 			continue
 		}

 		if unicode.IsSpace(r) || unicode.IsPunct(r) {
 			// Do not consume the terminator; it belongs to the caller.
 			break
 		}
 		s = s[sz:]

 		// normal case
 		if err := p.Insert(level, string(r), "", ""); err != nil {
 			return s, err
 		}
 		last = r
 	}
 	return s, nil
 }

 // skipSpace returns s without its leading white space, as defined by
 // unicode.IsSpace. The result is empty when s holds nothing but white space.
 func skipSpace(s string) string {
 	start := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
 	if start < 0 {
 		return ""
 	}
 	return s[start:]
 }

 // consume reports whether the first byte of *s is ch. When it is, that byte
 // is removed from *s; otherwise *s is left untouched.
 func consume(s *string, ch byte) (ok bool) {
 	if rest := *s; len(rest) > 0 && rest[0] == ch {
 		*s = rest[1:]
 		return true
 	}
 	return false
 }

 // The following code parses Collation rules of CLDR version 24 and before.

 // lmap maps the first letter of a legacy rule tag name ("p", "s", "t", "i"
 // and their "pc", "sc", "tc", "ic" variants) to the level that rule.process
 // passes to RuleProcessor.Insert. 'i' maps to 5, the same level that
 // processRules uses for the '=' relation.
 var lmap = map[byte]int{
 	'p': 1,
 	's': 2,
 	't': 3,
 	'i': 5,
 }

 // rulesElem describes the shape of a legacy <rules> XML element: the
 // embedded Common fields plus every child element, decoded as a rule and
 // stored together with its element name.
 // NOTE(review): nothing in this part of the file refers to rulesElem;
 // presumably it is used by the generated decoding code — confirm.
 type rulesElem struct {
 	Rules struct {
 		Common
 		// Any receives all child elements, whatever their tag name.
 		Any []*struct {
 			XMLName xml.Name
 			rule
 		} `xml:",any"`
 	} `xml:"rules"`
 }

 // rule is a single element of a legacy XML collation rule list.
 type rule struct {
 	// Value is the character data of the element. The value method rewrites
 	// it in place.
 	Value  string `xml:",chardata"`
 	// Before is the "before" attribute, which processXML reads on reset
 	// elements to select the level.
 	Before string `xml:"before,attr"`
 	// Any holds the nested child elements together with their tag names.
 	Any    []*struct {
 		XMLName xml.Name
 		rule
 	} `xml:",any"`
 }

 // emptyValueError is returned by (*rule).value when a rule has no character
 // data and does not have exactly one child element to stand in for it.
 var emptyValueError = errors.New("cldr: empty rule value")

 // value returns the string this rule stands for and stores it in r.Value.
 //
 // Hexadecimal Unicode code point notation in the character data is converted
 // first. A rule without character data must have exactly one child element;
 // the name of that element, formatted with specialAnchor, becomes the value
 // and the child is dropped. A rule with character data must not have child
 // elements.
 func (r *rule) value() (string, error) {
 	r.Value = charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
 	switch {
 	case r.Value != "" && len(r.Any) != 0:
 		return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
 	case r.Value != "":
 		return r.Value, nil
 	case len(r.Any) != 1:
 		return "", emptyValueError
 	}
 	// The single child element names a logical position.
 	r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
 	r.Any = nil
 	return r.Value, nil
 }

 // process reports the rule to p according to its tag name.
 //
 // The tags "p", "s", "t" and "i" insert the whole value at the level given
 // by lmap, unless the value starts with cldrIndex, in which case the rest of
 // the value is passed to p.Index. The tags "pc", "sc", "tc" and "ic" insert
 // every rune of the value separately. Any other tag is an error. The given
 // context and extend strings are passed along with every insertion.
 func (r rule) process(p RuleProcessor, name, context, extend string) error {
 	val, err := r.value()
 	if err != nil {
 		return err
 	}
 	switch name {
 	case "p", "s", "t", "i":
 		if strings.HasPrefix(val, cldrIndex) {
 			p.Index(strings.TrimPrefix(val, cldrIndex))
 			return nil
 		}
 		return p.Insert(lmap[name[0]], val, context, extend)
 	case "pc", "sc", "tc", "ic":
 		lvl := lmap[name[0]]
 		for _, ch := range val {
 			if err := p.Insert(lvl, string(ch), context, extend); err != nil {
 				return err
 			}
 		}
 		return nil
 	}
 	return fmt.Errorf("cldr: unsupported tag: %q", name)
 }

 // processXML parses the format of CLDR versions 24 and older.
 //
 // The elements collected under Rules are handled in order: a "reset"
 // element is reported through p.Reset with the level named by its "before"
 // attribute, an "x" element supplies a context and/or extension for its
 // remaining children, and every other element is handed to rule.process.
 // Processing stops at the first error, including errors that occur for the
 // children of an "x" element.
 func (c Collation) processXML(p RuleProcessor) (err error) {
 	// Collation is generated and defined in xml.go.
 	for _, r := range c.Rules.Any {
 		switch r.XMLName.Local {
 		case "reset":
 			level := 0
 			switch r.Before {
 			case "primary", "1":
 				level = 1
 			case "secondary", "2":
 				level = 2
 			case "tertiary", "3":
 				level = 3
 			case "":
 			default:
 				return fmt.Errorf("cldr: unknown level %q", r.Before)
 			}
 			var v string
 			if v, err = r.value(); err == nil {
 				err = p.Reset(v, level)
 			}
 		case "x":
 			var context, extend string
 			// First pass: resolve the value of every child and pick up the
 			// context and extension. A failure must not be overwritten by
 			// the next child.
 			for _, r1 := range r.Any {
 				v, verr := r1.value()
 				if verr != nil {
 					return verr
 				}
 				switch r1.XMLName.Local {
 				case "context":
 					context = v
 				case "extend":
 					extend = v
 				}
 			}
 			// Second pass: report the remaining children with the context
 			// and extension found above.
 			for _, r1 := range r.Any {
 				if t := r1.XMLName.Local; t == "context" || t == "extend" {
 					continue
 				}
 				if perr := r1.rule.process(p, r1.XMLName.Local, context, extend); perr != nil {
 					return perr
 				}
 			}
 		default:
 			err = r.rule.process(p, r.XMLName.Local, "", "")
 		}
 		if err != nil {
 			return err
 		}
 	}
 	return nil
 }
	// Copyright 2013 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package cldr

	import (
	"bufio"
	"encoding/xml"
	"errors"
	"fmt"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
	)

	// RuleProcessor can be passed to Collation's Process method, which
	// parses the rules and calls the respective method for each rule found.
	type RuleProcessor interface {
	// Reset is called for each reset rule with the anchor string and the
	// "before" level; the parsers pass 0 when no level was specified.
	Reset(anchor string, before int) error
	// Insert is called for each ordering rule with its level, the string
	// being ordered and the optional context and extension strings.
	Insert(level int, str, context, extend string) error
	// Index is called with the remainder of a rule value that starts with
	// the cldrIndex sentinel.
	Index(id string)
	}

	const (
	// cldrIndex is a Unicode-reserved sentinel value used to mark the start
	// of a grouping within an index.
	// The parsers report a value that starts with this rune through
	// RuleProcessor.Index, with the sentinel stripped, instead of inserting it.
	// See https://unicode.org/reports/tr35/#Collation_Elements for details.
	cldrIndex = "\uFDD0"

	// specialAnchor is the format in which to represent logical reset positions,
	// such as "first tertiary ignorable". The label is wrapped so that it
	// reads like an empty XML element, e.g. "<first tertiary ignorable/>".
	specialAnchor = "<%s/>"
	)

	// Process reads the tailoring rules of this collation and reports every
	// rule it finds to p.
	//
	// A single cr element is parsed with the textual rule syntax; when there is
	// none, the elements collected under Rules are processed instead. An error
	// is returned when there is more than one cr element or when the collation
	// holds no tailoring data at all.
	func (c Collation) Process(p RuleProcessor) (err error) {
		switch n := len(c.Cr); {
		case n > 1:
			return errors.New("multiple cr elements, want 0 or 1")
		case n == 1:
			return processRules(p, c.Cr[0].Data())
		case c.Rules.Any != nil:
			return c.processXML(p)
		}
		return errors.New("no tailoring data")
	}

	// processRules parses rules in the Collation Rule Syntax defined in
	// https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
	//
	// The input is handled line by line. Everything from a '#' that starts a
	// rule position to the end of the line is skipped. Each rule starts with an
	// operator:
	//
	//	&        reset, followed by an anchor or a bracketed special anchor
	//	< .. <<<< sort relation of level 1 to 4, optionally followed by '*'
	//	=        identity relation (level 5), optionally followed by '*'
	//
	// Parsing stops at the first error: no further rules, not even the ones on
	// the same line, are reported to p once an error has been recorded. The
	// returned error is prefixed with the 1-based number of the offending line.
	func processRules(p RuleProcessor, s string) (err error) {
		// chk records the first error and passes the remaining input through.
		chk := func(s string, e error) string {
			if err == nil {
				err = e
			}
			return s
		}
		i := 0 // Save the line number for use after the loop.
		scanner := bufio.NewScanner(strings.NewReader(s))
		for ; scanner.Scan() && err == nil; i++ {
			// The err check keeps a failed rule from being followed by more
			// callbacks for the rest of the line.
			for s := skipSpace(scanner.Text()); err == nil && s != "" && s[0] != '#'; s = skipSpace(s) {
				var ch byte
				switch ch, s = s[0], s[1:]; ch {
				case '&': // followed by <anchor> or '[' <key> ']'
					if s = skipSpace(s); consume(&s, '[') {
						s = chk(parseSpecialAnchor(p, s))
					} else {
						s = chk(parseAnchor(p, 0, s))
					}
				case '<', '=':
					level := 5 // identity relation
					if ch == '<' { // sort relation '<'{1,4}
						for level = 1; consume(&s, '<'); level++ {
						}
						if level > 4 {
							err = fmt.Errorf("level %d > 4", level)
							break // do not report a rule with an invalid level
						}
					}
					// Both relations are optionally followed by '*'.
					if consume(&s, '*') {
						s = chk(parseSequence(p, level, s))
					} else {
						s = chk(parseOrder(p, level, s))
					}
				default:
					err = fmt.Errorf("illegal operator %q", ch)
				}
			}
		}
		if chk("", scanner.Err()); err != nil {
			return fmt.Errorf("%d: %v", i, err)
		}
		return nil
	}

	// parseSpecialAnchor handles a bracketed reset position, which is either
	//    ['before' <level>] <anchor>
	// or
	//    [<label>]
	// The opening '[' must already have been consumed by the caller.
	//
	// For the first form the level is parsed and the anchor that follows the
	// bracket is handed to parseAnchor. For the second form the label is
	// formatted with specialAnchor and reported through p.Reset with level 0.
	// It returns the input that follows the parsed part.
	func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
		end := strings.IndexByte(s, ']')
		if end < 0 {
			return "", errors.New("unmatched bracket")
		}
		label, rest := strings.TrimSpace(s[:end]), s[end+1:]
		if !strings.HasPrefix(label, "before ") {
			return rest, p.Reset(fmt.Sprintf(specialAnchor, label), 0)
		}
		digits := skipSpace(strings.TrimPrefix(label, "before "))
		// A bit size of 3 makes ParseUint reject anything above 7.
		n, perr := strconv.ParseUint(digits, 10, 3)
		if perr != nil {
			return rest, perr
		}
		return parseAnchor(p, int(n), rest)
	}

	// parseAnchor scans one string from s and reports it to p.Reset as the
	// anchor for the given before level. It returns the unparsed remainder.
	func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
		var anchor string
		if anchor, tail, err = scanString(s); err != nil {
			return tail, err
		}
		return tail, p.Reset(anchor, level)
	}

	func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
	var value, context, extend string
	if value, s, err = scanString(s); err != nil {
	return s, err
	}
	if strings.HasPrefix(value, cldrIndex) {
	p.Index(value[len(cldrIndex):])
	return
	}
	if consume(&s, '\|') {
	if context, s, err = scanString(s); err != nil {
	return s, errors.New("missing string after context")
	}
	}
	if consume(&s, '/') {
	if extend, s, err = scanString(s); err != nil {
	return s, errors.New("missing string after extension")
	}
	}
	return s, p.Insert(level, value, context, extend)
	}

	// scanString scans a single input string.
	func scanString(s string) (str, tail string, err error) {
	if s = skipSpace(s); s == "" {
	return s, s, errors.New("missing string")
	}
	buf := [16]byte{} // small but enough to hold most cases.
	value := buf[:0]
	for s != "" {
	if consume(&s, '\'') {
	i := strings.IndexByte(s, '\'')
	if i == -1 {
	return "", "", errors.New(`unmatched single quote`)
	}
	if i == 0 {
	value = append(value, '\'')
	} else {
	value = append(value, s[:i]...)
	}
	s = s[i+1:]
	continue
	}
	r, sz := utf8.DecodeRuneInString(s)
	if unicode.IsSpace(r) \|\| strings.ContainsRune("&<=#", r) {
	break
	}
	value = append(value, s[:sz]...)
	s = s[sz:]
	}
	return string(value), skipSpace(s), nil
	}

	func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
	if s = skipSpace(s); s == "" {
	return s, errors.New("empty sequence")
	}
	last := rune(0)
	for s != "" {
	r, sz := utf8.DecodeRuneInString(s)
	s = s[sz:]

	if r == '-' {
	// We have a range. The first element was already written.
	if last == 0 {
	return s, errors.New("range without starter value")
	}
	r, sz = utf8.DecodeRuneInString(s)
	s = s[sz:]
	if r == utf8.RuneError \|\| r < last {
	return s, fmt.Errorf("invalid range %q-%q", last, r)
	}
	for i := last + 1; i <= r; i++ {
	if err := p.Insert(level, string(i), "", ""); err != nil {
	return s, err
	}
	}
	last = 0
	continue
	}

	if unicode.IsSpace(r) \|\| unicode.IsPunct(r) {
	break
	}

	// normal case
	if err := p.Insert(level, string(r), "", ""); err != nil {
	return s, err
	}
	last = r
	}
	return s, nil
	}

	// skipSpace returns s without its leading white space, as defined by
	// unicode.IsSpace. The result is empty when s holds nothing but white space.
	func skipSpace(s string) string {
		start := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
		if start < 0 {
			return ""
		}
		return s[start:]
	}

	// consume reports whether the first byte of *s is ch. When it is, that byte
	// is removed from *s; otherwise *s is left untouched.
	//
	// s is a pointer so that the caller's string is advanced; every access must
	// therefore go through *s.
	func consume(s *string, ch byte) (ok bool) {
		if *s == "" || (*s)[0] != ch {
			return false
		}
		*s = (*s)[1:]
		return true
	}

	// The following code parses Collation rules of CLDR version 24 and before.

	// lmap maps the first letter of a legacy rule tag name ("p", "s", "t", "i"
	// and their "pc", "sc", "tc", "ic" variants) to the level that rule.process
	// passes to RuleProcessor.Insert. 'i' maps to 5, the same level that
	// processRules uses for the '=' relation.
	var lmap = map[byte]int{
	'p': 1,
	's': 2,
	't': 3,
	'i': 5,
	}

	// rulesElem describes the shape of a legacy <rules> XML element: the
	// embedded Common fields plus every child element, decoded as a rule and
	// stored together with its element name.
	// NOTE(review): nothing in this part of the file refers to rulesElem;
	// presumably it is used by the generated decoding code — confirm.
	type rulesElem struct {
	Rules struct {
	Common
	// Any receives all child elements, whatever their tag name.
	Any []*struct {
	XMLName xml.Name
	rule
	} `xml:",any"`
	} `xml:"rules"`
	}

	// rule is a single element of a legacy XML collation rule list.
	type rule struct {
	// Value is the character data of the element. The value method rewrites
	// it in place.
	Value string `xml:",chardata"`
	// Before is the "before" attribute, which processXML reads on reset
	// elements to select the level.
	Before string `xml:"before,attr"`
	// Any holds the nested child elements together with their tag names.
	Any []*struct {
	XMLName xml.Name
	rule
	} `xml:",any"`
	}

	// emptyValueError is returned by (*rule).value when a rule has no character
	// data and does not have exactly one child element to stand in for it.
	var emptyValueError = errors.New("cldr: empty rule value")

	// value returns the string this rule stands for and stores it in r.Value.
	//
	// Hexadecimal Unicode code point notation in the character data is converted
	// first. A rule without character data must have exactly one child element;
	// the name of that element, formatted with specialAnchor, becomes the value
	// and the child is dropped. A rule with character data must not have child
	// elements.
	func (r *rule) value() (string, error) {
		r.Value = charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
		switch {
		case r.Value != "" && len(r.Any) != 0:
			return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
		case r.Value != "":
			return r.Value, nil
		case len(r.Any) != 1:
			return "", emptyValueError
		}
		// The single child element names a logical position.
		r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
		r.Any = nil
		return r.Value, nil
	}

	// process reports the rule to p according to its tag name.
	//
	// The tags "p", "s", "t" and "i" insert the whole value at the level given
	// by lmap, unless the value starts with cldrIndex, in which case the rest of
	// the value is passed to p.Index. The tags "pc", "sc", "tc" and "ic" insert
	// every rune of the value separately. Any other tag is an error. The given
	// context and extend strings are passed along with every insertion.
	func (r rule) process(p RuleProcessor, name, context, extend string) error {
		val, err := r.value()
		if err != nil {
			return err
		}
		switch name {
		case "p", "s", "t", "i":
			if strings.HasPrefix(val, cldrIndex) {
				p.Index(strings.TrimPrefix(val, cldrIndex))
				return nil
			}
			return p.Insert(lmap[name[0]], val, context, extend)
		case "pc", "sc", "tc", "ic":
			lvl := lmap[name[0]]
			for _, ch := range val {
				if err := p.Insert(lvl, string(ch), context, extend); err != nil {
					return err
				}
			}
			return nil
		}
		return fmt.Errorf("cldr: unsupported tag: %q", name)
	}

	// processXML parses the format of CLDR versions 24 and older.
	func (c Collation) processXML(p RuleProcessor) (err error) {
	// Collation is generated and defined in xml.go.
	var v string
	for _, r := range c.Rules.Any {
	switch r.XMLName.Local {
	case "reset":
	level := 0
	switch r.Before {
	case "primary", "1":
	level = 1
	case "secondary", "2":
	level = 2
	case "tertiary", "3":
	level = 3
	case "":
	default:
	return fmt.Errorf("cldr: unknown level %q", r.Before)
	}
	v, err = r.value()
	if err == nil {
	err = p.Reset(v, level)
	}
	case "x":
	var context, extend string
	for _, r1 := range r.Any {
	v, err = r1.value()
	switch r1.XMLName.Local {
	case "context":
	context = v
	case "extend":
	extend = v
	}
	}
	for _, r1 := range r.Any {
	if t := r1.XMLName.Local; t == "context" \|\| t == "extend" {
	continue
	}
	r1.rule.process(p, r1.XMLName.Local, context, extend)
	}
	default:
	err = r.rule.process(p, r.XMLName.Local, "", "")
	}
	if err != nil {
	return err
	}
	}
	return nil
	}