internal/ucd/ucd.go - text - Git at Google

 // Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Package ucd provides a parser for Unicode Character Database files, the
 // format of which is defined in http://www.unicode.org/reports/tr44/. See
 // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
 //
 // It currently does not support substitutions of missing fields.
 package ucd // import "golang.org/x/text/internal/ucd"

 import (
 	"bufio"
 	"bytes"
 	"errors"
 	"io"
 	"log"
 	"regexp"
 	"strconv"
 	"strings"
 )

 // Indices of the semicolon-separated fields of UnicodeData.txt, in file order.
 // They are untyped integer constants starting at 0 (CodePoint), so they can be
 // used as the field index i accepted by the Parser accessors.
 const (
 	CodePoint = iota
 	Name
 	GeneralCategory
 	CanonicalCombiningClass
 	BidiClass
 	DecompMapping
 	DecimalValue
 	DigitValue
 	NumericValue
 	BidiMirrored
 	Unicode1Name
 	ISOComment
 	SimpleUppercaseMapping
 	SimpleLowercaseMapping
 	SimpleTitlecaseMapping
 )

 // Parse reads the UCD file provided by r and invokes f once per entry. The
 // reader is closed before Parse returns. Any parse error is fatal: it is
 // reported through log.Fatal, which terminates the program.
 //
 // Parse covers the most common way of driving a Parser; use New directly when
 // options or custom error handling are needed.
 func Parse(r io.ReadCloser, f func(p *Parser)) {
 	defer r.Close()

 	parser := New(r)
 	for {
 		if !parser.Next() {
 			break
 		}
 		f(parser)
 	}

 	err := parser.Err()
 	if err == nil {
 		return
 	}
 	// log.Fatal ends in os.Exit, which skips deferred calls, so the reader
 	// has to be closed by hand first.
 	r.Close()
 	log.Fatal(err)
 }

 // An Option is used to configure a Parser. Options are passed to New, which
 // calls each one with the Parser it is constructing.
 type Option func(p *Parser)

 // keepRanges is the implementation of the KeepRanges option: it turns off
 // the expansion of rune ranges in field 0.
 func keepRanges(parser *Parser) {
 	parser.keepRanges = true
 }

 var (
 	// KeepRanges prevents the expansion of ranges. The raw ranges can be
 	// obtained by calling Range(0) on the parser. With this option Next
 	// returns one entry per line rather than one entry per rune.
 	KeepRanges Option = keepRanges
 )

 // Part returns an Option that registers f as the handler for lines starting
 // with '@'. When f is called, the text following the '@' is available as the
 // first field of the parser. Comments are handled as usual.
 func Part(f func(p *Parser)) Option {
 	install := func(target *Parser) {
 		target.partHandler = f
 	}
 	return install
 }

 // CommentHandler returns an Option that registers f as the receiver of
 // comments that occupy a line of their own. Comments that trail a data line
 // are not passed to f; they are available through Comment instead.
 func CommentHandler(f func(s string)) Option {
 	install := func(target *Parser) {
 		target.commentHandler = f
 	}
 	return install
 }

 // A Parser parses Unicode Character Database (UCD) files. It holds the state
 // of the entry most recently produced by Next; the accessor methods (Rune,
 // String, Int, ...) read from that state. Create a Parser with New.
 type Parser struct {
 	// scanner supplies the input line by line.
 	scanner *bufio.Scanner

 	keepRanges bool // Don't expand rune ranges in field 0.

 	// err is the first error recorded by setError; it is returned by Err.
 	err     error
 	// comment is the trimmed text after '#' on the current data line.
 	comment []byte
 	// field holds the trimmed, ';'-separated fields of the current line.
 	field   [][]byte
 	// parsedRange is needed in case Range(0) is called more than once for one
 	// field. In some cases this requires scanning ahead.
 	parsedRange          bool
 	// rangeStart is the current rune of the range taken from field 0 and
 	// rangeEnd its inclusive end; Next increments rangeStart while expanding.
 	rangeStart, rangeEnd rune

 	// partHandler, if set, is called for lines starting with '@'.
 	partHandler    func(p *Parser)
 	// commentHandler, if set, is called for lines starting with '#'.
 	commentHandler func(s string)
 }

 // setError records err unless an error has already been stored, so the
 // parser always reports the first failure it saw. Passing nil is harmless.
 func (p *Parser) setError(err error) {
 	if p.err != nil {
 		return
 	}
 	p.err = err
 }

 // getField returns the raw bytes of field i of the current line, or nil when
 // the line has fewer than i+1 fields.
 func (p *Parser) getField(i int) []byte {
 	if i < len(p.field) {
 		return p.field[i]
 	}
 	return nil
 }

 // Err reports the first error recorded while parsing, or nil if no error
 // has occurred so far.
 func (p *Parser) Err() error { return p.err }

 // New creates a Parser that reads lines from r. Each option in o is applied
 // to the new Parser, in order, before it is returned.
 func New(r io.Reader, o ...Option) *Parser {
 	p := new(Parser)
 	p.scanner = bufio.NewScanner(r)
 	for idx := range o {
 		o[idx](p)
 	}
 	return p
 }

 // Next advances the parser to the next entry. It returns true if an entry is
 // available and false once the scanner is exhausted or fails; in that case
 // the scanner's error, if any, is recorded and reported by Err.
 //
 // Unless range expansion is disabled, a line whose first field is a rune
 // range yields one entry per rune: subsequent calls step through the range
 // without reading a new line, leaving the fields and comment untouched.
 func (p *Parser) Next() bool {
 	// Still inside an expanded range: move to its next rune and keep the
 	// fields of the current line.
 	if !p.keepRanges && p.rangeStart < p.rangeEnd {
 		p.rangeStart++
 		return true
 	}
 	// Reset the per-line state. Reslicing field to length 0 keeps its
 	// backing array for reuse.
 	p.comment = nil
 	p.field = p.field[:0]
 	p.parsedRange = false

 	for p.scanner.Scan() {
 		b := p.scanner.Bytes()
 		// Skip empty lines.
 		if len(b) == 0 {
 			continue
 		}
 		// A line starting with '#' is a comment-only line: pass its trimmed
 		// text to the comment handler, if any, and read on.
 		if b[0] == '#' {
 			if p.commentHandler != nil {
 				p.commentHandler(strings.TrimSpace(string(b[1:])))
 			}
 			continue
 		}

 		// Split off a trailing comment: everything after the first '#' is
 		// kept (trimmed) as the line's comment and removed from b.
 		if i := bytes.IndexByte(b, '#'); i != -1 {
 			p.comment = bytes.TrimSpace(b[i+1:])
 			b = b[:i]
 		}
 		// A line starting with '@' marks a part. The handler, if any, sees
 		// the text after '@' as field 0; the line never becomes an entry.
 		if b[0] == '@' {
 			if p.partHandler != nil {
 				p.field = append(p.field, bytes.TrimSpace(b[1:]))
 				p.partHandler(p)
 				p.field = p.field[:0]
 			}
 			p.comment = nil
 			continue
 		}
 		// Split the rest of the line on ';', trimming each field. The text
 		// after the last ';' (or the whole line) is the final field.
 		for {
 			i := bytes.IndexByte(b, ';')
 			if i == -1 {
 				p.field = append(p.field, bytes.TrimSpace(b))
 				break
 			}
 			p.field = append(p.field, bytes.TrimSpace(b[:i]))
 			b = b[i+1:]
 		}
 		// When expanding ranges, determine the range of field 0 now so that
 		// later calls can step through it.
 		if !p.keepRanges {
 			p.rangeStart, p.rangeEnd = p.getRange(0)
 		}
 		return true
 	}
 	// No more lines: record the scanner's error (nil on a clean end of input).
 	p.setError(p.scanner.Err())
 	return false
 }

 // parseRune interprets b as a hexadecimal code point, optionally preceded by
 // "U+", and returns it as a rune. The error comes from strconv.ParseUint and
 // is non-nil when b is empty or not a hexadecimal number of at most 32 bits.
 func parseRune(b []byte) (rune, error) {
 	digits := b
 	// The prefix is only stripped when at least one character follows it.
 	if len(digits) > 2 && string(digits[:2]) == "U+" {
 		digits = digits[2:]
 	}
 	value, err := strconv.ParseUint(string(digits), 16, 32)
 	return rune(value), err
 }

 // parseRune parses b as a hexadecimal code point and returns it. A parse
 // failure is recorded on the parser unless an earlier error is already
 // stored.
 func (p *Parser) parseRune(b []byte) rune {
 	r, err := parseRune(b)
 	if p.err == nil {
 		p.err = err
 	}
 	return r
 }

 // Rune returns field i as a rune. For field 0 with range expansion active it
 // returns the current rune of the expanded range; otherwise the field is
 // parsed as a hexadecimal code point and any failure is recorded.
 func (p *Parser) Rune(i int) rune {
 	if i <= 0 && !p.keepRanges {
 		return p.rangeStart
 	}
 	return p.parseRune(p.getField(i))
 }

 // Runes returns field i as a sequence of runes. The field is split on
 // spaces, empty pieces are skipped, and every remaining piece is parsed as a
 // hexadecimal code point. Parse failures are recorded on the parser.
 func (p *Parser) Runes(i int) (runes []rune) {
 	rest := p.getField(i)
 	for more := true; more; {
 		token := rest
 		if sp := bytes.IndexByte(rest, ' '); sp >= 0 {
 			token, rest = rest[:sp], rest[sp+1:]
 		} else {
 			// No further separator: this is the last piece.
 			more = false
 		}
 		if token = bytes.TrimSpace(token); len(token) > 0 {
 			runes = append(runes, p.parseRune(token))
 		}
 	}
 	return runes
 }

 var (
 	// errIncorrectLegacyRange is recorded when a "<..., First>" line is not
 	// followed by a matching "<..., Last>" line.
 	errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")

 	// reRange matches one line of a legacy rune range. Its groups are:
 	// 1 the hexadecimal code point, 2 the name before the comma, 3 the word
 	// after the comma ("First" or "Last"), 4 the remainder of the line.
 	reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
 )

 // Range returns field i as a rune range that is inclusive at both ends. A
 // field holding a single rune yields first == last. The legacy range format
 // of UnicodeData.txt is supported.
 //
 // While range expansion is active (KeepRanges not set), i is ignored and the
 // current rune of the expanded range is returned as both first and last.
 func (p *Parser) Range(i int) (first, last rune) {
 	if p.keepRanges {
 		return p.getRange(i)
 	}
 	return p.rangeStart, p.rangeStart
 }

 // getRange parses field i as an inclusive rune range and returns its bounds.
 //
 // Three forms are handled:
 //   - "first..last": both sides are parsed as runes and parse errors are
 //     recorded on the parser.
 //   - a single rune, returned as both first and last.
 //   - for field 0 only, the legacy UnicodeData.txt pair of lines whose second
 //     field ends in "First>" and "Last>". The "Last" line is consumed from
 //     the scanner here, and the result is cached for repeated calls.
 //
 // If the field is not a valid rune, no error is recorded; keepRanges is
 // switched on instead and 0, 0 is returned.
 func (p *Parser) getRange(i int) (first, last rune) {
 	b := p.getField(i)
 	if k := bytes.Index(b, []byte("..")); k != -1 {
 		return p.parseRune(b[:k]), p.parseRune(b[k+2:])
 	}
 	// The first field may not be a rune, in which case we may ignore any error
 	// and set the range as 0..0.
 	x, err := parseRune(b)
 	if err != nil {
 		// Disable range parsing henceforth. This ensures that an error will be
 		// returned if the user subsequently will try to parse this field as
 		// a Rune.
 		p.keepRanges = true
 	}
 	// Special case for UnicodeData that was retained for backwards compatibility.
 	if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
 		// The "Last" line was already consumed by an earlier call for this
 		// entry; return the cached bounds instead of scanning again.
 		if p.parsedRange {
 			return p.rangeStart, p.rangeEnd
 		}
 		// Match the current ("First") line, then advance to the next line,
 		// which must be the matching "Last" line.
 		// NOTE(review): p.field still refers to the bytes returned by the
 		// previous Scan; bufio.Scanner documents that data as possibly
 		// overwritten by a later Scan - confirm this is safe here.
 		mf := reRange.FindStringSubmatch(p.scanner.Text())
 		if mf == nil || !p.scanner.Scan() {
 			p.setError(errIncorrectLegacyRange)
 			return x, x
 		}
 		// Using Bytes would be more efficient here, but Text is a lot easier
 		// and this is not a frequent case.
 		ml := reRange.FindStringSubmatch(p.scanner.Text())
 		// Both lines must carry the same name and the same trailing fields,
 		// and the second must be tagged "Last".
 		if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
 			p.setError(errIncorrectLegacyRange)
 			return x, x
 		}
 		// The code point of the "Last" line is the leading len(ml[1]) bytes
 		// of that line.
 		p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
 		p.parsedRange = true
 		return p.rangeStart, p.rangeEnd
 	}
 	return x, x
 }

 // bools recognizes all valid UCD boolean values. The keys are matched
 // exactly (case-sensitive); the empty string counts as false.
 var bools = map[string]bool{
 	"":      false,
 	"N":     false,
 	"No":    false,
 	"F":     false,
 	"False": false,
 	"Y":     true,
 	"Yes":   true,
 	"T":     true,
 	"True":  true,
 }

 // Bool parses and returns field i as a boolean value. The accepted spellings
 // are the keys of bools; an empty or missing field is false. Any other value
 // records strconv.ErrSyntax on the parser and yields false.
 func (p *Parser) Bool(i int) bool {
 	// Look the field up directly instead of comparing it against every map
 	// entry in turn.
 	if v, ok := bools[string(p.getField(i))]; ok {
 		return v
 	}
 	p.setError(strconv.ErrSyntax)
 	return false
 }

 // Int parses and returns field i as a decimal integer value. A field that is
 // missing, malformed, or out of range for int records the strconv error on
 // the parser.
 func (p *Parser) Int(i int) int {
 	// Bit size 0 means "size of int", so a value that does not fit is
 	// reported as a range error instead of being silently truncated by the
 	// conversion on 32-bit platforms.
 	x, err := strconv.ParseInt(string(p.getField(i)), 10, 0)
 	p.setError(err)
 	return int(x)
 }

 // Uint parses and returns field i as an unsigned decimal integer value. A
 // field that is missing, malformed, or out of range for uint records the
 // strconv error on the parser.
 func (p *Parser) Uint(i int) uint {
 	// Bit size 0 means "size of uint", so a value that does not fit is
 	// reported as a range error instead of being silently truncated by the
 	// conversion on 32-bit platforms.
 	x, err := strconv.ParseUint(string(p.getField(i)), 10, 0)
 	p.setError(err)
 	return uint(x)
 }

 // Float returns field i parsed as a 64-bit floating-point number. A parse
 // failure is recorded on the parser.
 func (p *Parser) Float(i int) float64 {
 	text := string(p.getField(i))
 	value, err := strconv.ParseFloat(text, 64)
 	p.setError(err)
 	return value
 }

 // String returns field i as a string. A missing field yields the empty
 // string.
 func (p *Parser) String(i int) string {
 	raw := p.getField(i)
 	return string(raw)
 }

 // Strings returns field i split on single spaces, with each piece trimmed of
 // surrounding white space. Consecutive spaces therefore produce empty
 // strings, and an empty field yields a slice holding one empty string.
 func (p *Parser) Strings(i int) []string {
 	parts := strings.Split(string(p.getField(i)), " ")
 	for k := range parts {
 		parts[k] = strings.TrimSpace(parts[k])
 	}
 	return parts
 }

 // Comment returns the trailing comment of the current line, that is the
 // trimmed text after its first '#', or the empty string if there is none.
 func (p *Parser) Comment() string {
 	text := p.comment
 	return string(text)
 }

 // errUndefinedEnum is recorded by Enum when a field matches none of the
 // allowed values.
 var errUndefinedEnum = errors.New("ucd: undefined enum value")

 // Enum returns field i if it is exactly equal to one of the values in enum.
 // Otherwise it records errUndefinedEnum on the parser and returns the empty
 // string.
 func (p *Parser) Enum(i int, enum ...string) string {
 	field := string(p.getField(i))
 	for k := range enum {
 		if enum[k] == field {
 			return enum[k]
 		}
 	}
 	p.setError(errUndefinedEnum)
 	return ""
 }

 // bstrEq reports whether the byte slice b and the string s hold the same
 // bytes. A nil or empty b equals the empty string.
 func bstrEq(b []byte, s string) bool {
 	// The built-in comparison replaces the hand-written byte loop; the
 	// compiler performs this conversion-for-comparison without copying b.
 	return string(b) == s
 }
	// Copyright 2014 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Package ucd provides a parser for Unicode Character Database files, the
	// format of which is defined in http://www.unicode.org/reports/tr44/. See
	// http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
	//
	// It currently does not support substitutions of missing fields.
	package ucd // import "golang.org/x/text/internal/ucd"

	import (
	"bufio"
	"bytes"
	"errors"
	"io"
	"log"
	"regexp"
	"strconv"
	"strings"
	)

	// Indices of the semicolon-separated fields of UnicodeData.txt, in file order.
	// They are untyped integer constants starting at 0 (CodePoint), so they can be
	// used as the field index i accepted by the Parser accessors.
	const (
	CodePoint = iota
	Name
	GeneralCategory
	CanonicalCombiningClass
	BidiClass
	DecompMapping
	DecimalValue
	DigitValue
	NumericValue
	BidiMirrored
	Unicode1Name
	ISOComment
	SimpleUppercaseMapping
	SimpleLowercaseMapping
	SimpleTitlecaseMapping
	)

	// Parse reads the UCD file provided by r and invokes f once per entry. The
	// reader is closed before Parse returns. Any parse error is fatal: it is
	// reported through log.Fatal, which terminates the program.
	//
	// Parse covers the most common way of driving a Parser; use New directly when
	// options or custom error handling are needed.
	func Parse(r io.ReadCloser, f func(p *Parser)) {
		defer r.Close()

		parser := New(r)
		for {
			if !parser.Next() {
				break
			}
			f(parser)
		}

		err := parser.Err()
		if err == nil {
			return
		}
		// log.Fatal ends in os.Exit, which skips deferred calls, so the reader
		// has to be closed by hand first.
		r.Close()
		log.Fatal(err)
	}

	// An Option is used to configure a Parser. Options are passed to New, which
	// calls each one with the Parser it is constructing.
	type Option func(p *Parser)

	// keepRanges is the implementation of the KeepRanges option: it turns off
	// the expansion of rune ranges in field 0.
	func keepRanges(parser *Parser) {
		parser.keepRanges = true
	}

	var (
	// KeepRanges prevents the expansion of ranges. The raw ranges can be
	// obtained by calling Range(0) on the parser. With this option Next
	// returns one entry per line rather than one entry per rune.
	KeepRanges Option = keepRanges
	)

	// Part returns an Option that registers f as the handler for lines starting
	// with '@'. When f is called, the text following the '@' is available as the
	// first field of the parser. Comments are handled as usual.
	func Part(f func(p *Parser)) Option {
		install := func(target *Parser) {
			target.partHandler = f
		}
		return install
	}

	// CommentHandler returns an Option that registers f as the receiver of
	// comments that occupy a line of their own. Comments that trail a data line
	// are not passed to f; they are available through Comment instead.
	func CommentHandler(f func(s string)) Option {
		install := func(target *Parser) {
			target.commentHandler = f
		}
		return install
	}

	// A Parser parses Unicode Character Database (UCD) files. It holds the state
	// of the entry most recently produced by Next; the accessor methods (Rune,
	// String, Int, ...) read from that state. Create a Parser with New.
	type Parser struct {
	// scanner supplies the input line by line.
	scanner *bufio.Scanner

	keepRanges bool // Don't expand rune ranges in field 0.

	// err is the first error recorded by setError; it is returned by Err.
	err error
	// comment is the trimmed text after '#' on the current data line.
	comment []byte
	// field holds the trimmed, ';'-separated fields of the current line.
	field [][]byte
	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	parsedRange bool
	// rangeStart is the current rune of the range taken from field 0 and
	// rangeEnd its inclusive end; Next increments rangeStart while expanding.
	rangeStart, rangeEnd rune

	// partHandler, if set, is called for lines starting with '@'.
	partHandler func(p *Parser)
	// commentHandler, if set, is called for lines starting with '#'.
	commentHandler func(s string)
	}

	// setError records err unless an error has already been stored, so the
	// parser always reports the first failure it saw. Passing nil is harmless.
	func (p *Parser) setError(err error) {
		if p.err != nil {
			return
		}
		p.err = err
	}

	// getField returns the raw bytes of field i of the current line, or nil when
	// the line has fewer than i+1 fields.
	func (p *Parser) getField(i int) []byte {
		if i < len(p.field) {
			return p.field[i]
		}
		return nil
	}

	// Err reports the first error recorded while parsing, or nil if no error
	// has occurred so far.
	func (p *Parser) Err() error { return p.err }

	// New creates a Parser that reads lines from r. Each option in o is applied
	// to the new Parser, in order, before it is returned.
	func New(r io.Reader, o ...Option) *Parser {
		p := new(Parser)
		p.scanner = bufio.NewScanner(r)
		for idx := range o {
			o[idx](p)
		}
		return p
	}

	// Next advances the parser to the next entry. It returns true if an entry is
	// available and false once the scanner is exhausted or fails; in that case
	// the scanner's error, if any, is recorded and reported by Err.
	//
	// Unless range expansion is disabled, a line whose first field is a rune
	// range yields one entry per rune: subsequent calls step through the range
	// without reading a new line, leaving the fields and comment untouched.
	func (p *Parser) Next() bool {
	// Still inside an expanded range: move to its next rune and keep the
	// fields of the current line.
	if !p.keepRanges && p.rangeStart < p.rangeEnd {
	p.rangeStart++
	return true
	}
	// Reset the per-line state. Reslicing field to length 0 keeps its
	// backing array for reuse.
	p.comment = nil
	p.field = p.field[:0]
	p.parsedRange = false

	for p.scanner.Scan() {
	b := p.scanner.Bytes()
	// Skip empty lines.
	if len(b) == 0 {
	continue
	}
	// A line starting with '#' is a comment-only line: pass its trimmed
	// text to the comment handler, if any, and read on.
	if b[0] == '#' {
	if p.commentHandler != nil {
	p.commentHandler(strings.TrimSpace(string(b[1:])))
	}
	continue
	}

	// Split off a trailing comment: everything after the first '#' is
	// kept (trimmed) as the line's comment and removed from b.
	if i := bytes.IndexByte(b, '#'); i != -1 {
	p.comment = bytes.TrimSpace(b[i+1:])
	b = b[:i]
	}
	// A line starting with '@' marks a part. The handler, if any, sees
	// the text after '@' as field 0; the line never becomes an entry.
	if b[0] == '@' {
	if p.partHandler != nil {
	p.field = append(p.field, bytes.TrimSpace(b[1:]))
	p.partHandler(p)
	p.field = p.field[:0]
	}
	p.comment = nil
	continue
	}
	// Split the rest of the line on ';', trimming each field. The text
	// after the last ';' (or the whole line) is the final field.
	for {
	i := bytes.IndexByte(b, ';')
	if i == -1 {
	p.field = append(p.field, bytes.TrimSpace(b))
	break
	}
	p.field = append(p.field, bytes.TrimSpace(b[:i]))
	b = b[i+1:]
	}
	// When expanding ranges, determine the range of field 0 now so that
	// later calls can step through it.
	if !p.keepRanges {
	p.rangeStart, p.rangeEnd = p.getRange(0)
	}
	return true
	}
	// No more lines: record the scanner's error (nil on a clean end of input).
	p.setError(p.scanner.Err())
	return false
	}

	// parseRune interprets b as a hexadecimal code point, optionally preceded by
	// "U+", and returns it as a rune. The error comes from strconv.ParseUint and
	// is non-nil when b is empty or not a hexadecimal number of at most 32 bits.
	func parseRune(b []byte) (rune, error) {
		digits := b
		// The prefix is only stripped when at least one character follows it.
		if len(digits) > 2 && string(digits[:2]) == "U+" {
			digits = digits[2:]
		}
		value, err := strconv.ParseUint(string(digits), 16, 32)
		return rune(value), err
	}

	// parseRune parses b as a hexadecimal code point and returns it. A parse
	// failure is recorded on the parser unless an earlier error is already
	// stored.
	func (p *Parser) parseRune(b []byte) rune {
		r, err := parseRune(b)
		if p.err == nil {
			p.err = err
		}
		return r
	}

	// Rune parses and returns field i as a rune.
	func (p *Parser) Rune(i int) rune {
	if i > 0 \|\| p.keepRanges {
	return p.parseRune(p.getField(i))
	}
	return p.rangeStart
	}

	// Runes returns field i as a sequence of runes. The field is split on
	// spaces, empty pieces are skipped, and every remaining piece is parsed as a
	// hexadecimal code point. Parse failures are recorded on the parser.
	func (p *Parser) Runes(i int) (runes []rune) {
		rest := p.getField(i)
		for more := true; more; {
			token := rest
			if sp := bytes.IndexByte(rest, ' '); sp >= 0 {
				token, rest = rest[:sp], rest[sp+1:]
			} else {
				// No further separator: this is the last piece.
				more = false
			}
			if token = bytes.TrimSpace(token); len(token) > 0 {
				runes = append(runes, p.parseRune(token))
			}
		}
		return runes
	}

	var (
		// errIncorrectLegacyRange is recorded when a "<..., First>" line is not
		// followed by a matching "<..., Last>" line.
		errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")

		// reRange matches one line of a legacy rune range. Its groups are:
		// 1 the hexadecimal code point, 2 the name before the comma, 3 the word
		// after the comma ("First" or "Last"), 4 the remainder of the line.
		// Every group needs its `*` quantifier: without it each group matches
		// exactly one character and no real UnicodeData.txt line can match.
		reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
	)

	// Range returns field i as a rune range that is inclusive at both ends. A
	// field holding a single rune yields first == last. The legacy range format
	// of UnicodeData.txt is supported.
	//
	// While range expansion is active (KeepRanges not set), i is ignored and the
	// current rune of the expanded range is returned as both first and last.
	func (p *Parser) Range(i int) (first, last rune) {
		if p.keepRanges {
			return p.getRange(i)
		}
		return p.rangeStart, p.rangeStart
	}

	func (p *Parser) getRange(i int) (first, last rune) {
	b := p.getField(i)
	if k := bytes.Index(b, []byte("..")); k != -1 {
	return p.parseRune(b[:k]), p.parseRune(b[k+2:])
	}
	// The first field may not be a rune, in which case we may ignore any error
	// and set the range as 0..0.
	x, err := parseRune(b)
	if err != nil {
	// Disable range parsing henceforth. This ensures that an error will be
	// returned if the user subsequently will try to parse this field as
	// a Rune.
	p.keepRanges = true
	}
	// Special case for UnicodeData that was retained for backwards compatibility.
	if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
	if p.parsedRange {
	return p.rangeStart, p.rangeEnd
	}
	mf := reRange.FindStringSubmatch(p.scanner.Text())
	if mf == nil \|\| !p.scanner.Scan() {
	p.setError(errIncorrectLegacyRange)
	return x, x
	}
	// Using Bytes would be more efficient here, but Text is a lot easier
	// and this is not a frequent case.
	ml := reRange.FindStringSubmatch(p.scanner.Text())
	if ml == nil \|\| mf[2] != ml[2] \|\| ml[3] != "Last" \|\| mf[4] != ml[4] {
	p.setError(errIncorrectLegacyRange)
	return x, x
	}
	p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
	p.parsedRange = true
	return p.rangeStart, p.rangeEnd
	}
	return x, x
	}

	// bools recognizes all valid UCD boolean values. The keys are matched
	// exactly (case-sensitive); the empty string counts as false.
	var bools = map[string]bool{
	"": false,
	"N": false,
	"No": false,
	"F": false,
	"False": false,
	"Y": true,
	"Yes": true,
	"T": true,
	"True": true,
	}

	// Bool parses and returns field i as a boolean value. The accepted spellings
	// are the keys of bools; an empty or missing field is false. Any other value
	// records strconv.ErrSyntax on the parser and yields false.
	func (p *Parser) Bool(i int) bool {
		// Look the field up directly instead of comparing it against every map
		// entry in turn.
		if v, ok := bools[string(p.getField(i))]; ok {
			return v
		}
		p.setError(strconv.ErrSyntax)
		return false
	}

	// Int parses and returns field i as a decimal integer value. A field that is
	// missing, malformed, or out of range for int records the strconv error on
	// the parser.
	func (p *Parser) Int(i int) int {
		// Bit size 0 means "size of int", so a value that does not fit is
		// reported as a range error instead of being silently truncated by the
		// conversion on 32-bit platforms.
		x, err := strconv.ParseInt(string(p.getField(i)), 10, 0)
		p.setError(err)
		return int(x)
	}

	// Uint parses and returns field i as an unsigned decimal integer value. A
	// field that is missing, malformed, or out of range for uint records the
	// strconv error on the parser.
	func (p *Parser) Uint(i int) uint {
		// Bit size 0 means "size of uint", so a value that does not fit is
		// reported as a range error instead of being silently truncated by the
		// conversion on 32-bit platforms.
		x, err := strconv.ParseUint(string(p.getField(i)), 10, 0)
		p.setError(err)
		return uint(x)
	}

	// Float returns field i parsed as a 64-bit floating-point number. A parse
	// failure is recorded on the parser.
	func (p *Parser) Float(i int) float64 {
		text := string(p.getField(i))
		value, err := strconv.ParseFloat(text, 64)
		p.setError(err)
		return value
	}

	// String returns field i as a string. A missing field yields the empty
	// string.
	func (p *Parser) String(i int) string {
		raw := p.getField(i)
		return string(raw)
	}

	// Strings returns field i split on single spaces, with each piece trimmed of
	// surrounding white space. Consecutive spaces therefore produce empty
	// strings, and an empty field yields a slice holding one empty string.
	func (p *Parser) Strings(i int) []string {
		parts := strings.Split(string(p.getField(i)), " ")
		for k := range parts {
			parts[k] = strings.TrimSpace(parts[k])
		}
		return parts
	}

	// Comment returns the trailing comment of the current line, that is the
	// trimmed text after its first '#', or the empty string if there is none.
	func (p *Parser) Comment() string {
		text := p.comment
		return string(text)
	}

	// errUndefinedEnum is recorded by Enum when a field matches none of the
	// allowed values.
	var errUndefinedEnum = errors.New("ucd: undefined enum value")

	// Enum returns field i if it is exactly equal to one of the values in enum.
	// Otherwise it records errUndefinedEnum on the parser and returns the empty
	// string.
	func (p *Parser) Enum(i int, enum ...string) string {
		field := string(p.getField(i))
		for k := range enum {
			if enum[k] == field {
				return enum[k]
			}
		}
		p.setError(errUndefinedEnum)
		return ""
	}

	// bstrEq reports whether the byte slice b and the string s hold the same
	// bytes. A nil or empty b equals the empty string.
	func bstrEq(b []byte, s string) bool {
		// The built-in comparison replaces the hand-written byte loop; the
		// compiler performs this conversion-for-comparison without copying b.
		return string(b) == s
	}