collate/regtest.go - text - Git at Google

 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // +build ignore

 package main

 import (
 	"archive/zip"
 	"bufio"
 	"bytes"
 	"flag"
 	"fmt"
 	"io"
 	"io/ioutil"
 	"log"
 	"net/http"
 	"os"
 	"path"
 	"regexp"
 	"strconv"
 	"strings"
 	"unicode"
 	"unicode/utf8"

 	"golang.org/x/text/collate"
 	"golang.org/x/text/collate/build"
 	"golang.org/x/text/language"
 )

 // This regression test runs tests for the test files in CollationTest.zip
 // (taken from http://www.unicode.org/Public/UCA/<unicode.Version>/).
 //
 // The test files have the following form:
 // # header
 // 0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
 // 0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
 // 000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
 // 000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
 //
 // The part before the semicolon is the hex representation of a sequence
 // of runes. After the hash mark is a comment. The strings
 // represented by rune sequence are in the file in sorted order, as
 // defined by the DUCET.

 // Command-line flags selecting where the Unicode data files come from.
 var (
 	// testdata is the URL of the zip archive containing the collation tests.
 	testdata = flag.String(
 		"testdata",
 		"http://www.unicode.org/Public/UCA/"+unicode.Version+"/CollationTest.zip",
 		"URL of Unicode collation tests zip file",
 	)

 	// ducet is the URL of the allkeys.txt table read by parseUCA.
 	ducet = flag.String(
 		"ducet",
 		"http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
 		"URL of the Default Unicode Collation Element Table (DUCET).",
 	)

 	// localFiles makes openReader read the data files from the current
 	// directory instead of fetching them over the network.
 	localFiles = flag.Bool(
 		"local",
 		false,
 		"data files have been copied to the current directory; for debugging only",
 	)
 )

 // Test holds the parsed contents of one file from the collation test archive.
 type Test struct {
 	// name is the base name of the archive entry the test was read from.
 	name    string
 	// str holds the UTF-8 encoded test strings in the order they appear in
 	// the file; entries containing invalid runes are left out.
 	str     [][]byte
 	// comment holds, for each entry of str, the text following the '#' on
 	// the corresponding line.
 	comment []string
 }

 // Patterns applied to the lines of a collation test file.
 var (
 	// versionRe captures the version string of a "# UCA Version: ..." line.
 	versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)

 	// testRe captures the space-separated hex code points before the
 	// semicolon and the comment text after the '#'.
 	testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)
 )

 // Error terminates the program through log.Fatal when e is non-nil and
 // does nothing otherwise.
 func Error(e error) {
 	if e == nil {
 		return
 	}
 	log.Fatal(e)
 }

 // openReader opens the URL given by url and returns the response body,
 // which the caller must close.
 //
 // When the -local flag is set, url is replaced by a file:// URL naming the
 // file with the same base name in the current working directory. A "file"
 // protocol handler rooted at "/" is registered on the transport so that
 // such URLs are served through the same http.Client.
 //
 // Every failure (working directory lookup, GET error, non-200 status)
 // terminates the program via Error, so openReader never returns nil.
 func openReader(url string) io.ReadCloser {
 	if *localFiles {
 		pwd, err := os.Getwd()
 		Error(err)
 		url = "file://" + path.Join(pwd, path.Base(url))
 	}
 	t := &http.Transport{}
 	t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
 	c := &http.Client{Transport: t}
 	resp, err := c.Get(url)
 	Error(err)
 	if resp.StatusCode != http.StatusOK {
 		// Release the connection before reporting the failure.
 		resp.Body.Close()
 		Error(fmt.Errorf(`bad GET status for "%s": %s`, url, resp.Status))
 	}
 	return resp.Body
 }

 // parseUCA parses a Default Unicode Collation Element Table of the format
 // specified in http://www.unicode.org/reports/tr10/#File_Format, read from
 // the URL given by the -ducet flag, and adds every entry to builder.
 //
 // Blank lines and lines starting with '#' are skipped. An "@version" line
 // must name unicode.Version; other '@' lines are ignored. Each remaining
 // line must have the form "<hex runes> ; <collation elements>", where a
 // collation element is "[.w1.w2...]" or "[*w1.w2...]". The indices of the
 // elements introduced by '*' are passed to builder.Add as vars.
 //
 // The function returns nothing; any malformed line terminates the program
 // with a message prefixed by the 1-based line number.
 func parseUCA(builder *build.Builder) {
 	r := openReader(*ducet)
 	defer r.Close()
 	input := bufio.NewReader(r)
 	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
 	for i := 1; true; i++ {
 		l, prefix, err := input.ReadLine()
 		if err == io.EOF {
 			break
 		}
 		Error(err)
 		line := string(l)
 		if prefix {
 			// The line did not fit in the reader's buffer.
 			log.Fatalf("%d: buffer overflow", i)
 		}
 		if len(line) == 0 || line[0] == '#' {
 			continue
 		}
 		if line[0] == '@' {
 			if strings.HasPrefix(line[1:], "version ") {
 				if v := strings.Split(line[1:], " ")[1]; v != unicode.Version {
 					log.Fatalf("incompatible version %s; want %s", v, unicode.Version)
 				}
 			}
 			continue
 		}
 		// parse entries
 		part := strings.Split(line, " ; ")
 		if len(part) != 2 {
 			log.Fatalf("%d: production rule without ';': %v", i, line)
 		}
 		lhs := []rune{}
 		for _, v := range strings.Split(part[0], " ") {
 			if v != "" {
 				lhs = append(lhs, rune(convHex(i, v)))
 			}
 		}
 		vars := []int{}
 		rhs := [][]int{}
 		// The element index gets its own name so that i keeps denoting the
 		// line number handed to convHex for error reporting.
 		for n, m := range colelem.FindAllStringSubmatch(part[1], -1) {
 			if m[1] == "*" {
 				vars = append(vars, n)
 			}
 			elem := []int{}
 			for _, h := range strings.Split(m[2], ".") {
 				elem = append(elem, convHex(i, h))
 			}
 			rhs = append(rhs, elem)
 		}
 		// NOTE(review): any result of builder.Add is not inspected here —
 		// confirm against the build package whether it reports errors.
 		builder.Add(lhs, rhs, vars)
 	}
 }

 // convHex parses s as a hexadecimal number that fits in a signed 32-bit
 // integer and returns it. If s cannot be parsed the program is terminated
 // with a message prefixed by line.
 func convHex(line int, s string) int {
 	value, err := strconv.ParseInt(s, 16, 32)
 	if err == nil {
 		return int(value)
 	}
 	log.Fatalf("%d: %v", line, err)
 	return int(value) // not reached: log.Fatalf exits the program
 }

 // loadTestData fetches the zip archive named by the -testdata flag and
 // parses every entry that is neither a directory nor a "SHORT" variant
 // into a Test. The result has one Test per parsed entry, in archive order.
 //
 // Within an entry, lines of at most one byte and lines starting with '#'
 // are skipped; a "# UCA Version:" line that disagrees with unicode.Version
 // only produces a warning. Every other line must match testRe. Any read or
 // parse failure terminates the program.
 func loadTestData() []Test {
 	f := openReader(*testdata)
 	buffer, err := ioutil.ReadAll(f)
 	f.Close()
 	Error(err)
 	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
 	Error(err)
 	tests := []Test{}
 	for _, f := range archive.File {
 		// Skip the short versions, which are simply duplicates of the long versions.
 		if strings.Contains(f.Name, "SHORT") || f.FileInfo().IsDir() {
 			continue
 		}
 		ff, err := f.Open()
 		Error(err)
 		scanner := bufio.NewScanner(ff)
 		test := Test{name: path.Base(f.Name)}
 		for scanner.Scan() {
 			line := scanner.Text()
 			if len(line) <= 1 || line[0] == '#' {
 				if m := versionRe.FindStringSubmatch(line); m != nil && m[1] != unicode.Version {
 					log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], unicode.Version)
 				}
 				continue
 			}
 			m := testRe.FindStringSubmatch(line)
 			if m == nil {
 				log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
 			}
 			str := []byte{}
 			// In the regression test data (unpaired) surrogates are assigned a weight
 			// corresponding to their code point value.  However, utf8.DecodeRune,
 			// which is used to compute the implicit weight, assigns FFFD to surrogates.
 			// We therefore skip tests with surrogates.  This skips about 35 entries
 			// per test.
 			valid := true
 			for _, split := range strings.Split(m[1], " ") {
 				// Parse with a 32-bit limit so that an oversized value is
 				// reported instead of being truncated by the rune conversion.
 				r, err := strconv.ParseUint(split, 16, 32)
 				Error(err)
 				valid = valid && utf8.ValidRune(rune(r))
 				str = append(str, string(rune(r))...)
 			}
 			if valid {
 				test.str = append(test.str, str)
 				test.comment = append(test.comment, m[2])
 			}
 		}
 		// Close the entry as soon as it has been scanned; a defer here would
 		// keep every entry open until the whole archive has been processed.
 		ff.Close()
 		Error(scanner.Err())
 		tests = append(tests, test)
 	}
 	return tests
 }

 // errorCount is the number of failures reported through fail so far; main
 // prints PASS only when it is still zero.
 var errorCount int

 // fail logs a failure for test t. The message is "error:<test name>:"
 // followed by pattern formatted with args. It also increments errorCount
 // and terminates the program once more than 30 failures have been reported.
 func fail(t Test, pattern string, args ...interface{}) {
 	// The test name is passed as an argument instead of being spliced into
 	// the format string, so a '%' in the name cannot be misread as a verb.
 	all := make([]interface{}, 0, len(args)+1)
 	all = append(all, t.name)
 	all = append(all, args...)
 	log.Printf("error:%s:"+pattern, all...)
 	errorCount++
 	if errorCount > 30 {
 		log.Fatal("too many errors")
 	}
 }

 // runes decodes the UTF-8 bytes in b and returns the resulting runes.
 func runes(b []byte) []rune {
 	text := string(b)
 	return []rune(text)
 }

 // shifted is the language tag doTest uses for every test whose name does
 // not contain "NON_IGNOR"; its -u- extension carries the ka-shifted and
 // ks-level4 collation settings.
 var shifted = language.MustParse("und-u-ka-shifted-ks-level4")

 // doTest checks that the strings of t are in non-decreasing collation order.
 //
 // It builds a collation table from the DUCET (via parseUCA) and creates a
 // collator from it, using the shifted tag unless the test name contains
 // "NON_IGNOR". For each pair of adjacent strings it verifies that the sort
 // key of the first does not compare greater than the key of the second and
 // that Compare agrees in both directions. Violations are reported through
 // fail. A test without any strings is skipped with a warning.
 func doTest(t Test) {
 	if len(t.str) == 0 {
 		// Nothing to compare; indexing t.str[0] below would panic.
 		log.Printf("warning:%s: no test strings", t.name)
 		return
 	}
 	bld := build.NewBuilder()
 	parseUCA(bld)
 	w, err := bld.Build()
 	Error(err)
 	var tag language.Tag
 	if !strings.Contains(t.name, "NON_IGNOR") {
 		tag = shifted
 	}
 	c := collate.NewFromTable(w, collate.OptionsFromTag(tag))
 	b := &collate.Buffer{}
 	prev := t.str[0]
 	for i := 1; i < len(t.str); i++ {
 		// Both keys of this iteration are produced with the same buffer, so
 		// it is reset once per pair.
 		b.Reset()
 		s := t.str[i]
 		ka := c.Key(b, prev)
 		kb := c.Key(b, s)
 		if r := bytes.Compare(ka, kb); r == 1 {
 			fail(t, "%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", i, runes(prev), runes(s), ka, kb, r)
 			prev = s
 			continue
 		}
 		if r := c.Compare(prev, s); r == 1 {
 			fail(t, "%d: Compare(%.4X, %.4X) == %d; want -1 or 0", i, runes(prev), runes(s), r)
 		}
 		if r := c.Compare(s, prev); r == -1 {
 			fail(t, "%d: Compare(%.4X, %.4X) == %d; want 1 or 0", i, runes(s), runes(prev), r)
 		}
 		prev = s
 	}
 }

 // main parses the command-line flags, runs every test returned by
 // loadTestData and prints PASS when no failure was reported. When failures
 // were reported it prints FAIL and exits with status 1 so that callers can
 // detect the failure from the exit code.
 func main() {
 	flag.Parse()
 	for _, test := range loadTestData() {
 		doTest(test)
 	}
 	if errorCount != 0 {
 		fmt.Println("FAIL")
 		os.Exit(1)
 	}
 	fmt.Println("PASS")
 }
	// Copyright 2012 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// +build ignore

	package main

	import (
	"archive/zip"
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"path"
	"regexp"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/collate"
	"golang.org/x/text/collate/build"
	"golang.org/x/text/language"
	)

	// This regression test runs tests for the test files in CollationTest.zip
	// (taken from http://www.unicode.org/Public/UCA/<unicode.Version>/).
	//
	// The test files have the following form:
	// # header
	// 0009 0021;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 025E]
	// 0009 003F;	# ('\u0009') <CHARACTER TABULATION>	[| | | 0201 0263]
	// 000A 0021;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 025E]
	// 000A 003F;	# ('\u000A') <LINE FEED (LF)>	[| | | 0202 0263]
	//
	// The part before the semicolon is the hex representation of a sequence
	// of runes. After the hash mark is a comment. The strings
	// represented by rune sequence are in the file in sorted order, as
	// defined by the DUCET.

	// Command-line flags selecting where the Unicode data files come from.
	var (
		// testdata is the URL of the zip archive containing the collation tests.
		testdata = flag.String(
			"testdata",
			"http://www.unicode.org/Public/UCA/"+unicode.Version+"/CollationTest.zip",
			"URL of Unicode collation tests zip file",
		)

		// ducet is the URL of the allkeys.txt table read by parseUCA.
		ducet = flag.String(
			"ducet",
			"http://unicode.org/Public/UCA/"+unicode.Version+"/allkeys.txt",
			"URL of the Default Unicode Collation Element Table (DUCET).",
		)

		// localFiles makes openReader read the data files from the current
		// directory instead of fetching them over the network.
		localFiles = flag.Bool(
			"local",
			false,
			"data files have been copied to the current directory; for debugging only",
		)
	)

	// Test holds the parsed contents of one file from the collation test archive.
	type Test struct {
	// name is the base name of the archive entry the test was read from.
	name string
	// str holds the UTF-8 encoded test strings in the order they appear in
	// the file; entries containing invalid runes are left out.
	str [][]byte
	// comment holds, for each entry of str, the text following the '#' on
	// the corresponding line.
	comment []string
	}

	// versionRe captures the version string of a "# UCA Version: ..." line.
	var versionRe = regexp.MustCompile(`# UCA Version: (.*)\n?$`)

	// testRe captures the space-separated hex code points before the
	// semicolon and the comment text after the '#'. Both wildcards need the
	// '*' quantifier: arbitrary text may separate ';' from '#', and the
	// comment has arbitrary length.
	var testRe = regexp.MustCompile(`^([\dA-F ]+);.*# (.*)\n?$`)

	// Error terminates the program through log.Fatal when e is non-nil and
	// does nothing otherwise.
	func Error(e error) {
		if e == nil {
			return
		}
		log.Fatal(e)
	}

	// openReader opens the URL given by url and returns the response body,
	// which the caller must close.
	//
	// When the -local flag is set, url is replaced by a file:// URL naming the
	// file with the same base name in the current working directory. A "file"
	// protocol handler rooted at "/" is registered on the transport so that
	// such URLs are served through the same http.Client.
	//
	// Every failure (working directory lookup, GET error, non-200 status)
	// terminates the program via Error, so openReader never returns nil.
	func openReader(url string) io.ReadCloser {
		if *localFiles {
			pwd, err := os.Getwd()
			Error(err)
			url = "file://" + path.Join(pwd, path.Base(url))
		}
		t := &http.Transport{}
		t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
		c := &http.Client{Transport: t}
		resp, err := c.Get(url)
		Error(err)
		if resp.StatusCode != http.StatusOK {
			// Release the connection before reporting the failure.
			resp.Body.Close()
			Error(fmt.Errorf(`bad GET status for "%s": %s`, url, resp.Status))
		}
		return resp.Body
	}

	// parseUCA parses a Default Unicode Collation Element Table of the format
	// specified in http://www.unicode.org/reports/tr10/#File_Format.
	// It returns the variable top.
	func parseUCA(builder *build.Builder) {
	r := openReader(*ducet)
	defer r.Close()
	input := bufio.NewReader(r)
	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
	for i := 1; true; i++ {
	l, prefix, err := input.ReadLine()
	if err == io.EOF {
	break
	}
	Error(err)
	line := string(l)
	if prefix {
	log.Fatalf("%d: buffer overflow", i)
	}
	if len(line) == 0 \|\| line[0] == '#' {
	continue
	}
	if line[0] == '@' {
	if strings.HasPrefix(line[1:], "version ") {
	if v := strings.Split(line[1:], " ")[1]; v != unicode.Version {
	log.Fatalf("incompatible version %s; want %s", v, unicode.Version)
	}
	}
	} else {
	// parse entries
	part := strings.Split(line, " ; ")
	if len(part) != 2 {
	log.Fatalf("%d: production rule without ';': %v", i, line)
	}
	lhs := []rune{}
	for _, v := range strings.Split(part[0], " ") {
	if v != "" {
	lhs = append(lhs, rune(convHex(i, v)))
	}
	}
	vars := []int{}
	rhs := [][]int{}
	for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
	if m[1] == "*" {
	vars = append(vars, i)
	}
	elem := []int{}
	for _, h := range strings.Split(m[2], ".") {
	elem = append(elem, convHex(i, h))
	}
	rhs = append(rhs, elem)
	}
	builder.Add(lhs, rhs, vars)
	}
	}
	}

	// convHex parses s as a hexadecimal number that fits in a signed 32-bit
	// integer and returns it. If s cannot be parsed the program is terminated
	// with a message prefixed by line.
	func convHex(line int, s string) int {
		value, err := strconv.ParseInt(s, 16, 32)
		if err == nil {
			return int(value)
		}
		log.Fatalf("%d: %v", line, err)
		return int(value) // not reached: log.Fatalf exits the program
	}

	func loadTestData() []Test {
	f := openReader(*testdata)
	buffer, err := ioutil.ReadAll(f)
	f.Close()
	Error(err)
	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
	Error(err)
	tests := []Test{}
	for _, f := range archive.File {
	// Skip the short versions, which are simply duplicates of the long versions.
	if strings.Contains(f.Name, "SHORT") \|\| f.FileInfo().IsDir() {
	continue
	}
	ff, err := f.Open()
	Error(err)
	defer ff.Close()
	scanner := bufio.NewScanner(ff)
	test := Test{name: path.Base(f.Name)}
	for scanner.Scan() {
	line := scanner.Text()
	if len(line) <= 1 \|\| line[0] == '#' {
	if m := versionRe.FindStringSubmatch(line); m != nil {
	if m[1] != unicode.Version {
	log.Printf("warning:%s: version is %s; want %s", f.Name, m[1], unicode.Version)
	}
	}
	continue
	}
	m := testRe.FindStringSubmatch(line)
	if m == nil \|\| len(m) < 3 {
	log.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
	}
	str := []byte{}
	// In the regression test data (unpaired) surrogates are assigned a weight
	// corresponding to their code point value. However, utf8.DecodeRune,
	// which is used to compute the implicit weight, assigns FFFD to surrogates.
	// We therefore skip tests with surrogates. This skips about 35 entries
	// per test.
	valid := true
	for _, split := range strings.Split(m[1], " ") {
	r, err := strconv.ParseUint(split, 16, 64)
	Error(err)
	valid = valid && utf8.ValidRune(rune(r))
	str = append(str, string(rune(r))...)
	}
	if valid {
	test.str = append(test.str, str)
	test.comment = append(test.comment, m[2])
	}
	}
	if scanner.Err() != nil {
	log.Fatal(scanner.Err())
	}
	tests = append(tests, test)
	}
	return tests
	}

	// errorCount is the number of failures reported through fail so far; main
	// prints PASS only when it is still zero.
	var errorCount int

	// fail logs a failure for test t. The message is "error:<test name>:"
	// followed by pattern formatted with args. It also increments errorCount
	// and terminates the program once more than 30 failures have been reported.
	func fail(t Test, pattern string, args ...interface{}) {
		// The test name is passed as an argument instead of being spliced into
		// the format string, so a '%' in the name cannot be misread as a verb.
		all := make([]interface{}, 0, len(args)+1)
		all = append(all, t.name)
		all = append(all, args...)
		log.Printf("error:%s:"+pattern, all...)
		errorCount++
		if errorCount > 30 {
			log.Fatal("too many errors")
		}
	}

	// runes decodes the UTF-8 bytes in b and returns the resulting runes.
	func runes(b []byte) []rune {
		text := string(b)
		return []rune(text)
	}

	// shifted is the language tag doTest uses for every test whose name does
	// not contain "NON_IGNOR"; its -u- extension carries the ka-shifted and
	// ks-level4 collation settings.
	var shifted = language.MustParse("und-u-ka-shifted-ks-level4")

	// doTest checks that the strings of t are in non-decreasing collation order.
	//
	// It builds a collation table from the DUCET (via parseUCA) and creates a
	// collator from it, using the shifted tag unless the test name contains
	// "NON_IGNOR". For each pair of adjacent strings it verifies that the sort
	// key of the first does not compare greater than the key of the second and
	// that Compare agrees in both directions. Violations are reported through
	// fail. A test without any strings is skipped with a warning.
	func doTest(t Test) {
		if len(t.str) == 0 {
			// Nothing to compare; indexing t.str[0] below would panic.
			log.Printf("warning:%s: no test strings", t.name)
			return
		}
		bld := build.NewBuilder()
		parseUCA(bld)
		w, err := bld.Build()
		Error(err)
		var tag language.Tag
		if !strings.Contains(t.name, "NON_IGNOR") {
			tag = shifted
		}
		c := collate.NewFromTable(w, collate.OptionsFromTag(tag))
		b := &collate.Buffer{}
		prev := t.str[0]
		for i := 1; i < len(t.str); i++ {
			// Both keys of this iteration are produced with the same buffer, so
			// it is reset once per pair.
			b.Reset()
			s := t.str[i]
			ka := c.Key(b, prev)
			kb := c.Key(b, s)
			if r := bytes.Compare(ka, kb); r == 1 {
				fail(t, "%d: Key(%.4X) < Key(%.4X) (%X < %X) == %d; want -1 or 0", i, runes(prev), runes(s), ka, kb, r)
				prev = s
				continue
			}
			if r := c.Compare(prev, s); r == 1 {
				fail(t, "%d: Compare(%.4X, %.4X) == %d; want -1 or 0", i, runes(prev), runes(s), r)
			}
			if r := c.Compare(s, prev); r == -1 {
				fail(t, "%d: Compare(%.4X, %.4X) == %d; want 1 or 0", i, runes(s), runes(prev), r)
			}
			prev = s
		}
	}

	// main parses the command-line flags, runs every test returned by
	// loadTestData and prints PASS when no failure was reported. When failures
	// were reported it prints FAIL and exits with status 1 so that callers can
	// detect the failure from the exit code.
	func main() {
		flag.Parse()
		for _, test := range loadTestData() {
			doTest(test)
		}
		if errorCount != 0 {
			fmt.Println("FAIL")
			os.Exit(1)
		}
		fmt.Println("PASS")
	}