blob: 8ac39118daaf67c11c979283bf41e9d14f15c06c [file] [log] [blame]
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cases
import (
"bytes"
"fmt"
"path"
"strings"
"testing"
"unicode/utf8"
"golang.org/x/text/internal/testtext"
"golang.org/x/text/language"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
// testCase is one entry of the testCases table consumed by TestCaseMappings.
type testCase struct {
	// lang holds one or more language tags separated by single spaces; the
	// entry is run once per tag.
	lang string

	// src is the input text. title, lower and upper are the expected results
	// of the title, lower and upper casers; a nil expectation is not
	// compared. Each value is either a string, which is split into words on
	// single spaces, or a []string. (A nil src is meant to skip the entry.)
	src, title, lower, upper interface{}

	// opts is passed to the caser constructors together with the language.
	opts options
}
// testCases is the table driving TestCaseMappings. Each entry names the
// language tag(s) to run under, the source text and the expected title, lower
// and upper case results. Expectations that are left unset are not compared.
// String values are split on single spaces and compared word by word.
var testCases = []testCase{
0: {
// Final sigma handling explicitly disabled.
lang: "und",
src: "abc aBc ABC abC İsıI ΕΣΆΣ",
title: "Abc Abc Abc Abc İsıi Εσάσ",
lower: "abc abc abc abc i\u0307sıi εσάσ",
upper: "ABC ABC ABC ABC İSII ΕΣΆΣ",
opts: getOpts(HandleFinalSigma(false)),
},
1: {
// Final sigma handling explicitly enabled.
lang: "und",
src: "abc aBc ABC abC İsıI ΕΣΆΣ Σ _Σ -Σ",
title: "Abc Abc Abc Abc İsıi Εσάς Σ _Σ -Σ",
lower: "abc abc abc abc i\u0307sıi εσάς σ _σ -σ",
upper: "ABC ABC ABC ABC İSII ΕΣΆΣ Σ _Σ -Σ",
opts: getOpts(HandleFinalSigma(true)),
},
2: { // Title cased runes.
lang: supported,
src: "DžA",
title: "Dža",
lower: "dža",
upper: "DŽA",
},
3: {
// Title breaking.
lang: supported,
src: []string{
"FOO CASE TEST",
"DON'T DO THiS",
"χωΡΊΣ χωΡΊΣ^a χωΡΊΣ:a χωΡΊΣ:^a χωΡΊΣ^ όμΩΣ Σ",
"with-hyphens",
"49ers 49ers",
`"capitalize a^a -hyphen 0X _u a_u:a`,
"MidNumLet a.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
"MidNum a,b;c\u037ed\u0589e\u060cf\u2044g\ufe50h",
"\u0345 x\u3031x x\u05d0x \u05d0x a'.a a.a a4,a",
},
title: []string{
"Foo Case Test",
"Don't Do This",
"Χωρίς Χωρίσ^A Χωρίσ:a Χωρίσ:^A Χωρίς^ Όμως Σ",
"With-Hyphens",
// Note that 49Ers is correct according to the spec.
// TODO: provide some option to the user to treat different
// characters as cased.
"49Ers 49Ers",
`"Capitalize A^A -Hyphen 0X _U A_u:a`,
"Midnumlet A.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
"Midnum A,B;C\u037eD\u0589E\u060cF\u2044G\ufe50H",
"\u0399 X\u3031X X\u05d0x \u05d0X A'.A A.a A4,A",
},
},
// TODO: These are known deviations from the Unicode Word Breaking
// Algorithm.
// {
// "und",
// "x_\u3031_x a4,4a",
// "X_\u3031_x A4,4a", // Currently is "X_\U3031_X A4,4A".
// "x_\u3031_x a4,4a",
// "X_\u3031_X A4,4A",
// options{},
// },
4: {
// Tests title options
lang: "und",
src: "abc aBc ABC abC İsıI o'Brien",
title: "Abc ABc ABC AbC İsıI O'Brien",
opts: getOpts(NoLower),
},
5: {
lang: "el",
src: "aBc ΟΔΌΣ Οδός Σο ΣΟ Σ oΣ ΟΣ σ ἕξ \u03ac",
title: "Abc Οδός Οδός Σο Σο Σ Oς Ος Σ Ἕξ \u0386",
lower: "abc οδός οδός σο σο σ oς ος σ ἕξ \u03ac",
upper: "ABC ΟΔΟΣ ΟΔΟΣ ΣΟ ΣΟ Σ OΣ ΟΣ Σ ΕΞ \u0391", // Uppercase removes accents
},
6: {
// Run once for "tr" and once for "az".
lang: "tr az",
src: "Isiİ İsıI I\u0307sIiİ İsıI\u0307 I\u0300\u0307",
title: "Isii İsıı I\u0307sıii İsıi I\u0300\u0307",
lower: "ısii isıı isıii isıi \u0131\u0300\u0307",
upper: "ISİİ İSII I\u0307SIİİ İSII\u0307 I\u0300\u0307",
},
7: {
lang: "lt",
src: "I Ï J J̈ Į Į̈ Ì Í Ĩ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
title: "I Ï J J̈ Į Į̈ Ì Í Ĩ Xi̇̈ Xj̇̈ Xį̇̈ Xi̇̀ Xi̇́ Xi̇̃ Xi Xi̇̈ Xj Xj̇̈ Xį Xį̇̈ Xi̟̤",
lower: "i i̇̈ j j̇̈ į į̇̈ i̇̀ i̇́ i̇̃ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ xi xi̇̈ xj xj̇̈ xį xį̇̈ xi̟̤",
upper: "I Ï J J̈ Į Į̈ Ì Í Ĩ XÏ XJ̈ XĮ̈ XÌ XÍ XĨ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
},
8: {
lang: "lt",
src: "\u012e\u0300 \u00cc i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
title: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
lower: "\u012f\u0307\u0300 i\u0307\u0300 i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
upper: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
},
9: {
// Only the title expectation is set for Dutch.
lang: "nl",
src: "ijs IJs Ij Ijs İJ İJs aa aA 'ns 'S",
title: "IJs IJs IJ IJs İj İjs Aa Aa 'ns 's",
},
// Note: this specification is not currently part of CLDR. The same holds
// for the leading apostrophe handling for Dutch.
// See http://unicode.org/cldr/trac/ticket/7078.
10: {
lang: "af",
src: "wag 'n bietjie",
title: "Wag 'n Bietjie",
lower: "wag 'n bietjie",
upper: "WAG 'N BIETJIE",
},
}
// TestCaseMappings runs every entry of testCases through the upper, lower and
// title casers of each language listed in the entry.
//
// For each caser it compares c.String of every input word against the
// matching expected word (when an expectation is given), and then checks the
// number of allocations made by Transform over the space-joined input.
func TestCaseMappings(t *testing.T) {
	for i, tt := range testCases {
		// A nil src skips the entry; without this guard the type assertions
		// below would panic.
		if tt.src == nil {
			continue
		}
		src, ok := tt.src.([]string)
		if !ok {
			src = strings.Split(tt.src.(string), " ")
		}
		// Input for the allocation check. It does not depend on the language
		// or the caser, so it is built once per entry.
		joined := []byte(strings.Join(src, " "))

		for _, lang := range strings.Split(tt.lang, " ") {
			tag := language.MustParse(lang)
			testEntry := func(name string, mk func(language.Tag, options) transform.SpanningTransformer, gold interface{}) {
				c := Caser{mk(tag, tt.opts)}
				if gold != nil {
					wants, ok := gold.([]string)
					if !ok {
						wants = strings.Split(gold.(string), " ")
					}
					// A malformed table entry is reported as a test failure
					// rather than as an index-out-of-range panic (too many
					// expectations) or silently unchecked inputs (too few).
					n := len(wants)
					if n != len(src) {
						t.Errorf("%d:%s:\n%s: table has %d expected values for %d inputs", i, lang, name, len(wants), len(src))
						if len(src) < n {
							n = len(src)
						}
					}
					for j, want := range wants[:n] {
						if got := c.String(src[j]); got != want {
							t.Errorf("%d:%s:\n%s.String(%+q):\ngot %+q;\nwant %+q", i, lang, name, src[j], got, want)
						}
					}
				}
				dst := make([]byte, 256) // big enough to hold any result
				v := testtext.AllocsPerRun(20, func() {
					c.Transform(dst, joined, true)
				})
				// NOTE(review): the threshold tolerates one allocation per run
				// although the message says "want 0" - confirm which is
				// intended.
				if v > 1.1 {
					t.Errorf("%d:%s:\n%s: number of allocs was %f; want 0", i, lang, name, v)
				}
			}
			testEntry("Upper", makeUpper, tt.upper)
			testEntry("Lower", makeLower, tt.lower)
			testEntry("Title", makeTitle, tt.title)
		}
	}
}
// TestAlloc tests that some mapping methods should not cause any allocation.
func TestAlloc(t *testing.T) {
dst := make([]byte, 256) // big enough to hold any result
src := []byte(txtNonASCII)
for i, f := range []func() Caser{
func() Caser { return Upper(language.Und) },
func() Caser { return Lower(language.Und) },
func() Caser { return Lower(language.Und, HandleFinalSigma(false)) },
// TODO: use a shared copy for these casers as well, in order of
// importance, starting with the most important:
// func() Caser { return Title(language.Und) },
// func() Caser { return Title(language.Und, HandleFinalSigma(false)) },
} {
testtext.Run(t, "", func(t *testing.T) {
var c Caser
v := testtext.AllocsPerRun(10, func() {
c = f()
})
if v > 0 {
// TODO: Right now only Upper has 1 allocation. Special-case Lower
// and Title as well to have less allocations for the root locale.
t.Errorf("%d:init: number of allocs was %f; want 0", i, v)
}
v = testtext.AllocsPerRun(2, func() {
c.Transform(dst, src, true)
})
if v > 0 {
t.Errorf("%d:transform: number of allocs was %f; want 0", i, v)
}
})
}
}
// testHandover checks that a Span over the start of src followed by a
// Transform of the remainder yields the same text as casing src in one go.
//
// It is run for every split point i inside the longest common prefix of src
// and c.String(src): Span is called on src[:i] with atEOF false, the bytes it
// accepts are copied unchanged, and the rest of src is passed to Transform
// with atEOF true. The concatenation must equal c.String(src).
func testHandover(t *testing.T, c Caser, src string) {
	want := c.String(src)

	// Length of the longest common prefix of the input and its cased form.
	prefixLen := 0
	for prefixLen < len(src) && prefixLen < len(want) && src[prefixLen] == want[prefixLen] {
		prefixLen++
	}

	for split := 0; split < prefixLen; split++ {
		testtext.Run(t, fmt.Sprint("interleave/", split), func(t *testing.T) {
			c.Reset()
			spanned, _ := c.Span([]byte(src[:split]), false)

			// The spanned part is taken over verbatim; everything after it is
			// produced by Transform.
			out := make([]byte, 4*len(src))
			copy(out, src[:spanned])
			written, _, _ := c.Transform(out[spanned:], []byte(src[spanned:]), true)

			if got := string(out[:spanned+written]); got != want {
				t.Errorf("full string: got %q; want %q", got, want)
			}
		})
	}
}
// TestHandover checks the hand-over from Span to Transform at a fixed split
// point. For every case, Span is called on first (atEOF false), the accepted
// bytes are copied unchanged, and the remainder of first+second is passed to
// Transform (atEOF true). The result must equal casing first+second directly.
func TestHandover(t *testing.T) {
	handoverTests := []struct {
		desc          string
		t             Caser
		first, second string
	}{
		{
			desc:   "title/nosigma/single midword",
			t:      Title(language.Und, HandleFinalSigma(false)),
			first:  "A.",
			second: "a",
		},
		{
			desc:   "title/nosigma/single midword",
			t:      Title(language.Und, HandleFinalSigma(false)),
			first:  "A",
			second: ".a",
		},
		{
			desc:   "title/nosigma/double midword",
			t:      Title(language.Und, HandleFinalSigma(false)),
			first:  "A..",
			second: "a",
		},
		{
			desc:   "title/nosigma/double midword",
			t:      Title(language.Und, HandleFinalSigma(false)),
			first:  "A.",
			second: ".a",
		},
		{
			desc:   "title/nosigma/double midword",
			t:      Title(language.Und, HandleFinalSigma(false)),
			first:  "A",
			second: "..a",
		},
		{
			desc:   "title/sigma/single midword",
			t:      Title(language.Und),
			first:  "ΟΣ.",
			second: "a",
		},
		{
			desc:   "title/sigma/single midword",
			t:      Title(language.Und),
			first:  "ΟΣ",
			second: ".a",
		},
		{
			desc:   "title/sigma/double midword",
			t:      Title(language.Und),
			first:  "ΟΣ..",
			second: "a",
		},
		{
			desc:   "title/sigma/double midword",
			t:      Title(language.Und),
			first:  "ΟΣ.",
			second: ".a",
		},
		{
			desc:   "title/sigma/double midword",
			t:      Title(language.Und),
			first:  "ΟΣ",
			second: "..a",
		},
		{
			desc:   "title/af/leading apostrophe",
			t:      Title(language.Afrikaans),
			first:  "'",
			second: "n bietje",
		},
	}

	for _, tc := range handoverTests {
		testtext.Run(t, tc.desc, func(t *testing.T) {
			full := tc.first + tc.second
			want := tc.t.String(full)

			tc.t.Reset()
			spanned, _ := tc.t.Span([]byte(tc.first), false)

			out := make([]byte, len(want))
			copy(out, tc.first[:spanned])
			written, _, _ := tc.t.Transform(out[spanned:], []byte(full[spanned:]), true)

			if got := string(out[:spanned+written]); got != want {
				t.Errorf("got %q; want %q", got, want)
			}
		})
	}
}
// minBufSize is the buffer size with which the casing operations in this
// package are guaranteed to make progress.
const minBufSize = norm.MaxSegmentSize
// bufferTest is one entry of the bufferTests table used by
// TestShortBuffersAndOverflow.
type bufferTest struct {
	// desc is the name of the subtest.
	desc string
	// src is the input text, fed to the transformer in chunks.
	src string
	// want is the expected concatenation of all transformed chunks.
	want string
	// firstErr is the error expected from Transform calls made at source
	// offset 0.
	firstErr error
	// dstSize is the size of the destination buffer handed to each call.
	dstSize int
	// srcSize is the maximum number of source bytes handed to each call.
	srcSize int
	// t is the transformer under test.
	t transform.SpanningTransformer
}
// bufferTests is the table for TestShortBuffersAndOverflow. It is filled in
// by the init function below.
var bufferTests []bufferTest
// init populates bufferTests. Each entry gives the input, the expected output,
// the destination and source chunk sizes to use, and the error expected from
// the first Transform call (nil when firstErr is not set).
func init() {
bufferTests = []bufferTest{{
desc: "und/upper/short dst",
src: "abcdefg",
want: "ABCDEFG",
firstErr: transform.ErrShortDst,
dstSize: 3,
srcSize: minBufSize,
t: Upper(language.Und),
}, {
desc: "und/upper/short src",
src: "123é56",
want: "123É56",
firstErr: transform.ErrShortSrc,
dstSize: 4,
srcSize: 4,
t: Upper(language.Und),
}, {
desc: "und/upper/no error on short",
src: "12",
want: "12",
firstErr: nil,
dstSize: 1,
srcSize: 1,
t: Upper(language.Und),
}, {
desc: "und/lower/short dst",
src: "ABCDEFG",
want: "abcdefg",
firstErr: transform.ErrShortDst,
dstSize: 3,
srcSize: minBufSize,
t: Lower(language.Und),
}, {
desc: "und/lower/short src",
src: "123É56",
want: "123é56",
firstErr: transform.ErrShortSrc,
dstSize: 4,
srcSize: 4,
t: Lower(language.Und),
}, {
desc: "und/lower/no error on short",
src: "12",
want: "12",
firstErr: nil,
dstSize: 1,
srcSize: 1,
t: Lower(language.Und),
}, {
desc: "und/lower/simple (no final sigma)",
src: "ΟΣ ΟΣΣ",
want: "οσ οσσ",
dstSize: minBufSize,
srcSize: minBufSize,
t: Lower(language.Und, HandleFinalSigma(false)),
}, {
desc: "und/title/simple (no final sigma)",
src: "ΟΣ ΟΣΣ",
want: "Οσ Οσσ",
dstSize: minBufSize,
srcSize: minBufSize,
t: Title(language.Und, HandleFinalSigma(false)),
}, {
desc: "und/title/final sigma: no error",
src: "ΟΣ",
want: "Ος",
dstSize: minBufSize,
srcSize: minBufSize,
t: Title(language.Und),
}, {
desc: "und/title/final sigma: short source",
src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
firstErr: transform.ErrShortSrc,
dstSize: minBufSize,
srcSize: 10,
t: Title(language.Und),
}, {
desc: "und/title/final sigma: short destination 1",
src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
firstErr: transform.ErrShortDst,
dstSize: 10,
srcSize: minBufSize,
t: Title(language.Und),
}, {
desc: "und/title/final sigma: short destination 2",
src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
firstErr: transform.ErrShortDst,
dstSize: 9,
srcSize: minBufSize,
t: Title(language.Und),
}, {
desc: "und/title/final sigma: short destination 3",
src: "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
firstErr: transform.ErrShortDst,
dstSize: 8,
srcSize: minBufSize,
t: Title(language.Und),
}, {
desc: "und/title/clipped UTF-8 rune",
src: "σσσσσσσσσσσ",
want: "Σσσσσσσσσσσ",
firstErr: transform.ErrShortSrc,
dstSize: minBufSize,
srcSize: 5,
t: Title(language.Und),
}, {
// The trailing 0xCF byte is an incomplete UTF-8 sequence and is
// expected to be copied through unchanged.
desc: "und/title/clipped UTF-8 rune atEOF",
src: "σσσ" + string([]byte{0xCF}),
want: "Σσσ" + string([]byte{0xCF}),
dstSize: minBufSize,
srcSize: minBufSize,
t: Title(language.Und),
}, {
// With exactly maxIgnorable dots between the sigma and the next letter
// the sigma is expected in its non-final form.
desc: "und/title/final sigma: max ignorables",
src: "ΟΣ" + strings.Repeat(".", maxIgnorable) + "a",
want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
dstSize: minBufSize,
srcSize: minBufSize,
t: Title(language.Und),
}, {
// Same shape as the final sigma cases, but without a sigma in the
// input: "AA", then maxIgnorable+1 dots, then "a".
desc: "und/title/long string",
src: "AA" + strings.Repeat(".", maxIgnorable+1) + "a",
want: "Aa" + strings.Repeat(".", maxIgnorable+1) + "A",
dstSize: minBufSize,
srcSize: len("AA" + strings.Repeat(".", maxIgnorable+1)),
t: Title(language.Und),
}, {
// Note: the choice to change the final sigma at the end in case of
// too many case ignorables is arbitrary. The main reason for this
// choice is that it results in simpler code.
desc: "und/title/final sigma: too many ignorables",
src: "ΟΣ" + strings.Repeat(".", maxIgnorable+1) + "a",
want: "Ος" + strings.Repeat(".", maxIgnorable+1) + "A",
dstSize: minBufSize,
srcSize: len("ΟΣ" + strings.Repeat(".", maxIgnorable+1)),
t: Title(language.Und),
}, {
desc: "und/title/final sigma: apostrophe",
src: "ΟΣ''a",
want: "Οσ''A",
dstSize: minBufSize,
srcSize: minBufSize,
t: Title(language.Und),
}, {
desc: "el/upper/max ignorables",
src: "ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
want: "Ο" + strings.Repeat("\u0321", maxIgnorable-1),
dstSize: minBufSize,
srcSize: minBufSize,
t: Upper(language.Greek),
}, {
desc: "el/upper/too many ignorables",
src: "ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
want: "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
dstSize: minBufSize,
srcSize: len("ο" + strings.Repeat("\u0321", maxIgnorable)),
t: Upper(language.Greek),
}, {
desc: "el/upper/short dst",
src: "123ο",
want: "123Ο",
firstErr: transform.ErrShortDst,
dstSize: 3,
srcSize: minBufSize,
t: Upper(language.Greek),
}, {
desc: "lt/lower/max ignorables",
src: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
dstSize: minBufSize,
srcSize: minBufSize,
t: Lower(language.Lithuanian),
}, {
desc: "lt/lower/too many ignorables",
src: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
want: "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
dstSize: minBufSize,
srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
t: Lower(language.Lithuanian),
}, {
desc: "lt/lower/decomposition with short dst buffer 1",
src: "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
firstErr: transform.ErrShortDst,
want: "aaaaai\u0307\u0300",
dstSize: 5,
srcSize: minBufSize,
t: Lower(language.Lithuanian),
}, {
desc: "lt/lower/decomposition with short dst buffer 2",
src: "aaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
firstErr: transform.ErrShortDst,
want: "aaaai\u0307\u0300",
dstSize: 5,
srcSize: minBufSize,
t: Lower(language.Lithuanian),
}, {
desc: "lt/upper/max ignorables",
src: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
want: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
dstSize: minBufSize,
srcSize: minBufSize,
t: Upper(language.Lithuanian),
}, {
desc: "lt/upper/too many ignorables",
src: "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
want: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
dstSize: minBufSize,
srcSize: len("i" + strings.Repeat("\u0321", maxIgnorable)),
t: Upper(language.Lithuanian),
}, {
desc: "lt/upper/short dst",
src: "12i\u0307\u0300",
want: "12\u00cc",
firstErr: transform.ErrShortDst,
dstSize: 3,
srcSize: minBufSize,
t: Upper(language.Lithuanian),
}, {
desc: "aztr/lower/max ignorables",
src: "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
dstSize: minBufSize,
srcSize: minBufSize,
t: Lower(language.Turkish),
}, {
desc: "aztr/lower/too many ignorables",
src: "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
want: "\u0131" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
dstSize: minBufSize,
srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
t: Lower(language.Turkish),
}, {
desc: "nl/title/pre-IJ cutoff",
src: " ij",
want: " IJ",
firstErr: transform.ErrShortDst,
dstSize: 2,
srcSize: minBufSize,
t: Title(language.Dutch),
}, {
desc: "nl/title/mid-IJ cutoff",
src: " ij",
want: " IJ",
firstErr: transform.ErrShortDst,
dstSize: 3,
srcSize: minBufSize,
t: Title(language.Dutch),
}, {
desc: "af/title/apostrophe",
src: "'n bietje",
want: "'n Bietje",
firstErr: transform.ErrShortDst,
dstSize: 3,
srcSize: minBufSize,
t: Title(language.Afrikaans),
}}
}
// TestShortBuffersAndOverflow feeds each bufferTests entry to its transformer
// in chunks of at most srcSize source bytes, using a destination buffer of
// dstSize bytes, and collects the output of every call.
//
// The error of a call made at source offset 0 must equal firstErr, and the
// collected output must equal want. Each entry is also passed to testHandover.
func TestShortBuffersAndOverflow(t *testing.T) {
	for i, tt := range bufferTests {
		testtext.Run(t, tt.desc, func(t *testing.T) {
			var (
				nDst, nSrc int
				err        error
			)
			chunk := make([]byte, tt.dstSize)
			out := []byte{}

			// Advance by however many source bytes the last call consumed.
			for pos := 0; pos < len(tt.src); pos += nSrc {
				end := pos + tt.srcSize
				if end > len(tt.src) {
					end = len(tt.src)
				}
				atEOF := end == len(tt.src)

				nDst, nSrc, err = tt.t.Transform(chunk, []byte(tt.src[pos:end]), atEOF)
				out = append(out, chunk[:nDst]...)

				if pos == 0 && err != tt.firstErr {
					t.Errorf("%d:%s:\n error was %v; want %v", i, tt.desc, err, tt.firstErr)
					break
				}
			}

			if tt.want != string(out) {
				t.Errorf("%d:%s:\ngot %+q;\nwant %+q", i, tt.desc, out, tt.want)
			}
			testHandover(t, Caser{tt.t}, tt.src)
		})
	}
}
// TestSpan checks the Span method of the casers.
//
// For every case it first calls Span (atEOF false) on each rune-aligned
// prefix of src that is shorter than want; the only errors accepted there are
// nil and transform.ErrShortSrc. It then calls Span on the whole of src with
// the case's atEOF value and requires it to return len(want) and err. Note
// that want is only used for its length. Finally each case is passed to
// testHandover.
func TestSpan(t *testing.T) {
	for _, tt := range []struct {
		desc  string
		src   string
		want  string
		atEOF bool
		err   error
		t     Caser
	}{{
		desc:  "und/upper/basic",
		src:   "abcdefg",
		want:  "",
		atEOF: true,
		err:   transform.ErrEndOfSpan,
		t:     Upper(language.Und),
	}, {
		desc:  "und/upper/short src",
		src:   "123É"[:4],
		want:  "123",
		atEOF: false,
		err:   transform.ErrShortSrc,
		t:     Upper(language.Und),
	}, {
		desc:  "und/upper/no error on short",
		src:   "12",
		want:  "12",
		atEOF: false,
		t:     Upper(language.Und),
	}, {
		desc:  "und/lower/basic",
		src:   "ABCDEFG",
		want:  "",
		atEOF: true,
		err:   transform.ErrEndOfSpan,
		t:     Lower(language.Und),
	}, {
		desc:  "und/lower/short src num",
		src:   "123é"[:4],
		want:  "123",
		atEOF: false,
		err:   transform.ErrShortSrc,
		t:     Lower(language.Und),
	}, {
		desc:  "und/lower/short src greek",
		src:   "αβγé"[:7],
		want:  "αβγ",
		atEOF: false,
		err:   transform.ErrShortSrc,
		t:     Lower(language.Und),
	}, {
		desc:  "und/lower/no error on short",
		src:   "12",
		want:  "12",
		atEOF: false,
		t:     Lower(language.Und),
	}, {
		desc:  "und/lower/simple (no final sigma)",
		src:   "ος οσσ",
		want:  "οσ οσσ",
		atEOF: true,
		t:     Lower(language.Und, HandleFinalSigma(false)),
	}, {
		desc:  "und/title/simple (no final sigma)",
		src:   "Οσ Οσσ",
		want:  "Οσ Οσσ",
		atEOF: true,
		t:     Title(language.Und, HandleFinalSigma(false)),
	}, {
		desc: "und/lower/final sigma: no error",
		src:  "οΣ", // Oς
		want: "ο",  // Oς
		err:  transform.ErrEndOfSpan,
		t:    Lower(language.Und),
	}, {
		desc: "und/title/final sigma: no error",
		src:  "ΟΣ", // Oς
		want: "Ο",  // Oς
		err:  transform.ErrEndOfSpan,
		t:    Title(language.Und),
	}, {
		desc: "und/title/final sigma: no short source!",
		src:  "ΟσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσΣ",
		want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσ",
		err:  transform.ErrEndOfSpan,
		t:    Title(language.Und),
	}, {
		desc:  "und/title/clipped UTF-8 rune",
		src:   "Σσ" + string([]byte{0xCF}),
		want:  "Σσ",
		atEOF: false,
		err:   transform.ErrShortSrc,
		t:     Title(language.Und),
	}, {
		desc:  "und/title/clipped UTF-8 rune atEOF",
		src:   "Σσσ" + string([]byte{0xCF}),
		want:  "Σσσ" + string([]byte{0xCF}),
		atEOF: true,
		t:     Title(language.Und),
	}, {
		// An already title-cased word of maxIgnorable+6 letters; no sigma
		// is involved.
		desc: "und/title/long string",
		src:  "A" + strings.Repeat("a", maxIgnorable+5),
		want: "A" + strings.Repeat("a", maxIgnorable+5),
		t:    Title(language.Und),
	}, {
		// Already title-cased Cyrillic text, final sigma handling disabled.
		desc:  "und/title/cyrillic",
		src:   "При",
		want:  "При",
		atEOF: true,
		t:     Title(language.Und, HandleFinalSigma(false)),
	}, {
		// A non-final sigma followed by exactly maxIgnorable dots and
		// another letter is expected to be spanned in full.
		desc: "und/title/final sigma: max ignorables",
		src:  "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
		want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
		t:    Title(language.Und),
	}, {
		desc: "el/upper/max ignorables - not implemented",
		src:  "Ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Upper(language.Greek),
	}, {
		desc: "el/upper/too many ignorables - not implemented",
		src:  "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Upper(language.Greek),
	}, {
		desc: "el/upper/short dst",
		src:  "123ο",
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Upper(language.Greek),
	}, {
		desc: "lt/lower/max ignorables",
		src:  "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
		want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
		t:    Lower(language.Lithuanian),
	}, {
		desc: "lt/lower/isLower",
		src:  "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Lower(language.Lithuanian),
	}, {
		desc: "lt/lower/not identical",
		src:  "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
		err:  transform.ErrEndOfSpan,
		want: "aaaaa",
		t:    Lower(language.Lithuanian),
	}, {
		desc: "lt/lower/identical",
		src:  "aaaai\u0307\u0300", // "i" followed by U+0307 and U+0300
		want: "aaaai\u0307\u0300",
		t:    Lower(language.Lithuanian),
	}, {
		desc: "lt/upper/not implemented",
		src:  "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Upper(language.Lithuanian),
	}, {
		desc: "lt/upper/not implemented, ascii",
		src:  "AB",
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Upper(language.Lithuanian),
	}, {
		desc: "nl/title/pre-IJ cutoff",
		src:  " IJ",
		want: " IJ",
		t:    Title(language.Dutch),
	}, {
		desc: "nl/title/mid-IJ cutoff",
		src:  " Ia",
		want: " Ia",
		t:    Title(language.Dutch),
	}, {
		desc: "af/title/apostrophe",
		src:  "'n Bietje",
		want: "'n Bietje",
		t:    Title(language.Afrikaans),
	}, {
		desc: "af/title/apostrophe-incorrect",
		src:  "'N Bietje",
		// The Single_Quote (a MidWord), needs to be retained as unspanned so
		// that a successive call to Transform can detect that N should not be
		// capitalized.
		want: "",
		err:  transform.ErrEndOfSpan,
		t:    Title(language.Afrikaans),
	}} {
		testtext.Run(t, tt.desc, func(t *testing.T) {
			// Try every rune-aligned prefix of the part that is expected to
			// be spanned. The position is also bounded by len(tt.src) so that
			// an inconsistent table entry cannot cause an out-of-range slice.
			for p := 0; p < len(tt.want) && p < len(tt.src); {
				tt.t.Reset()
				n, err := tt.t.Span([]byte(tt.src[:p]), false)
				if err != nil && err != transform.ErrShortSrc {
					t.Errorf("early failure:Span(%+q): %v (%d < %d)", tt.src[:p], err, n, len(tt.want))
					break
				}
				// Decode just the next rune instead of converting the whole
				// remainder. An invalid byte advances by exactly one byte.
				_, size := utf8.DecodeRuneInString(tt.src[p:])
				p += size
			}
			tt.t.Reset()
			n, err := tt.t.Span([]byte(tt.src), tt.atEOF)
			if n != len(tt.want) || err != tt.err {
				t.Errorf("Span(%+q, %v): got %d, %v; want %d, %v", tt.src, tt.atEOF, n, err, len(tt.want), tt.err)
			}
			testHandover(t, tt.t, tt.src)
		})
	}
}
// txtASCII is an ASCII-only sample text: one English sentence repeated 50
// times. It is used as benchmark input.
var txtASCII = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 50)
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. Nếu bạn sử
dụng, chuyển đổi, hoặc xây dựng dự án từ nội dung được chia sẻ này, bạn phải áp
dụng giấy phép này hoặc một giấy phép khác có các điều khoản tương tự như giấy
phép này cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào trên đây
cũng có thể được miễn bỏ nếu bạn được sự cho phép của người sở hữu bản quyền.
Phạm vi công chúng — Khi tác phẩm hoặc bất kỳ chương nào của tác phẩm đã trong
vùng dành cho công chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий: Attribution — Вы
должны атрибутировать произведение (указывать автора и источник) в порядке,
предусмотренном автором или лицензиаром (но только так, чтобы никоим образом не
подразумевалось, что они поддерживают вас или использование вами данного
произведения). Υπό τις ακόλουθες προϋποθέσεις:`
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με
τον τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια (χωρίς
όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή τη χρήση του έργου
από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε, τροποποιήσετε ή δημιουργήσετε
περαιτέρω βασισμένοι στο έργο θα μπορείτε να διανέμετε το έργο που θα προκύψει
μόνο με την ίδια ή παρόμοια άδεια.`
// txtNonASCII is the concatenation of the Vietnamese, Chinese, Russian and
// Greek sample texts above. It is used as non-ASCII input by TestAlloc and
// BenchmarkCasers.
const txtNonASCII = txt_vn + txt_cn + txt_ru + txt_gr
// BenchmarkCasers measures the casers of this package on an ASCII text, a
// non-ASCII text and a short word. For comparison it also measures the case
// mappings of the bytes package on the same inputs.
//
// For every caser the Transform method is benchmarked; for all casers except
// the "/simple" variants, Span is benchmarked as well on text that has
// already been cased by that caser.
//
// TODO: Improve ASCII performance.
func BenchmarkCasers(b *testing.B) {
	inputs := []struct{ name, text string }{
		{"ascii", txtASCII},
		{"nonASCII", txtNonASCII},
		{"short", "При"},
	}
	// Case mappings from the bytes package, measured for comparison.
	bytesMappings := []struct {
		name string
		fn   func(b []byte) []byte
	}{
		{"lower", bytes.ToLower},
		{"title", bytes.ToTitle},
		{"upper", bytes.ToUpper},
	}

	for _, in := range inputs {
		src := []byte(in.text)

		for _, m := range bytesMappings {
			testtext.Bench(b, path.Join(in.name, "bytes", m.name), func(b *testing.B) {
				b.SetBytes(int64(len(src)))
				for n := 0; n < b.N; n++ {
					m.fn(src)
				}
			})
		}

		casers := []struct {
			name  string
			caser transform.SpanningTransformer
		}{
			{"fold/default", Fold()},
			{"upper/default", Upper(language.Und)},
			{"lower/sigma", Lower(language.Und)},
			{"lower/simple", Lower(language.Und, HandleFinalSigma(false))},
			{"title/sigma", Title(language.Und)},
			{"title/simple", Title(language.Und, HandleFinalSigma(false))},
		}
		for _, tc := range casers {
			c := Caser{tc.caser}
			dst := make([]byte, len(src))

			testtext.Bench(b, path.Join(in.name, tc.name, "transform"), func(b *testing.B) {
				b.SetBytes(int64(len(src)))
				for n := 0; n < b.N; n++ {
					c.Reset()
					c.Transform(dst, src, true)
				}
			})

			// No need to check span for simple cases, as they will be the same
			// as sigma.
			if strings.HasSuffix(tc.name, "/simple") {
				continue
			}

			// Span is measured on text that is already in the target case, so
			// that the whole input is expected to be spanned.
			spanSrc := c.Bytes(src)
			testtext.Bench(b, path.Join(in.name, tc.name, "span"), func(b *testing.B) {
				c.Reset()
				if n, _ := c.Span(spanSrc, true); n < len(spanSrc) {
					b.Fatalf("spanner is not recognizing text %q as done (at %d)", spanSrc, n)
				}
				b.SetBytes(int64(len(spanSrc)))
				for n := 0; n < b.N; n++ {
					c.Reset()
					c.Span(spanSrc, true)
				}
			})
		}
	}
}