| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package precis |
| |
| import ( |
| "bytes" |
| "fmt" |
| "reflect" |
| "testing" |
| |
| "golang.org/x/text/internal/testtext" |
| "golang.org/x/text/secure/bidirule" |
| "golang.org/x/text/transform" |
| ) |
| |
// testCase is a single enforcement expectation: feeding input to a profile
// should produce output and err.
type testCase struct {
	// input is the string handed to the profile; output is the expected
	// result.
	input, output string
	// err is the expected error value, compared by identity; nil means the
	// call is expected to succeed.
	err error
}
| |
// enforceTestCases is the shared table driven by doTests. Each entry names a
// group, the profile under test, and the cases to run against that profile.
// A case lists the input, the expected output and the expected error; the
// group name, the case index and the quoted input together form the subtest
// name, so reordering rows renames subtests.
var enforceTestCases = []struct {
	name  string
	p     *Profile
	cases []testCase
}{
	{"Basic", NewFreeform(), []testCase{
		{"e\u0301\u031f", "\u00e9\u031f", nil}, // normalize
	}},

	{"Context Rule 1", NewFreeform(), []testCase{
		// Rule 1: zero-width non-joiner (U+200C)
		// From RFC:
		// False
		// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
		// If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
		// (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
		//
		// Example runes for different joining types:
		// Join L: U+A872; PHAGS-PA SUPERFIXED LETTER RA
		// Join D: U+062C; HAH WITH DOT BELOW
		// Join T: U+0610; ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM
		// Join R: U+0627; ALEF
		// Virama: U+0A4D; GURMUKHI SIGN VIRAMA
		// Virama and Join T: U+0ACD; GUJARATI SIGN VIRAMA
		{"\u200c", "", errContext},
		{"\u200ca", "", errContext},
		{"a\u200c", "", errContext},
		{"\u200c\u0627", "", errContext}, // missing JoinStart
		{"\u062c\u200c", "", errContext}, // missing JoinEnd
		{"\u0610\u200c\u0610\u0627", "", errContext}, // missing JoinStart
		{"\u062c\u0610\u200c\u0610", "", errContext}, // missing JoinEnd

		// Variants of: D T* U+200c T* R
		{"\u062c\u200c\u0627", "\u062c\u200c\u0627", nil},
		{"\u062c\u0610\u200c\u0610\u0627", "\u062c\u0610\u200c\u0610\u0627", nil},
		{"\u062c\u0610\u0610\u200c\u0610\u0610\u0627", "\u062c\u0610\u0610\u200c\u0610\u0610\u0627", nil},
		{"\u062c\u0610\u200c\u0627", "\u062c\u0610\u200c\u0627", nil},
		{"\u062c\u200c\u0610\u0627", "\u062c\u200c\u0610\u0627", nil},

		// Variants of: L T* U+200c T* D
		{"\ua872\u200c\u062c", "\ua872\u200c\u062c", nil},
		{"\ua872\u0610\u200c\u0610\u062c", "\ua872\u0610\u200c\u0610\u062c", nil},
		{"\ua872\u0610\u0610\u200c\u0610\u0610\u062c", "\ua872\u0610\u0610\u200c\u0610\u0610\u062c", nil},
		{"\ua872\u0610\u200c\u062c", "\ua872\u0610\u200c\u062c", nil},
		{"\ua872\u200c\u0610\u062c", "\ua872\u200c\u0610\u062c", nil},

		// Virama
		{"\u0a4d\u200c", "\u0a4d\u200c", nil},
		{"\ua872\u0a4d\u200c", "\ua872\u0a4d\u200c", nil},
		{"\ua872\u0a4d\u0610\u200c", "", errContext},
		{"\ua872\u0a4d\u0610\u200c", "", errContext}, // NOTE(review): identical to the previous case; confirm whether a different input was intended.

		{"\u0acd\u200c", "\u0acd\u200c", nil},
		{"\ua872\u0acd\u200c", "\ua872\u0acd\u200c", nil},
		{"\ua872\u0acd\u0610\u200c", "", errContext},
		{"\ua872\u0acd\u0610\u200c", "", errContext}, // NOTE(review): identical to the previous case; confirm whether a different input was intended.

		// Using Virama as join T
		{"\ua872\u0acd\u200c\u062c", "\ua872\u0acd\u200c\u062c", nil},
		{"\ua872\u200c\u0acd\u062c", "\ua872\u200c\u0acd\u062c", nil},
	}},

	{"Context Rule 2", NewFreeform(), []testCase{
		// Rule 2: zero-width joiner (U+200D)
		{"\u200d", "", errContext},
		{"\u200da", "", errContext},
		{"a\u200d", "", errContext},

		{"\u0a4d\u200d", "\u0a4d\u200d", nil},
		{"\ua872\u0a4d\u200d", "\ua872\u0a4d\u200d", nil},
		{"\u0a4da\u200d", "", errContext},
	}},

	{"Context Rule 3", NewFreeform(), []testCase{
		// Rule 3: middle dot
		{"·", "", errContext},
		{"l·", "", errContext},
		{"·l", "", errContext},
		{"a·", "", errContext},
		{"l·a", "", errContext},
		{"a·a", "", errContext},
		{"l·l", "l·l", nil},
		{"al·la", "al·la", nil},
	}},

	{"Context Rule 4", NewFreeform(), []testCase{
		// Rule 4: Greek lower numeral U+0375
		{"͵", "", errContext},
		{"͵a", "", errContext},
		{"α͵", "", errContext},
		{"͵α", "͵α", nil},
		{"α͵α", "α͵α", nil},
		{"͵͵α", "͵͵α", nil}, // The numeric sign is itself Greek.
		{"α͵͵α", "α͵͵α", nil},
		{"α͵͵", "", errContext},
		{"α͵͵a", "", errContext},
	}},

	{"Context Rule 5+6", NewFreeform(), []testCase{
		// Rule 5+6: Hebrew preceding
		// U+05f3: Geresh
		{"׳", "", errContext},
		{"׳ה", "", errContext},
		{"a׳b", "", errContext},
		{"ש׳", "ש׳", nil}, // U+05e9 U+05f3
		{"ש׳׳׳", "ש׳׳׳", nil}, // U+05e9 U+05f3

		// U+05f4: Gershayim
		{"״", "", errContext},
		{"״ה", "", errContext},
		{"a״b", "", errContext},
		{"ש״", "ש״", nil}, // U+05e9 U+05f4
		{"ש״״״", "ש״״״", nil}, // U+05e9 U+05f4
		{"aש״״״", "aש״״״", nil}, // U+05e9 U+05f4
	}},

	{"Context Rule 7", NewFreeform(), []testCase{
		// Rule 7: Katakana middle Dot
		{"・", "", errContext},
		{"abc・", "", errContext},
		{"・def", "", errContext},
		{"abc・def", "", errContext},
		{"aヅc・def", "aヅc・def", nil},
		{"abc・dぶf", "abc・dぶf", nil},
		{"⺐bc・def", "⺐bc・def", nil},
	}},

	{"Context Rule 8+9", NewFreeform(), []testCase{
		// Rule 8+9: Arabic Indic Digit
		{"١٢٣٤٥۶", "", errContext},
		{"۱۲۳۴۵٦", "", errContext},
		{"١٢٣٤٥", "١٢٣٤٥", nil},
		{"۱۲۳۴۵", "۱۲۳۴۵", nil},
	}},

	{"Nickname", Nickname, []testCase{
		{" Swan of Avon ", "Swan of Avon", nil},
		{"", "", errEmptyString},
		{" ", "", errEmptyString},
		{" ", "", errEmptyString}, // NOTE(review): identical to the previous case; confirm whether a longer run of spaces was intended.
		{"a\u00A0a\u1680a\u2000a\u2001a\u2002a\u2003a\u2004a\u2005a\u2006a\u2007a\u2008a\u2009a\u200Aa\u202Fa\u205Fa\u3000a", "a a a a a a a a a a a a a a a a a", nil},
		{"Foo", "Foo", nil},
		{"foo", "foo", nil},
		{"Foo Bar", "Foo Bar", nil},
		{"foo bar", "foo bar", nil},
		{"\u03A3", "\u03A3", nil},
		{"\u03C3", "\u03C3", nil},
		// Greek final sigma is left as is (do not fold!)
		{"\u03C2", "\u03C2", nil},
		{"\u265A", "♚", nil},
		{"Richard \u2163", "Richard IV", nil},
		{"\u212B", "Å", nil},
		{"\uFB00", "ff", nil}, // because of NFKC
		{"שa", "שa", nil}, // no bidi rule
		{"동일조건변경허락", "동일조건변경허락", nil},
	}},
	{"OpaqueString", OpaqueString, []testCase{
		{" Swan of Avon ", " Swan of Avon ", nil},
		{"", "", errEmptyString},
		{" ", " ", nil},
		{" ", " ", nil}, // NOTE(review): identical to the previous case; confirm whether a longer run of spaces was intended.
		{"a\u00A0a\u1680a\u2000a\u2001a\u2002a\u2003a\u2004a\u2005a\u2006a\u2007a\u2008a\u2009a\u200Aa\u202Fa\u205Fa\u3000a", "a a a a a a a a a a a a a a a a a", nil},
		{"Foo", "Foo", nil},
		{"foo", "foo", nil},
		{"Foo Bar", "Foo Bar", nil},
		{"foo bar", "foo bar", nil},
		{"\u03C3", "\u03C3", nil},
		{"Richard \u2163", "Richard \u2163", nil},
		{"\u212B", "Å", nil},
		{"Jack of \u2666s", "Jack of \u2666s", nil},
		{"my cat is a \u0009by", "", errDisallowedRune},
		{"שa", "שa", nil}, // no bidi rule
	}},
	{"UsernameCaseMapped", UsernameCaseMapped, []testCase{
		// TODO: Should this work?
		// {UsernameCaseMapped, "", "", errDisallowedRune},
		{"juliet@example.com", "juliet@example.com", nil},
		{"fussball", "fussball", nil},
		{"fu\u00DFball", "fu\u00DFball", nil},
		{"\u03C0", "\u03C0", nil},
		{"\u03A3", "\u03C3", nil},
		{"\u03C3", "\u03C3", nil},
		// Greek final sigma is left as is (do not fold!)
		{"\u03C2", "\u03C2", nil},
		{"\u0049", "\u0069", nil},
		{"\u0049", "\u0069", nil}, // NOTE(review): identical to the previous case; confirm whether a different input was intended.
		{"\u03D2", "", errDisallowedRune},
		{"\u03B0", "\u03B0", nil},
		{"foo bar", "", errDisallowedRune},
		{"♚", "", bidirule.ErrInvalid},
		{"\u007E", "~", nil},
		{"a", "a", nil},
		{"!", "!", nil},
		{"²", "", bidirule.ErrInvalid},
		{"\t", "", errDisallowedRune},
		{"\n", "", errDisallowedRune},
		{"\u26D6", "", bidirule.ErrInvalid},
		{"\u26FF", "", bidirule.ErrInvalid},
		{"\uFB00", "", errDisallowedRune},
		{"\u1680", "", bidirule.ErrInvalid},
		{" ", "", errDisallowedRune},
		{" ", "", errDisallowedRune}, // NOTE(review): identical to the previous case; confirm whether a longer run of spaces was intended.
		{"\u01C5", "", errDisallowedRune},
		{"\u16EE", "", errDisallowedRune}, // Nl RUNIC ARLAUG SYMBOL
		{"\u0488", "", bidirule.ErrInvalid}, // Me COMBINING CYRILLIC HUNDRED THOUSANDS SIGN
		{"\u212B", "\u00e5", nil}, // Angstrom sign; expected result is the lower-case U+00E5
		{"A\u030A", "å", nil}, // A + ring
		{"\u00C5", "å", nil}, // A with ring
		{"\u00E7", "ç", nil}, // c cedille
		{"\u0063\u0327", "ç", nil}, // c + cedille
		{"\u0158", "ř", nil},
		{"\u0052\u030C", "ř", nil},

		{"\u1E61", "\u1E61", nil}, // LATIN SMALL LETTER S WITH DOT ABOVE

		// Confusable characters ARE allowed and should NOT be mapped
		// (apart from the case mapping this profile applies).
		{"\u0410", "\u0430", nil}, // CYRILLIC CAPITAL LETTER A

		// Full width should be mapped to the canonical decomposition.
		// NOTE(review): the input below reads as plain ASCII here; confirm
		// whether full-width letters were intended in the literal.
		{"AB", "ab", nil},
		{"שc", "", bidirule.ErrInvalid}, // bidi rule

	}},
	{"UsernameCasePreserved", UsernameCasePreserved, []testCase{
		{"ABC", "ABC", nil},
		{"AB", "AB", nil}, // NOTE(review): reads as plain ASCII here; confirm whether full-width letters were intended.
		{"שc", "", bidirule.ErrInvalid}, // bidi rule
		{"\uFB00", "", errDisallowedRune},
		{"\u212B", "\u00c5", nil}, // Angstrom sign, NFC -> U+00C5 (case is preserved)
		{"ẛ", "", errDisallowedRune}, // LATIN SMALL LETTER LONG S WITH DOT ABOVE
	}},
}
| |
| func doTests(t *testing.T, fn func(t *testing.T, p *Profile, tc testCase)) { |
| for _, g := range enforceTestCases { |
| for i, tc := range g.cases { |
| name := fmt.Sprintf("%s:%d:%+q", g.name, i, tc.input) |
| testtext.Run(t, name, func(t *testing.T) { |
| fn(t, g.p, tc) |
| }) |
| } |
| } |
| } |
| |
| func TestString(t *testing.T) { |
| doTests(t, func(t *testing.T, p *Profile, tc testCase) { |
| if e, err := p.String(tc.input); tc.err != err || e != tc.output { |
| t.Errorf("got %+q (err: %v); want %+q (err: %v)", e, err, tc.output, tc.err) |
| } |
| }) |
| } |
| |
| func TestBytes(t *testing.T) { |
| doTests(t, func(t *testing.T, p *Profile, tc testCase) { |
| if e, err := p.Bytes([]byte(tc.input)); tc.err != err || string(e) != tc.output { |
| t.Errorf("got %+q (err: %v); want %+q (err: %v)", string(e), err, tc.output, tc.err) |
| } |
| }) |
| // Test that calling Bytes with something that doesn't transform returns a |
| // copy. |
| orig := []byte("hello") |
| b, _ := NewFreeform().Bytes(orig) |
| if reflect.ValueOf(b).Pointer() == reflect.ValueOf(orig).Pointer() { |
| t.Error("original and result are the same slice; should be a copy") |
| } |
| } |
| |
| func TestAppend(t *testing.T) { |
| doTests(t, func(t *testing.T, p *Profile, tc testCase) { |
| if e, err := p.Append(nil, []byte(tc.input)); tc.err != err || string(e) != tc.output { |
| t.Errorf("got %+q (err: %v); want %+q (err: %v)", string(e), err, tc.output, tc.err) |
| } |
| }) |
| } |
| |
| func TestStringMallocs(t *testing.T) { |
| if n := testtext.AllocsPerRun(100, func() { UsernameCaseMapped.String("helloworld") }); n > 0 { |
| // TODO: reduce this to 0. |
| t.Skipf("got %f allocs, want 0", n) |
| } |
| } |
| |
| func TestAppendMallocs(t *testing.T) { |
| str := []byte("helloworld") |
| out := make([]byte, 0, len(str)) |
| if n := testtext.AllocsPerRun(100, func() { UsernameCaseMapped.Append(out, str) }); n > 0 { |
| t.Errorf("got %f allocs, want 0", n) |
| } |
| } |
| |
| func TestTransformMallocs(t *testing.T) { |
| str := []byte("helloworld") |
| out := make([]byte, 0, len(str)) |
| tr := UsernameCaseMapped.NewTransformer() |
| if n := testtext.AllocsPerRun(100, func() { |
| tr.Reset() |
| tr.Transform(out, str, true) |
| }); n > 0 { |
| t.Errorf("got %f allocs, want 0", n) |
| } |
| } |
| |
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
| |
| // TestTransformerShortBuffers tests that the precis.Transformer implements the |
| // spirit, not just the letter (the method signatures), of the |
| // transform.Transformer interface. |
| // |
| // In particular, it tests that, if one or both of the dst or src buffers are |
| // short, so that multiple Transform calls are required to complete the overall |
| // transformation, the end result is identical to one Transform call with |
| // sufficiently long buffers. |
| func TestTransformerShortBuffers(t *testing.T) { |
| srcUnit := []byte("a\u0300cce\u0301nts") // NFD normalization form. |
| wantUnit := []byte("àccénts") // NFC normalization form. |
| src := bytes.Repeat(srcUnit, 16) |
| want := bytes.Repeat(wantUnit, 16) |
| const long = 4096 |
| dst := make([]byte, long) |
| |
| // 5, 7, 9, 11, 13, 16 and 17 are all pair-wise co-prime, which means that |
| // slicing the dst and src buffers into 5, 7, 13 and 17 byte chunks will |
| // fall at different places inside the repeated srcUnit's and wantUnit's. |
| if len(srcUnit) != 11 || len(wantUnit) != 9 || len(src) > long || len(want) > long { |
| t.Fatal("inconsistent lengths") |
| } |
| |
| tr := NewFreeform().NewTransformer() |
| for _, deltaD := range []int{5, 7, 13, 17, long} { |
| loop: |
| for _, deltaS := range []int{5, 7, 13, 17, long} { |
| tr.Reset() |
| d0 := 0 |
| s0 := 0 |
| for { |
| d1 := min(len(dst), d0+deltaD) |
| s1 := min(len(src), s0+deltaS) |
| nDst, nSrc, err := tr.Transform(dst[d0:d1:d1], src[s0:s1:s1], s1 == len(src)) |
| d0 += nDst |
| s0 += nSrc |
| if err == nil { |
| break |
| } |
| if err == transform.ErrShortDst || err == transform.ErrShortSrc { |
| continue |
| } |
| t.Errorf("deltaD=%d, deltaS=%d: %v", deltaD, deltaS, err) |
| continue loop |
| } |
| if s0 != len(src) { |
| t.Errorf("deltaD=%d, deltaS=%d: s0: got %d, want %d", deltaD, deltaS, s0, len(src)) |
| continue |
| } |
| if d0 != len(want) { |
| t.Errorf("deltaD=%d, deltaS=%d: d0: got %d, want %d", deltaD, deltaS, d0, len(want)) |
| continue |
| } |
| got := dst[:d0] |
| if !bytes.Equal(got, want) { |
| t.Errorf("deltaD=%d, deltaS=%d:\ngot %q\nwant %q", deltaD, deltaS, got, want) |
| continue |
| } |
| } |
| } |
| } |