unicode/norm: implement Spanner interface Change-Id: I5e252a2d4dd82ccd9d2a3cfb62cf6916a488dc53 Reviewed-on: https://go-review.googlesource.com/28132 Run-TryBot: Marcel van Lohuizen <mpvl@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Nigel Tao <nigeltao@golang.org>
diff --git a/unicode/norm/normalize.go b/unicode/norm/normalize.go index eb5596f..bba8ce9 100644 --- a/unicode/norm/normalize.go +++ b/unicode/norm/normalize.go
@@ -8,7 +8,11 @@ // Package norm contains types and functions for normalizing Unicode strings. package norm // import "golang.org/x/text/unicode/norm" -import "unicode/utf8" +import ( + "unicode/utf8" + + "golang.org/x/text/transform" +) // A Form denotes a canonical representation of Unicode code points. // The Unicode-defined normalization and equivalence forms are: @@ -263,6 +267,34 @@ return n } +// Span implements transform.SpanningTransformer. It returns a boundary n such +// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n. +func (f Form) Span(b []byte, atEOF bool) (n int, err error) { + n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) + if n < len(b) { + if !ok { + err = transform.ErrEndOfSpan + } else { + err = transform.ErrShortSrc + } + } + return n, err +} + +// SpanString returns a boundary n such that s[0:n] == f(s[0:n]). +// It is not guaranteed to return the largest such n. +func (f Form) SpanString(s string, atEOF bool) (n int, err error) { + n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) + if n < len(s) { + if !ok { + err = transform.ErrEndOfSpan + } else { + err = transform.ErrShortSrc + } + } + return n, err +} + // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and // whether any non-normalized parts were found. If atEOF is false, n will // not point past the last segment if this segment might be become @@ -321,7 +353,7 @@ return lastSegStart, false } -// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]). +// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]). // It is not guaranteed to return the largest such n. func (f Form) QuickSpanString(s string) int { n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
diff --git a/unicode/norm/normalize_test.go b/unicode/norm/normalize_test.go index 6e92abb..04810e7 100644 --- a/unicode/norm/normalize_test.go +++ b/unicode/norm/normalize_test.go
@@ -13,6 +13,9 @@ "strings" "testing" "unicode/utf8" + + "golang.org/x/text/internal/testtext" + "golang.org/x/text/transform" ) var ( @@ -368,94 +371,122 @@ runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests) } -var quickSpanTests = []PositionTest{ - {"", 0, ""}, +type spanTest struct { + input string + atEOF bool + n int + err error +} + +var quickSpanTests = []spanTest{ + {"", true, 0, nil}, // starters - {"a", 1, ""}, - {"abc", 3, ""}, - {"\u043Eb", 3, ""}, + {"a", true, 1, nil}, + {"abc", true, 3, nil}, + {"\u043Eb", true, 3, nil}, // incomplete last rune. - {"\xCC", 1, ""}, - {"a\xCC", 2, ""}, + {"\xCC", true, 1, nil}, + {"\xCC", false, 0, transform.ErrShortSrc}, + {"a\xCC", true, 2, nil}, + {"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD // incorrectly ordered combining characters - {"\u0300\u0316", 0, ""}, - {"\u0300\u0316cd", 0, ""}, + {"\u0300\u0316", true, 0, transform.ErrEndOfSpan}, + {"\u0300\u0316", false, 0, transform.ErrEndOfSpan}, + {"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan}, + {"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan}, // have a maximum number of combining characters. - {rep(0x035D, 30) + "\u035B", 0, ""}, - {"a" + rep(0x035D, 30) + "\u035B", 0, ""}, - {"Ɵ" + rep(0x035D, 30) + "\u035B", 0, ""}, - {"aa" + rep(0x035D, 30) + "\u035B", 1, ""}, - {rep(0x035D, 30) + cgj + "\u035B", 64, ""}, - {"a" + rep(0x035D, 30) + cgj + "\u035B", 65, ""}, - {"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", 66, ""}, - {"aa" + rep(0x035D, 30) + cgj + "\u035B", 66, ""}, + {rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan}, + {"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan}, + {"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan}, + {"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan}, + {rep(0x035D, 30) + cgj + "\u035B", true, 64, nil}, + {"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil}, + {"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil}, + {"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil}, + + {"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc}, + {"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc}, + {"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc}, } -var quickSpanNFDTests = []PositionTest{ +var quickSpanNFDTests = []spanTest{ // needs decomposing - {"\u00C0", 0, ""}, - {"abc\u00C0", 3, ""}, + {"\u00C0", true, 0, transform.ErrEndOfSpan}, + {"abc\u00C0", true, 3, transform.ErrEndOfSpan}, // correctly ordered combining characters - {"\u0300", 2, ""}, - {"ab\u0300", 4, ""}, - {"ab\u0300cd", 6, ""}, - {"\u0300cd", 4, ""}, - {"\u0316\u0300", 4, ""}, - {"ab\u0316\u0300", 6, ""}, - {"ab\u0316\u0300cd", 8, ""}, - {"ab\u0316\u0300\u00C0", 6, ""}, - {"\u0316\u0300cd", 6, ""}, - {"\u043E\u0308b", 5, ""}, + {"\u0300", true, 2, nil}, + {"ab\u0300", true, 4, nil}, + {"ab\u0300cd", true, 6, nil}, + {"\u0300cd", true, 4, nil}, + {"\u0316\u0300", true, 4, nil}, + {"ab\u0316\u0300", true, 6, nil}, + {"ab\u0316\u0300cd", true, 8, nil}, + {"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan}, + {"\u0316\u0300cd", true, 6, nil}, + {"\u043E\u0308b", true, 5, nil}, // incorrectly ordered combining characters - {"ab\u0300\u0316", 1, ""}, // TODO: we could skip 'b' as well. - {"ab\u0300\u0316cd", 1, ""}, + {"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well. + {"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan}, // Hangul - {"같은", 0, ""}, + {"같은", true, 0, transform.ErrEndOfSpan}, } -var quickSpanNFCTests = []PositionTest{ +var quickSpanNFCTests = []spanTest{ // okay composed - {"\u00C0", 2, ""}, - {"abc\u00C0", 5, ""}, + {"\u00C0", true, 2, nil}, + {"abc\u00C0", true, 5, nil}, // correctly ordered combining characters - {"ab\u0300", 1, ""}, - {"ab\u0300cd", 1, ""}, - {"ab\u0316\u0300", 1, ""}, - {"ab\u0316\u0300cd", 1, ""}, - {"\u00C0\u035D", 4, ""}, + // TODO: b may combine with modifiers, which is why this fails. We could + // make a more precise test that that actually checks whether last + // characters combines. Probably not worth it. + {"ab\u0300", true, 1, transform.ErrEndOfSpan}, + {"ab\u0300cd", true, 1, transform.ErrEndOfSpan}, + {"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan}, + {"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan}, + {"\u00C0\u035D", true, 4, nil}, // we do not special case leading combining characters - {"\u0300cd", 0, ""}, - {"\u0300", 0, ""}, - {"\u0316\u0300", 0, ""}, - {"\u0316\u0300cd", 0, ""}, + {"\u0300cd", true, 0, transform.ErrEndOfSpan}, + {"\u0300", true, 0, transform.ErrEndOfSpan}, + {"\u0316\u0300", true, 0, transform.ErrEndOfSpan}, + {"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan}, // incorrectly ordered combining characters - {"ab\u0300\u0316", 1, ""}, - {"ab\u0300\u0316cd", 1, ""}, + {"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, + {"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan}, // Hangul - {"같은", 6, ""}, + {"같은", true, 6, nil}, + {"같은", false, 3, transform.ErrShortSrc}, // We return the start of the violating segment in case of overflow. - {grave(30) + "\uff9e", 0, ""}, - {grave(30), 0, ""}, + {grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan}, + {grave(30), true, 0, transform.ErrEndOfSpan}, } -func doQuickSpan(rb *reorderBuffer, s string) (int, []byte) { - return rb.f.form.QuickSpan([]byte(s)), nil +func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) { + for i, tc := range testCases { + s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF) + ok := testtext.Run(t, s, func(t *testing.T) { + n, err := f.Span([]byte(tc.input), tc.atEOF) + if n != tc.n || err != tc.err { + t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err) + } + }) + if !ok { + continue // Don't do the String variant if the Bytes variant failed. + } + s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF) + testtext.Run(t, s, func(t *testing.T) { + n, err := f.SpanString(tc.input, tc.atEOF) + if n != tc.n || err != tc.err { + t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err) + } + }) + } } -func doQuickSpanString(rb *reorderBuffer, s string) (int, []byte) { - return rb.f.form.QuickSpanString(s), nil -} - -func TestQuickSpan(t *testing.T) { - runPosTests(t, "TestQuickSpanNFD1", NFD, doQuickSpan, quickSpanTests) - runPosTests(t, "TestQuickSpanNFD2", NFD, doQuickSpan, quickSpanNFDTests) - runPosTests(t, "TestQuickSpanNFC1", NFC, doQuickSpan, quickSpanTests) - runPosTests(t, "TestQuickSpanNFC2", NFC, doQuickSpan, quickSpanNFCTests) - - runPosTests(t, "TestQuickSpanStringNFD1", NFD, doQuickSpanString, quickSpanTests) - runPosTests(t, "TestQuickSpanStringNFD2", NFD, doQuickSpanString, quickSpanNFDTests) - runPosTests(t, "TestQuickSpanStringNFC1", NFC, doQuickSpanString, quickSpanTests) - runPosTests(t, "TestQuickSpanStringNFC2", NFC, doQuickSpanString, quickSpanNFCTests) +func TestSpan(t *testing.T) { + runSpanTests(t, "NFD", NFD, quickSpanTests) + runSpanTests(t, "NFD", NFD, quickSpanNFDTests) + runSpanTests(t, "NFC", NFC, quickSpanTests) + runSpanTests(t, "NFC", NFC, quickSpanNFCTests) } var isNormalTests = []PositionTest{