unicode/norm: fix counting of modifiers for multi-segment decompositions In some rare cases the counting was mashed, leading to a crash or miscounting of modifiers. Code now relies more on streamSafe type to find segments than before. Also fixed some comments. Change-Id: If10cbae44402cf9d479e56e74acc2c9717fed984 Reviewed-on: https://go-review.googlesource.com/40454 Run-TryBot: Marcel van Lohuizen <mpvl@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Nigel Tao <nigeltao@golang.org>

commit: 290cc5ca29f7882d501c79148c59b26804e51916 [log] [tgz]
author: Marcel van Lohuizen <mpvl@golang.org> Wed Apr 12 22:15:11 2017 +0200
committer: Marcel van Lohuizen <mpvl@golang.org> Thu Apr 20 07:15:48 2017 +0000
tree: a28f0cbdf979673b52c410440c77197bf4b84578
parent: 17ea456e8dcdb7e743623cf6814865f98f01922c [diff]
diff --git a/unicode/norm/composition.go b/unicode/norm/composition.go
index d17b278..bab4c5d 100644
--- a/unicode/norm/composition.go
+++ b/unicode/norm/composition.go

@@ -33,17 +33,9 @@
 // streamSafe implements the policy of when a CGJ should be inserted.
 type streamSafe uint8
 
-// mkStreamSafe is a shorthand for declaring a streamSafe var and calling
-// first on it.
-func mkStreamSafe(p Properties) streamSafe {
-	return streamSafe(p.nTrailingNonStarters())
-}
-
-// first inserts the first rune of a segment.
+// first inserts the first rune of a segment. It is a faster version of next if
+// it is known p represents the first rune in a segment.
 func (ss *streamSafe) first(p Properties) {
-	if *ss != 0 {
-		panic("!= 0")
-	}
 	*ss = streamSafe(p.nTrailingNonStarters())
 }
 
@@ -66,7 +58,7 @@
 	// be a non-starter. Note that it always hold that if nLead > 0 then
 	// nLead == nTrail.
 	if n == 0 {
-		*ss = 0
+		*ss = streamSafe(p.nTrailingNonStarters())
 		return ssStarter
 	}
 	return ssSuccess
@@ -142,7 +134,6 @@
 func (rb *reorderBuffer) reset() {
 	rb.nrune = 0
 	rb.nbyte = 0
-	rb.ss = 0
 }
 
 func (rb *reorderBuffer) doFlush() bool {
@@ -257,6 +248,9 @@
 // It flushes the buffer on each new segment start.
 func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
 	rb.tmpBytes.setBytes(dcomp)
+	// As the streamSafe accounting already handles the counting for modifiers,
+	// we don't have to call next. However, we do need to keep the accounting
+	// intact when flushing the buffer.
 	for i := 0; i < len(dcomp); {
 		info := rb.f.info(rb.tmpBytes, i)
 		if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {

diff --git a/unicode/norm/iter.go b/unicode/norm/iter.go
index 0a42a72..ce17f96 100644
--- a/unicode/norm/iter.go
+++ b/unicode/norm/iter.go

@@ -41,6 +41,7 @@
 	i.next = i.rb.f.nextMain
 	i.asciiF = nextASCIIBytes
 	i.info = i.rb.f.info(i.rb.src, i.p)
+	i.rb.ss.first(i.info)
 }
 
 // InitString initializes i to iterate over src after normalizing it to Form f.
@@ -56,11 +57,12 @@
 	i.next = i.rb.f.nextMain
 	i.asciiF = nextASCIIString
 	i.info = i.rb.f.info(i.rb.src, i.p)
+	i.rb.ss.first(i.info)
 }
 
 // Seek sets the segment to be returned by the next call to Next to start
 // at position p.  It is the responsibility of the caller to set p to the
-// start of a UTF8 rune.
+// start of a segment.
 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
 	var abs int64
 	switch whence {
@@ -84,6 +86,7 @@
 	i.multiSeg = nil
 	i.next = i.rb.f.nextMain
 	i.info = i.rb.f.info(i.rb.src, i.p)
+	i.rb.ss.first(i.info)
 	return abs, nil
 }
 
@@ -161,6 +164,7 @@
 	if next >= i.rb.nsrc {
 		i.setDone()
 	} else if i.rb.src.hangul(next) == 0 {
+		i.rb.ss.next(i.info)
 		i.info = i.rb.f.info(i.rb.src, i.p)
 		i.next = i.rb.f.nextMain
 		return i.next(i)
@@ -204,12 +208,10 @@
 		if info.BoundaryBefore() {
 			i.rb.compose()
 			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
-			i.rb.ss.first(info)
 			i.rb.insertUnsafe(input{bytes: d}, j, info)
 			i.multiSeg = d[j+int(info.size):]
 			return seg
 		}
-		i.rb.ss.next(info)
 		i.rb.insertUnsafe(input{bytes: d}, j, info)
 		j += int(info.size)
 	}
@@ -222,9 +224,9 @@
 func nextDecomposed(i *Iter) (next []byte) {
 	outp := 0
 	inCopyStart, outCopyStart := i.p, 0
-	ss := mkStreamSafe(i.info)
 	for {
 		if sz := int(i.info.size); sz <= 1 {
+			i.rb.ss = 0
 			p := i.p
 			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
 			if i.p >= i.rb.nsrc {
@@ -243,6 +245,8 @@
 			p := outp + len(d)
 			if outp > 0 {
 				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
+				// TODO: this condition should not be possible, but we leave it
+				// in for defensive purposes.
 				if p > len(i.buf) {
 					return i.buf[:outp]
 				}
@@ -266,7 +270,7 @@
 			} else {
 				i.info = i.rb.f.info(i.rb.src, i.p)
 			}
-			switch ss.next(i.info) {
+			switch i.rb.ss.next(i.info) {
 			case ssOverflow:
 				i.next = nextCGJDecompose
 				fallthrough
@@ -309,7 +313,7 @@
 		}
 		prevCC := i.info.tccc
 		i.info = i.rb.f.info(i.rb.src, i.p)
-		if v := ss.next(i.info); v == ssStarter {
+		if v := i.rb.ss.next(i.info); v == ssStarter {
 			break
 		} else if v == ssOverflow {
 			i.next = nextCGJDecompose
@@ -335,10 +339,6 @@
 
 func doNormDecomposed(i *Iter) []byte {
 	for {
-		if s := i.rb.ss.next(i.info); s == ssOverflow {
-			i.next = nextCGJDecompose
-			break
-		}
 		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
 		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
 			i.setDone()
@@ -348,6 +348,10 @@
 		if i.info.ccc == 0 {
 			break
 		}
+		if s := i.rb.ss.next(i.info); s == ssOverflow {
+			i.next = nextCGJDecompose
+			break
+		}
 	}
 	// new segment or too many combining characters: exit normalization
 	return i.buf[:i.rb.flushCopy(i.buf[:])]
@@ -357,6 +361,7 @@
 	i.rb.ss = 0
 	i.rb.insertCGJ()
 	i.next = nextDecomposed
+	i.rb.ss.first(i.info)
 	buf := doNormDecomposed(i)
 	return buf
 }
@@ -365,7 +370,6 @@
 func nextComposed(i *Iter) []byte {
 	outp, startp := 0, i.p
 	var prevCC uint8
-	ss := mkStreamSafe(i.info)
 	for {
 		if !i.info.isYesC() {
 			goto doNorm
@@ -385,11 +389,12 @@
 			i.setDone()
 			break
 		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
+			i.rb.ss = 0
 			i.next = i.asciiF
 			break
 		}
 		i.info = i.rb.f.info(i.rb.src, i.p)
-		if v := ss.next(i.info); v == ssStarter {
+		if v := i.rb.ss.next(i.info); v == ssStarter {
 			break
 		} else if v == ssOverflow {
 			i.next = nextCGJCompose
@@ -401,8 +406,10 @@
 	}
 	return i.returnSlice(startp, i.p)
 doNorm:
+	// reset to start position
 	i.p = startp
 	i.info = i.rb.f.info(i.rb.src, i.p)
+	i.rb.ss.first(i.info)
 	if i.info.multiSegment() {
 		d := i.info.Decomposition()
 		info := i.rb.f.info(input{bytes: d}, 0)

diff --git a/unicode/norm/iter_test.go b/unicode/norm/iter_test.go
index e2aa6f2..d95aa30 100644
--- a/unicode/norm/iter_test.go
+++ b/unicode/norm/iter_test.go

@@ -60,7 +60,7 @@
 	{"\ufdfa" + grave(30), []string{"\u0635", "\u0644", "\u0649", " ", "\u0627", "\u0644", "\u0644", "\u0647", " ", "\u0639", "\u0644", "\u064a", "\u0647", " ", "\u0648", "\u0633", "\u0644", "\u0645" + grave(30), ""}},
 	{"\uFDFA" + grave(64), []string{"\u0635", "\u0644", "\u0649", " ", "\u0627", "\u0644", "\u0644", "\u0647", " ", "\u0639", "\u0644", "\u064a", "\u0647", " ", "\u0648", "\u0633", "\u0644", "\u0645" + grave(30), cgj + grave(30), cgj + grave(4), ""}},
 
-	// Hangul and Jamo are grouped togeter.
+	// Hangul and Jamo are grouped together.
 	{"\uAC00", []string{"\u1100\u1161", ""}},
 	{"\uAC01", []string{"\u1100\u1161\u11A8", ""}},
 	{"\u1100\u1161", []string{"\u1100\u1161", ""}},

diff --git a/unicode/norm/normalize.go b/unicode/norm/normalize.go
index d3f2069..e28ac64 100644
--- a/unicode/norm/normalize.go
+++ b/unicode/norm/normalize.go

@@ -324,7 +324,6 @@
 		// have an overflow for runes that are starters (e.g. with U+FF9E).
 		switch ss.next(info) {
 		case ssStarter:
-			ss.first(info)
 			lastSegStart = i
 		case ssOverflow:
 			return lastSegStart, false
@@ -441,6 +440,8 @@
 			}
 			return -1
 		}
+		// TODO: Using streamSafe to determine the boundary isn't the same as
+		// using BoundaryBefore. Determine which should be used.
 		if s := ss.next(info); s != ssSuccess {
 			return i
 		}
@@ -505,15 +506,14 @@
 	if info.size == 0 {
 		return 0
 	}
-	if rb.nrune > 0 {
-		if s := rb.ss.next(info); s == ssStarter {
-			goto end
-		} else if s == ssOverflow {
-			rb.insertCGJ()
+	if s := rb.ss.next(info); s == ssStarter {
+		// TODO: this could be removed if we don't support merging.
+		if rb.nrune > 0 {
 			goto end
 		}
-	} else {
-		rb.ss.first(info)
+	} else if s == ssOverflow {
+		rb.insertCGJ()
+		goto end
 	}
 	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
 		return int(err)

diff --git a/unicode/norm/normalize_test.go b/unicode/norm/normalize_test.go
index ffa1034..69a84bf 100644
--- a/unicode/norm/normalize_test.go
+++ b/unicode/norm/normalize_test.go

@@ -598,35 +598,42 @@
 
 func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
 	for i, test := range tests {
-		if *testn >= 0 && i != *testn {
-			continue
-		}
-		out := []byte(test.left)
-		have := string(fn(f, out, test.right))
-		if len(have) != len(test.out) {
-			t.Errorf("%s.%s:%d: length is %d; want %d (%+q vs %+q)", fstr[f], name, i, len(have), len(test.out), pc(have), pc(test.out))
-		}
-		if have != test.out {
-			k, pf := pidx(have, test.out)
-			t.Errorf("%s.%s:%d: \nwas  %s%+q; \nwant %s%+q", fstr[f], name, i, pf, pc(have[k:]), pf, pc(test.out[k:]))
-		}
+		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
+			id := pc(test.left + test.right)
+			if *testn >= 0 && i != *testn {
+				return
+			}
+			t.Run("fn", func(t *testing.T) {
+				out := []byte(test.left)
+				have := string(fn(f, out, test.right))
+				if len(have) != len(test.out) {
+					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
+				}
+				if have != test.out {
+					k, pf := pidx(have, test.out)
+					t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
+				}
+			})
 
-		// Bootstrap by normalizing input. Ensures that the various variants
-		// behave the same.
-		for g := NFC; g <= NFKD; g++ {
-			if f == g {
-				continue
+			// Bootstrap by normalizing input. Ensures that the various variants
+			// behave the same.
+			for g := NFC; g <= NFKD; g++ {
+				if f == g {
+					continue
+				}
+				t.Run(fstr[g], func(t *testing.T) {
+					want := g.String(test.left + test.right)
+					have := string(fn(g, g.AppendString(nil, test.left), test.right))
+					if len(have) != len(want) {
+						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
+					}
+					if have != want {
+						k, pf := pidx(have, want)
+						t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
+					}
+				})
 			}
-			want := g.String(test.left + test.right)
-			have := string(fn(g, g.AppendString(nil, test.left), test.right))
-			if len(have) != len(want) {
-				t.Errorf("%s(%s.%s):%d: length is %d; want %d (%+q vs %+q)", fstr[g], fstr[f], name, i, len(have), len(want), pc(have), pc(want))
-			}
-			if have != want {
-				k, pf := pidx(have, want)
-				t.Errorf("%s(%s.%s):%d: \nwas  %s%+q; \nwant %s%+q", fstr[g], fstr[f], name, i, pf, pc(have[k:]), pf, pc(want[k:]))
-			}
-		}
+		})
 	}
 }
 
@@ -768,6 +775,16 @@
 	// - Many non-starter decompositions in a row causing overflow.
 	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
 	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},
+
+	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
+	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
+	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
+
+	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
+	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
+	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
+	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},
+
 	// weird UTF-8
 	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
 	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
@@ -780,6 +797,7 @@
 
 	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
 	// large input.
+	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
 	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
 	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
 	{"", "\u0041\u0307\u0304", "\u01E0"},
@@ -848,6 +866,12 @@
 		"\u0300\u0320a" + grave(34) + "\u0320",
 		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
 	},
+	{
+		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
+		"",
+		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
+		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
+	},
 }
 
 func TestAppend(t *testing.T) {
commit	290cc5ca29f7882d501c79148c59b26804e51916	[log] [tgz]
author	Marcel van Lohuizen <mpvl@golang.org>	Wed Apr 12 22:15:11 2017 +0200
committer	Marcel van Lohuizen <mpvl@golang.org>	Thu Apr 20 07:15:48 2017 +0000
tree	a28f0cbdf979673b52c410440c77197bf4b84578
parent	17ea456e8dcdb7e743623cf6814865f98f01922c [diff]