all: make Unicode surrogate halves illegal as UTF-8
Surrogate halves are part of UTF-16 and should never appear in UTF-8.
(The rune that two combined halves represent in UTF-16 should
be encoded directly.)
Encoding: encode as RuneError.
Decoding: convert to RuneError, consume one byte.
This requires changing:
package unicode/utf8
runtime for range over string
Also added utf8.ValidRune and fixed a bug in utf8.RuneLen.
Fixes #3927.
R=golang-dev, rsc, bsiegert
CC=golang-dev
https://golang.org/cl/6458099
diff --git a/src/pkg/unicode/utf8/utf8.go b/src/pkg/unicode/utf8/utf8.go
index cd9c80c..ad23577 100644
--- a/src/pkg/unicode/utf8/utf8.go
+++ b/src/pkg/unicode/utf8/utf8.go
@@ -18,6 +18,12 @@
UTFMax = 4 // maximum number of bytes of a UTF-8 encoded Unicode character.
)
+// Code points in the surrogate range are not valid for UTF-8.
+const (
+ surrogateMin = 0xD800
+ surrogateMax = 0xDFFF
+)
+
const (
t1 = 0x00 // 0000 0000
tx = 0x80 // 1000 0000
@@ -34,7 +40,6 @@
rune1Max = 1<<7 - 1
rune2Max = 1<<11 - 1
rune3Max = 1<<16 - 1
- rune4Max = 1<<21 - 1
)
func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
@@ -87,6 +92,9 @@
if r <= rune2Max {
return RuneError, 1, false
}
+ if surrogateMin <= r && r <= surrogateMax {
+ return RuneError, 1, false
+ }
return r, 3, false
}
@@ -162,6 +170,9 @@
if r <= rune2Max {
return RuneError, 1, false
}
+ if surrogateMin <= r && r <= surrogateMax {
+ return RuneError, 1, false
+ }
return r, 3, false
}
@@ -295,15 +306,20 @@
}
// RuneLen returns the number of bytes required to encode the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-8.
func RuneLen(r rune) int {
switch {
+ case r < 0:
+ return -1
case r <= rune1Max:
return 1
case r <= rune2Max:
return 2
+ case surrogateMin <= r && r <= surrogateMax:
+ return -1
case r <= rune3Max:
return 3
- case r <= rune4Max:
+ case r <= MaxRune:
return 4
}
return -1
@@ -328,6 +344,10 @@
r = RuneError
}
+ if surrogateMin <= r && r <= surrogateMax {
+ r = RuneError
+ }
+
if uint32(r) <= rune3Max {
p[0] = t3 | byte(r>>12)
p[1] = tx | byte(r>>6)&maskx
@@ -407,3 +427,17 @@
}
return true
}
+
+// ValidRune reports whether r can be legally encoded as UTF-8.
+// Code points that are out of range or a surrogate half are illegal.
+func ValidRune(r rune) bool {
+ switch {
+ case r < 0:
+ return false
+ case surrogateMin <= r && r <= surrogateMax:
+ return false
+ case r > MaxRune:
+ return false
+ }
+ return true
+}