all: make Unicode surrogate halves illegal as UTF-8

Surrogate halves are part of UTF-16 and should never appear in UTF-8.
(The rune that two combined halves represent in UTF-16 should
be encoded directly.)

Encoding: encode as RuneError.
Decoding: convert to RuneError, consume one byte.

This requires changing:
        package unicode/utf8
        runtime for range over string
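Also added utf8.ValidRune and fixed a bug in utf8.RuneLen, which
reported a length for negative runes and for values above MaxRune
instead of returning -1.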

Fixes #3927.

R=golang-dev, rsc, bsiegert
CC=golang-dev
https://golang.org/cl/6458099
diff --git a/src/pkg/unicode/utf8/utf8.go b/src/pkg/unicode/utf8/utf8.go
index cd9c80c..ad23577 100644
--- a/src/pkg/unicode/utf8/utf8.go
+++ b/src/pkg/unicode/utf8/utf8.go
@@ -18,6 +18,12 @@
 	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
 )
 
+// Code points in the surrogate range are not valid for UTF-8.
+const (
+	surrogateMin = 0xD800 // lowest code point in the surrogate range
+	surrogateMax = 0xDFFF // highest code point in the surrogate range
+)
+
 const (
 	t1 = 0x00 // 0000 0000
 	tx = 0x80 // 1000 0000
@@ -34,7 +40,6 @@
 	rune1Max = 1<<7 - 1
 	rune2Max = 1<<11 - 1
 	rune3Max = 1<<16 - 1
-	rune4Max = 1<<21 - 1
 )
 
 func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
@@ -87,6 +92,9 @@
 		if r <= rune2Max {
 			return RuneError, 1, false
 		}
+		if surrogateMin <= r && r <= surrogateMax {
+			return RuneError, 1, false
+		}
 		return r, 3, false
 	}
 
@@ -162,6 +170,9 @@
 		if r <= rune2Max {
 			return RuneError, 1, false
 		}
+		if surrogateMin <= r && r <= surrogateMax {
+			return RuneError, 1, false
+		}
 		return r, 3, false
 	}
 
@@ -295,15 +306,20 @@
 }
 
 // RuneLen returns the number of bytes required to encode the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-8.
 func RuneLen(r rune) int {
 	switch {
+	case r < 0:
+		return -1
 	case r <= rune1Max:
 		return 1
 	case r <= rune2Max:
 		return 2
+	case surrogateMin <= r && r <= surrogateMax:
+		return -1
 	case r <= rune3Max:
 		return 3
-	case r <= rune4Max:
+	case r <= MaxRune:
 		return 4
 	}
 	return -1
@@ -328,6 +344,10 @@
 		r = RuneError
 	}
 
+	if surrogateMin <= r && r <= surrogateMax {
+		r = RuneError
+	}
+
 	if uint32(r) <= rune3Max {
 		p[0] = t3 | byte(r>>12)
 		p[1] = tx | byte(r>>6)&maskx
@@ -407,3 +427,17 @@
 	}
 	return true
 }
+
+// ValidRune reports whether r can be legally encoded as UTF-8.
+// Code points that are out of range or a surrogate half are illegal.
+func ValidRune(r rune) bool {
+	switch {
+	case r < 0: // out of range: negative
+		return false
+	case surrogateMin <= r && r <= surrogateMax: // surrogate half
+		return false
+	case r > MaxRune: // out of range: above MaxRune
+		return false
+	}
+	return true
+}