net/url: accept non-ASCII bytes in URL per RFC 3986

Fixes #7991.
Fixes #12719.

Change-Id: I5650fa35ec5d49addeda6cc6e7fa93cfbe1cdfc0
Reviewed-on: https://go-review.googlesource.com/17385
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/src/net/url/url.go b/src/net/url/url.go
index e7c08b3..510ac77 100644
--- a/src/net/url/url.go
+++ b/src/net/url/url.go
@@ -71,6 +71,7 @@
 const (
 	encodePath encoding = 1 + iota
 	encodeHost
+	encodeZone
 	encodeUserPassword
 	encodeQueryComponent
 	encodeFragment
@@ -93,7 +94,7 @@
 		return false
 	}
 
-	if mode == encodeHost {
+	if mode == encodeHost || mode == encodeZone {
 		// §3.2.2 Host allows
 		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
 		// as part of reg-name.
@@ -166,6 +167,27 @@
 				}
 				return "", EscapeError(s)
 			}
+			// Per https://tools.ietf.org/html/rfc3986#page-21
+			// in the host component %-encoding can only be used
+			// for non-ASCII bytes.
+			// But https://tools.ietf.org/html/rfc6874#section-2
+			// introduces %25 being allowed to escape a percent sign
+			// in IPv6 scoped-address literals. Yay.
+			if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
+				return "", EscapeError(s[i : i+3])
+			}
+			if mode == encodeZone {
+				// RFC 6874 says basically "anything goes" for zone identifiers
+				// and that even non-ASCII can be redundantly escaped,
+				// but it seems prudent to restrict %-escaped bytes here to those
+				// that are valid host name bytes in their unescaped form.
+				// That is, you can use escaping in the zone identifier but not
+				// to introduce bytes you couldn't just write directly.
+				v := unhex(s[i+1])<<4 | unhex(s[i+2])
+				if s[i:i+3] != "%25" && shouldEscape(v, encodeHost) {
+					return "", EscapeError(s[i : i+3])
+				}
+			}
 			i += 3
 		case '+':
 			hasPlus = mode == encodeQueryComponent
@@ -496,14 +518,9 @@
 // parseHost parses host as an authority without user
 // information. That is, as host[:port].
 func parseHost(host string) (string, error) {
-	litOrName := host
 	if strings.HasPrefix(host, "[") {
 		// Parse an IP-Literal in RFC 3986 and RFC 6874.
-		// E.g., "[fe80::1], "[fe80::1%25en0]"
-		//
-		// RFC 4007 defines "%" as a delimiter character in
-		// the textual representation of IPv6 addresses.
-		// Per RFC 6874, in URIs that "%" is encoded as "%25".
+		// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
 		i := strings.LastIndex(host, "]")
 		if i < 0 {
 			return "", errors.New("missing ']' in host")
@@ -512,29 +529,31 @@
 		if !validOptionalPort(colonPort) {
 			return "", fmt.Errorf("invalid port %q after host", colonPort)
 		}
-		// Parse a host subcomponent without a ZoneID in RFC
-		// 6874 because the ZoneID is allowed to use the
-		// percent encoded form.
-		j := strings.Index(host[:i], "%25")
-		if j < 0 {
-			litOrName = host[1:i]
-		} else {
-			litOrName = host[1:j]
+
+		// RFC 6874 defines that %25 (%-encoded percent) introduces
+		// the zone identifier, and the zone identifier can use basically
+		// any %-encoding it likes. That's different from the host, which
+		// can only %-encode non-ASCII bytes.
+		// We do impose some restrictions on the zone, to avoid stupidity
+		// like newlines.
+		zone := strings.Index(host[:i], "%25")
+		if zone >= 0 {
+			host1, err := unescape(host[:zone], encodeHost)
+			if err != nil {
+				return "", err
+			}
+			host2, err := unescape(host[zone:i], encodeZone)
+			if err != nil {
+				return "", err
+			}
+			host3, err := unescape(host[i:], encodeHost)
+			if err != nil {
+				return "", err
+			}
+			return host1 + host2 + host3, nil
 		}
 	}
 
-	// A URI containing an IP-Literal without a ZoneID or
-	// IPv4address in RFC 3986 and RFC 6847 must not be
-	// percent-encoded.
-	//
-	// A URI containing a DNS registered name in RFC 3986 is
-	// allowed to be percent-encoded, though we don't use it for
-	// now to avoid messing up with the gap between allowed
-	// characters in URI and allowed characters in DNS.
-	// See golang.org/issue/7991.
-	if strings.Contains(litOrName, "%") {
-		return "", errors.New("percent-encoded characters in host")
-	}
 	var err error
 	if host, err = unescape(host, encodeHost); err != nil {
 		return "", err
diff --git a/src/net/url/url_test.go b/src/net/url/url_test.go
index 037e854..da022f2 100644
--- a/src/net/url/url_test.go
+++ b/src/net/url/url_test.go
@@ -483,6 +483,34 @@
 		},
 		"",
 	},
+	// golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host)
+	{
+		"http://hello.世界.com/foo",
+		&URL{
+			Scheme: "http",
+			Host:   "hello.世界.com",
+			Path:   "/foo",
+		},
+		"http://hello.%E4%B8%96%E7%95%8C.com/foo",
+	},
+	{
+		"http://hello.%e4%b8%96%e7%95%8c.com/foo",
+		&URL{
+			Scheme: "http",
+			Host:   "hello.世界.com",
+			Path:   "/foo",
+		},
+		"http://hello.%E4%B8%96%E7%95%8C.com/foo",
+	},
+	{
+		"http://hello.%E4%B8%96%E7%95%8C.com/foo",
+		&URL{
+			Scheme: "http",
+			Host:   "hello.世界.com",
+			Path:   "/foo",
+		},
+		"",
+	},
 }
 
 // more useful string for debugging than fmt's struct printer
@@ -1184,11 +1212,11 @@
 		{"http://[::1]%23", true},
 		{"http://[::1%25en0]", false},     // valid zone id
 		{"http://[::1]:", false},          // colon, but no port OK
-		{"http://[::1]:%38%30", true},     // no hex in port
-		{"http://[::1%25%10]", false},     // TODO: reject the %10 after the valid zone %25 separator?
+		{"http://[::1]:%38%30", true},     // not allowed: % encoding only for non-ASCII
+		{"http://[::1%25%41]", false},     // RFC 6874 allows over-escaping in zone
 		{"http://[%10::1]", true},         // no %xx escapes in IP address
 		{"http://[::1]/%48", false},       // %xx in path is fine
-		{"http://%41:8080/", true},        // TODO: arguably we should accept reg-name with %xx
+		{"http://%41:8080/", true},        // not allowed: % encoding only for non-ASCII
 		{"mysql://x@y(z:123)/foo", false}, // golang.org/issue/12023
 		{"mysql://x@y(1.2.3.4:123)/foo", false},
 		{"mysql://x@y([2001:db8::1]:123)/foo", false},