net/url: accept non-ASCII bytes in URL per RFC 3986
Fixes #7991.
Fixes #12719.
Change-Id: I5650fa35ec5d49addeda6cc6e7fa93cfbe1cdfc0
Reviewed-on: https://go-review.googlesource.com/17385
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/src/net/url/url.go b/src/net/url/url.go
index e7c08b3..510ac77 100644
--- a/src/net/url/url.go
+++ b/src/net/url/url.go
@@ -71,6 +71,7 @@
const (
encodePath encoding = 1 + iota
encodeHost
+ encodeZone
encodeUserPassword
encodeQueryComponent
encodeFragment
@@ -93,7 +94,7 @@
return false
}
- if mode == encodeHost {
+ if mode == encodeHost || mode == encodeZone {
// §3.2.2 Host allows
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
// as part of reg-name.
@@ -166,6 +167,27 @@
}
return "", EscapeError(s)
}
+ // Per https://tools.ietf.org/html/rfc3986#page-21
+ // in the host component %-encoding can only be used
+ // for non-ASCII bytes.
+ // But https://tools.ietf.org/html/rfc6874#section-2
+ // additionally allows %25 as the escape for a percent sign
+ // in IPv6 scoped-address literals. Yay.
+ if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
+ return "", EscapeError(s[i : i+3])
+ }
+ if mode == encodeZone {
+ // RFC 6874 says basically "anything goes" for zone identifiers
+ // and that even non-ASCII can be redundantly escaped,
+ // but it seems prudent to restrict %-escaped bytes here to those
+ // that are valid host name bytes in their unescaped form.
+ // That is, you can use escaping in the zone identifier but not
+ // to introduce bytes you couldn't just write directly.
+ v := unhex(s[i+1])<<4 | unhex(s[i+2])
+ if s[i:i+3] != "%25" && shouldEscape(v, encodeHost) {
+ return "", EscapeError(s[i : i+3])
+ }
+ }
i += 3
case '+':
hasPlus = mode == encodeQueryComponent
@@ -496,14 +518,9 @@
// parseHost parses host as an authority without user
// information. That is, as host[:port].
func parseHost(host string) (string, error) {
- litOrName := host
if strings.HasPrefix(host, "[") {
// Parse an IP-Literal in RFC 3986 and RFC 6874.
- // E.g., "[fe80::1], "[fe80::1%25en0]"
- //
- // RFC 4007 defines "%" as a delimiter character in
- // the textual representation of IPv6 addresses.
- // Per RFC 6874, in URIs that "%" is encoded as "%25".
+ // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
i := strings.LastIndex(host, "]")
if i < 0 {
return "", errors.New("missing ']' in host")
@@ -512,29 +529,31 @@
if !validOptionalPort(colonPort) {
return "", fmt.Errorf("invalid port %q after host", colonPort)
}
- // Parse a host subcomponent without a ZoneID in RFC
- // 6874 because the ZoneID is allowed to use the
- // percent encoded form.
- j := strings.Index(host[:i], "%25")
- if j < 0 {
- litOrName = host[1:i]
- } else {
- litOrName = host[1:j]
+
+ // RFC 6874 defines that %25 (%-encoded percent) introduces
+ // the zone identifier, and the zone identifier can use basically
+ // any %-encoding it likes. That's different from the host, which
+ // can only %-encode non-ASCII bytes.
+ // We do impose some restrictions on the zone, to avoid stupidity
+ // like newlines.
+ zone := strings.Index(host[:i], "%25")
+ if zone >= 0 {
+ host1, err := unescape(host[:zone], encodeHost)
+ if err != nil {
+ return "", err
+ }
+ host2, err := unescape(host[zone:i], encodeZone)
+ if err != nil {
+ return "", err
+ }
+ host3, err := unescape(host[i:], encodeHost)
+ if err != nil {
+ return "", err
+ }
+ return host1 + host2 + host3, nil
}
}
- // A URI containing an IP-Literal without a ZoneID or
- // IPv4address in RFC 3986 and RFC 6847 must not be
- // percent-encoded.
- //
- // A URI containing a DNS registered name in RFC 3986 is
- // allowed to be percent-encoded, though we don't use it for
- // now to avoid messing up with the gap between allowed
- // characters in URI and allowed characters in DNS.
- // See golang.org/issue/7991.
- if strings.Contains(litOrName, "%") {
- return "", errors.New("percent-encoded characters in host")
- }
var err error
if host, err = unescape(host, encodeHost); err != nil {
return "", err
diff --git a/src/net/url/url_test.go b/src/net/url/url_test.go
index 037e854..da022f2 100644
--- a/src/net/url/url_test.go
+++ b/src/net/url/url_test.go
@@ -483,6 +483,34 @@
},
"",
},
+ // golang.org/issue/7991 and golang.org/issue/12719 (%-encoded non-ASCII bytes in host)
+ {
+ "http://hello.世界.com/foo",
+ &URL{
+ Scheme: "http",
+ Host: "hello.世界.com",
+ Path: "/foo",
+ },
+ "http://hello.%E4%B8%96%E7%95%8C.com/foo",
+ },
+ {
+ "http://hello.%e4%b8%96%e7%95%8c.com/foo",
+ &URL{
+ Scheme: "http",
+ Host: "hello.世界.com",
+ Path: "/foo",
+ },
+ "http://hello.%E4%B8%96%E7%95%8C.com/foo",
+ },
+ {
+ "http://hello.%E4%B8%96%E7%95%8C.com/foo",
+ &URL{
+ Scheme: "http",
+ Host: "hello.世界.com",
+ Path: "/foo",
+ },
+ "",
+ },
}
// more useful string for debugging than fmt's struct printer
@@ -1184,11 +1212,11 @@
{"http://[::1]%23", true},
{"http://[::1%25en0]", false}, // valid zone id
{"http://[::1]:", false}, // colon, but no port OK
- {"http://[::1]:%38%30", true}, // no hex in port
- {"http://[::1%25%10]", false}, // TODO: reject the %10 after the valid zone %25 separator?
+ {"http://[::1]:%38%30", true}, // not allowed: % encoding only for non-ASCII
+ {"http://[::1%25%41]", false}, // RFC 6874 allows over-escaping in zone
{"http://[%10::1]", true}, // no %xx escapes in IP address
{"http://[::1]/%48", false}, // %xx in path is fine
- {"http://%41:8080/", true}, // TODO: arguably we should accept reg-name with %xx
+ {"http://%41:8080/", true}, // not allowed: % encoding only for non-ASCII
{"mysql://x@y(z:123)/foo", false}, // golang.org/issue/12023
{"mysql://x@y(1.2.3.4:123)/foo", false},
{"mysql://x@y([2001:db8::1]:123)/foo", false},