net/url: make Hostname and Port predictable for invalid Host values

When Host is not valid per RFC 3986, the behavior of Hostname and Port
was wildly unpredictable, to the point that Host could have a suffix
that didn't appear in neither Hostname nor Port.

This is a security issue when applications are applying checks to Host
and expecting them to be meaningful for the contents of Hostname.

To reduce disruption, this change only aims to guarantee the following
two security-relevant invariants.

* Host is either Hostname or [Hostname] with Port empty, or
  Hostname:Port or [Hostname]:Port.

* Port is only decimals.

The second invariant is the one that's most likely to cause disruption,
but I believe it's important, as it's conceivable an application might
do a suffix check on Host and expect it to be meaningful for the
contents of Hostname (if the suffix is not a valid port).

There are three ways to ensure it.

1) Reject invalid ports in Parse. Note that non-numeric ports are
   already rejected if and only if the host starts with "[".

2) Consider non-numeric ports as part of Hostname, not Port.

3) Allow non-numeric ports, and hope they only flow down to net/http,
   which will reject them (#14353).

This change adopts both 1 and 2. We could do only the latter, but then
these invalid hosts would flow past port checks, like in
http_test.TestTransportRejectsAlphaPort. Non-numeric ports weren't fully
supported anyway, because they were rejected after IPv6 literals, so
this restores consistency. We could do only the former, but at this
point 2) is free and might help with manually constructed Host values
(or if we get something wrong in Parse).

Note that net.SplitHostPort and net.Dial explicitly accept service names
in place of port numbers, but this is an URL package, and RFC 3986,
Section 3.2.3, clearly specifies ports as a number in decimal.

net/http uses a mix of net.SplitHostPort and url.Parse that would
deserve looking into, but in general it seems that it will still accept
service names in Addr fields as they are passed to net.Listen, while
rejecting them in URLs, which feels correct.

This leaves a number of invalid URLs to reject, which however are not
security relevant once the two invariants above hold, so can be done in
Go 1.14: IPv6 literals without brackets (#31024), invalid IPv6 literals,
hostnames with invalid characters, and more.

Tested with 200M executions of go-fuzz and the following Fuzz function.

	u, err := url.Parse(string(data))
	if err != nil {
		return 0
	}
	h := u.Hostname()
	p := u.Port()

	switch u.Host {
	case h + ":" + p:
		return 1
	case "[" + h + "]:" + p:
		return 1
	case h:
		fallthrough
	case "[" + h + "]":
		if p != "" {
			panic("unexpected Port()")
		}
		return 1
	}
	panic("Host is not a variant of [Hostname]:Port")

Fixes CVE-2019-14809
Updates #29098

Change-Id: I7ef40823dab28f29511329fa2d5a7fb10c3ec895
Reviewed-on: https://go-review.googlesource.com/c/go/+/189258
Reviewed-by: Ian Lance Taylor <iant@golang.org>
diff --git a/src/net/http/transport.go b/src/net/http/transport.go
index 5c1708c..f9d9f44 100644
--- a/src/net/http/transport.go
+++ b/src/net/http/transport.go
@@ -710,6 +710,8 @@
 }
 
 func (t *Transport) connectMethodForRequest(treq *transportRequest) (cm connectMethod, err error) {
+	// TODO: the validPort check is redundant after CL 189258, as url.URL.Port
+	// only returns valid ports now. golang.org/issue/33600
 	if port := treq.URL.Port(); !validPort(port) {
 		return cm, fmt.Errorf("invalid URL port %q", port)
 	}
diff --git a/src/net/http/transport_test.go b/src/net/http/transport_test.go
index ea01a20..1a6f631 100644
--- a/src/net/http/transport_test.go
+++ b/src/net/http/transport_test.go
@@ -4289,7 +4289,7 @@
 		t.Fatalf("got %#v; want *url.Error", err)
 	}
 	got := ue.Err.Error()
-	want := `invalid URL port "123foo"`
+	want := `invalid port ":123foo" after host`
 	if got != want {
 		t.Errorf("got error %q; want %q", got, want)
 	}
diff --git a/src/net/url/url.go b/src/net/url/url.go
index 982cfe6..12ea35f 100644
--- a/src/net/url/url.go
+++ b/src/net/url/url.go
@@ -648,6 +648,11 @@
 			}
 			return host1 + host2 + host3, nil
 		}
+	} else if i := strings.LastIndex(host, ":"); i != -1 {
+		colonPort := host[i:]
+		if !validOptionalPort(colonPort) {
+			return "", fmt.Errorf("invalid port %q after host", colonPort)
+		}
 	}
 
 	var err error
@@ -1046,44 +1051,39 @@
 	return result
 }
 
-// Hostname returns u.Host, without any port number.
+// Hostname returns u.Host, stripping any valid port number if present.
 //
-// If Host is an IPv6 literal with a port number, Hostname returns the
-// IPv6 literal without the square brackets. IPv6 literals may include
-// a zone identifier.
+// If the result is enclosed in square brackets, as literal IPv6 addresses are,
+// the square brackets are removed from the result.
 func (u *URL) Hostname() string {
-	return stripPort(u.Host)
+	host, _ := splitHostPort(u.Host)
+	return host
 }
 
 // Port returns the port part of u.Host, without the leading colon.
-// If u.Host doesn't contain a port, Port returns an empty string.
+//
+// If u.Host doesn't contain a valid numeric port, Port returns an empty string.
 func (u *URL) Port() string {
-	return portOnly(u.Host)
+	_, port := splitHostPort(u.Host)
+	return port
 }
 
-func stripPort(hostport string) string {
-	colon := strings.IndexByte(hostport, ':')
-	if colon == -1 {
-		return hostport
-	}
-	if i := strings.IndexByte(hostport, ']'); i != -1 {
-		return strings.TrimPrefix(hostport[:i], "[")
-	}
-	return hostport[:colon]
-}
+// splitHostPort separates host and port. If the port is not valid, it returns
+// the entire input as host, and it doesn't check the validity of the host.
+// Unlike net.SplitHostPort, but per RFC 3986, it requires ports to be numeric.
+func splitHostPort(hostport string) (host, port string) {
+	host = hostport
 
-func portOnly(hostport string) string {
-	colon := strings.IndexByte(hostport, ':')
-	if colon == -1 {
-		return ""
+	colon := strings.LastIndexByte(host, ':')
+	if colon != -1 && validOptionalPort(host[colon:]) {
+		host, port = host[:colon], host[colon+1:]
 	}
-	if i := strings.Index(hostport, "]:"); i != -1 {
-		return hostport[i+len("]:"):]
+
+	if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
+		host = host[1 : len(host)-1]
 	}
-	if strings.Contains(hostport, "]") {
-		return ""
-	}
-	return hostport[colon+len(":"):]
+
+	return
 }
 
 // Marshaling interface implementations.
diff --git a/src/net/url/url_test.go b/src/net/url/url_test.go
index e6d6ef8..e83c86c 100644
--- a/src/net/url/url_test.go
+++ b/src/net/url/url_test.go
@@ -422,10 +422,10 @@
 	},
 	// worst case host, still round trips
 	{
-		"scheme://!$&'()*+,;=hello!:port/path",
+		"scheme://!$&'()*+,;=hello!:1/path",
 		&URL{
 			Scheme: "scheme",
-			Host:   "!$&'()*+,;=hello!:port",
+			Host:   "!$&'()*+,;=hello!:1",
 			Path:   "/path",
 		},
 		"",
@@ -1425,11 +1425,13 @@
 		{"http://[::1]", false},
 		{"http://[::1]:80", false},
 		{"http://[::1]:namedport", true}, // rfc3986 3.2.3
+		{"http://x:namedport", true},     // rfc3986 3.2.3
 		{"http://[::1]/", false},
 		{"http://[::1]a", true},
 		{"http://[::1]%23", true},
 		{"http://[::1%25en0]", false},     // valid zone id
 		{"http://[::1]:", false},          // colon, but no port OK
+		{"http://x:", false},              // colon, but no port OK
 		{"http://[::1]:%38%30", true},     // not allowed: % encoding only for non-ASCII
 		{"http://[::1%25%41]", false},     // RFC 6874 allows over-escaping in zone
 		{"http://[%10::1]", true},         // no %xx escapes in IP address
@@ -1621,52 +1623,46 @@
 	}
 }
 
-func TestURLHostname(t *testing.T) {
+func TestURLHostnameAndPort(t *testing.T) {
 	tests := []struct {
-		host string // URL.Host field
-		want string
+		in   string // URL.Host field
+		host string
+		port string
 	}{
-		{"foo.com:80", "foo.com"},
-		{"foo.com", "foo.com"},
-		{"FOO.COM", "FOO.COM"}, // no canonicalization (yet?)
-		{"1.2.3.4", "1.2.3.4"},
-		{"1.2.3.4:80", "1.2.3.4"},
-		{"[1:2:3:4]", "1:2:3:4"},
-		{"[1:2:3:4]:80", "1:2:3:4"},
-		{"[::1]:80", "::1"},
-		{"[::1]", "::1"},
-		{"localhost", "localhost"},
-		{"localhost:443", "localhost"},
-		{"some.super.long.domain.example.org:8080", "some.super.long.domain.example.org"},
-		{"[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:17000", "2001:0db8:85a3:0000:0000:8a2e:0370:7334"},
-		{"[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", "2001:0db8:85a3:0000:0000:8a2e:0370:7334"},
-	}
-	for _, tt := range tests {
-		u := &URL{Host: tt.host}
-		got := u.Hostname()
-		if got != tt.want {
-			t.Errorf("Hostname for Host %q = %q; want %q", tt.host, got, tt.want)
-		}
-	}
-}
+		{"foo.com:80", "foo.com", "80"},
+		{"foo.com", "foo.com", ""},
+		{"foo.com:", "foo.com", ""},
+		{"FOO.COM", "FOO.COM", ""}, // no canonicalization
+		{"1.2.3.4", "1.2.3.4", ""},
+		{"1.2.3.4:80", "1.2.3.4", "80"},
+		{"[1:2:3:4]", "1:2:3:4", ""},
+		{"[1:2:3:4]:80", "1:2:3:4", "80"},
+		{"[::1]:80", "::1", "80"},
+		{"[::1]", "::1", ""},
+		{"[::1]:", "::1", ""},
+		{"localhost", "localhost", ""},
+		{"localhost:443", "localhost", "443"},
+		{"some.super.long.domain.example.org:8080", "some.super.long.domain.example.org", "8080"},
+		{"[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:17000", "2001:0db8:85a3:0000:0000:8a2e:0370:7334", "17000"},
+		{"[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", "2001:0db8:85a3:0000:0000:8a2e:0370:7334", ""},
 
-func TestURLPort(t *testing.T) {
-	tests := []struct {
-		host string // URL.Host field
-		want string
-	}{
-		{"foo.com", ""},
-		{"foo.com:80", "80"},
-		{"1.2.3.4", ""},
-		{"1.2.3.4:80", "80"},
-		{"[1:2:3:4]", ""},
-		{"[1:2:3:4]:80", "80"},
+		// Ensure that even when not valid, Host is one of "Hostname",
+		// "Hostname:Port", "[Hostname]" or "[Hostname]:Port".
+		// See https://golang.org/issue/29098.
+		{"[google.com]:80", "google.com", "80"},
+		{"google.com]:80", "google.com]", "80"},
+		{"google.com:80_invalid_port", "google.com:80_invalid_port", ""},
+		{"[::1]extra]:80", "::1]extra", "80"},
+		{"google.com]extra:extra", "google.com]extra:extra", ""},
 	}
 	for _, tt := range tests {
-		u := &URL{Host: tt.host}
-		got := u.Port()
-		if got != tt.want {
-			t.Errorf("Port for Host %q = %q; want %q", tt.host, got, tt.want)
+		u := &URL{Host: tt.in}
+		host, port := u.Hostname(), u.Port()
+		if host != tt.host {
+			t.Errorf("Hostname for Host %q = %q; want %q", tt.in, host, tt.host)
+		}
+		if port != tt.port {
+			t.Errorf("Port for Host %q = %q; want %q", tt.in, port, tt.port)
 		}
 	}
 }