cmd/go/internal/module: define fs-safe module path encoding

Module paths, like import paths, are case-sensitive, for better or worse.
But not all file systems distinguish file paths with different cases.
If we are going to use module paths to construct file system paths,
we must apply an encoding that distinguishes case without relying
upon the file system to do it.

This CL defines that encoding, the "safe module path encoding".
Module paths today are ASCII-only with limited punctuation,
so the safe module path encoding is to convert the whole path
to lower case and insert an ! before every formerly upper-case letter:
github.com/Sirupsen/logrus is stored as github.com/!sirupsen/logrus.

Although this CL defines the encoding, it does not change the rest
of the go command to use the encoding. That will be done in
follow-up CLs.

Change-Id: I06e6188dcfcbbc1d88674f7c95e1cb45cb476238
Reviewed-on: https://go-review.googlesource.com/124378
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Bryan C. Mills <bcmills@google.com>
diff --git a/src/cmd/go/internal/module/module.go b/src/cmd/go/internal/module/module.go
index 7b32b24..000699a 100644
--- a/src/cmd/go/internal/module/module.go
+++ b/src/cmd/go/internal/module/module.go
@@ -6,11 +6,18 @@
 // along with support code.
 package module
 
+// IMPORTANT NOTE
+//
+// This file essentially defines the set of valid import paths for the go command.
+// There are many subtle considerations, including Unicode ambiguity,
+// security, network, and file system representations.
+//
+// Changes to the semantics in this file require approval from rsc.
+
 import (
 	"fmt"
 	"sort"
 	"strings"
-	"unicode"
 	"unicode/utf8"
 
 	"cmd/go/internal/semver"
@@ -71,36 +78,33 @@
 
 // firstPathOK reports whether r can appear in the first element of a module path.
 // The first element of the path must be an LDH domain name, at least for now.
+// To avoid case ambiguity, the domain name must be entirely lower case.
 func firstPathOK(r rune) bool {
 	return r == '-' || r == '.' ||
 		'0' <= r && r <= '9' ||
-		'A' <= r && r <= 'Z' ||
 		'a' <= r && r <= 'z'
 }
 
 // pathOK reports whether r can appear in a module path.
-// Paths must avoid potentially problematic ASCII punctuation
-// and control characters but otherwise can be any Unicode printable character,
-// as defined by Go's IsPrint.
+// Paths can be ASCII letters, ASCII digits, and limited ASCII punctuation: + - . / _ and ~.
+// This matches what "go get" has historically recognized in import paths.
+// TODO(rsc): We would like to allow Unicode letters, but that requires additional
+// care in the safe encoding (see note below).
 func pathOK(r rune) bool {
 	if r < utf8.RuneSelf {
-		return r == '+' || r == ',' || r == '-' || r == '.' || r == '/' || r == '_' || r == '~' ||
+		return r == '+' || r == '-' || r == '.' || r == '/' || r == '_' || r == '~' ||
 			'0' <= r && r <= '9' ||
 			'A' <= r && r <= 'Z' ||
 			'a' <= r && r <= 'z'
 	}
-	return unicode.IsPrint(r)
+	return false
 }
 
 // CheckPath checks that a module path is valid.
 func CheckPath(path string) error {
-	if !utf8.ValidString(path) {
-		return fmt.Errorf("malformed module path %q: invalid UTF-8", path)
+	if err := checkImportPath(path); err != nil {
+		return fmt.Errorf("malformed module path %q: %v", path, err)
 	}
-	if path == "" {
-		return fmt.Errorf("malformed module path %q: empty string", path)
-	}
-
 	i := strings.Index(path, "/")
 	if i < 0 {
 		i = len(path)
@@ -111,40 +115,119 @@
 	if !strings.Contains(path[:i], ".") {
 		return fmt.Errorf("malformed module path %q: missing dot in first path element", path)
 	}
-	if path[i-1] == '.' {
-		return fmt.Errorf("malformed module path %q: trailing dot in first path element", path)
-	}
-	if path[0] == '.' {
-		return fmt.Errorf("malformed module path %q: leading dot in first path element", path)
-	}
 	if path[0] == '-' {
 		return fmt.Errorf("malformed module path %q: leading dash in first path element", path)
 	}
-	if strings.Contains(path, "..") {
-		return fmt.Errorf("malformed module path %q: double dot", path)
-	}
-	if strings.Contains(path, "//") {
-		return fmt.Errorf("malformed module path %q: double slash", path)
-	}
 	for _, r := range path[:i] {
 		if !firstPathOK(r) {
 			return fmt.Errorf("malformed module path %q: invalid char %q in first path element", path, r)
 		}
 	}
-	if path[len(path)-1] == '/' {
-		return fmt.Errorf("malformed module path %q: trailing slash", path)
-	}
-	for _, r := range path {
-		if !pathOK(r) {
-			return fmt.Errorf("malformed module path %q: invalid char %q", path, r)
-		}
-	}
 	if _, _, ok := SplitPathVersion(path); !ok {
 		return fmt.Errorf("malformed module path %q: invalid version %s", path, path[strings.LastIndex(path, "/")+1:])
 	}
 	return nil
 }
 
+// CheckImportPath checks that an import path is valid.
+func CheckImportPath(path string) error {
+	if err := checkImportPath(path); err != nil {
+		return fmt.Errorf("malformed import path %q: %v", path, err)
+	}
+	return nil
+}
+
+// checkImportPath checks that an import path is valid.
+// It returns an error describing why but not mentioning path.
+// Because these checks apply to both module paths and import paths,
+// the caller is expected to add the "malformed ___ path %q: " prefix.
+func checkImportPath(path string) error {
+	if !utf8.ValidString(path) {
+		return fmt.Errorf("invalid UTF-8")
+	}
+	if path == "" {
+		return fmt.Errorf("empty string")
+	}
+	if strings.Contains(path, "..") {
+		return fmt.Errorf("double dot")
+	}
+	if strings.Contains(path, "//") {
+		return fmt.Errorf("double slash")
+	}
+	if path[len(path)-1] == '/' {
+		return fmt.Errorf("trailing slash")
+	}
+	elemStart := 0
+	for i, r := range path {
+		if !pathOK(r) {
+			return fmt.Errorf("invalid char %q", r)
+		}
+		if r == '/' {
+			if err := checkElem(path[elemStart:i]); err != nil {
+				return err
+			}
+			elemStart = i + 1
+		}
+	}
+	if err := checkElem(path[elemStart:]); err != nil {
+		return err
+	}
+	return nil
+}
+
+// checkElem checks whether an individual path element is valid.
+func checkElem(elem string) error {
+	if elem == "" {
+		return fmt.Errorf("empty path element")
+	}
+	if elem[0] == '.' {
+		return fmt.Errorf("leading dot in path element")
+	}
+	if elem[len(elem)-1] == '.' {
+		return fmt.Errorf("trailing dot in path element")
+	}
+
+	// Windows disallows a bunch of path elements, sadly.
+	// See https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file
+	short := elem
+	if i := strings.Index(short, "."); i >= 0 {
+		short = short[:i]
+	}
+	for _, bad := range badWindowsNames {
+		if strings.EqualFold(bad, short) {
+			return fmt.Errorf("disallowed path element %q", elem)
+		}
+	}
+	return nil
+}
+
+// badWindowsNames are the reserved file path elements on Windows.
+// See https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file
+var badWindowsNames = []string{
+	"CON",
+	"PRN",
+	"AUX",
+	"NUL",
+	"COM1",
+	"COM2",
+	"COM3",
+	"COM4",
+	"COM5",
+	"COM6",
+	"COM7",
+	"COM8",
+	"COM9",
+	"LPT1",
+	"LPT2",
+	"LPT3",
+	"LPT4",
+	"LPT5",
+	"LPT6",
+	"LPT7",
+	"LPT8",
+	"LPT9",
+}
+
 // SplitPathVersion returns prefix and major version such that prefix+pathMajor == path
 // and version is either empty or "/vN" for N >= 2.
 // As a special case, gopkg.in paths are recognized directly;
@@ -228,3 +311,137 @@
 		return fi < fj
 	})
 }
+
+// Safe encodings
+//
+// Module paths appear as substrings of file system paths
+// (in the download cache) and of web server URLs in the proxy protocol.
+// In general we cannot rely on file systems to be case-sensitive,
+// nor can we rely on web servers, since they read from file systems.
+// That is, we cannot rely on the file system to keep rsc.io/QUOTE
+// and rsc.io/quote separate. Windows and macOS don't.
+// Instead, we must never require two different casings of a file path.
+// Because we want the download cache to match the proxy protocol,
+// and because we want the proxy protocol to be possible to serve
+// from a tree of static files (which might be stored on a case-insensitive
+// file system), the proxy protocol must never require two different casings
+// of a URL path either.
+//
+// One possibility would be to make the safe encoding be the lowercase
+// hexadecimal encoding of the actual path bytes. This would avoid ever
+// needing different casings of a file path, but it would be fairly illegible
+// to most programmers when those paths appeared in the file system
+// (including in file paths in compiler errors and stack traces)
+// in web server logs, and so on. Instead, we want a safe encoding that
+// leaves most paths unaltered.
+//
+// The safe encoding is this:
+// replace every uppercase letter with an exclamation mark
+// followed by the letter's lowercase equivalent.
+//
+// For example,
+// github.com/Azure/azure-sdk-for-go ->  github.com/!azure/azure-sdk-for-go.
+// github.com/GoogleCloudPlatform/cloudsql-proxy -> github.com/!google!cloud!platform/cloudsql-proxy
+// github.com/Sirupsen/logrus -> github.com/!sirupsen/logrus.
+//
+// Import paths that avoid upper-case letters are left unchanged.
+// Note that because import paths are ASCII-only and avoid various
+// problematic punctuation (like : < and >), the safe encoding is also ASCII-only
+// and avoids the same problematic punctuation.
+//
+// Import paths have never allowed exclamation marks, so there is no
+// need to define how to encode a literal !.
+//
+// Although paths are disallowed from using Unicode (see pathOK above),
+// the eventual plan is to allow Unicode letters as well, to assume that
+// file systems and URLs are Unicode-safe (storing UTF-8), and apply
+// the !-for-uppercase convention. Note however that not all runes that
+// are different but case-fold equivalent are an upper/lower pair.
+// For example, U+004B ('K'), U+006B ('k'), and U+212A ('K' for Kelvin)
+// are considered to case-fold to each other. When we do add Unicode
+// letters, we must not assume that upper/lower are the only case-equivalent pairs.
+// Perhaps the Kelvin symbol would be disallowed entirely, for example.
+// Or perhaps it would encode as "!!k", or perhaps as "(212A)".
+//
+// Also, it would be nice to allow Unicode marks as well as letters,
+// but marks include combining marks, and then we must deal not
+// only with case folding but also normalization: both U+00E9 ('é')
+// and U+0065 U+0301 ('e' followed by combining acute accent)
+// look the same on the page and are treated by some file systems
+// as the same path. If we do allow Unicode marks in paths, there
+// must be some kind of normalization to allow only one canonical
+// encoding of any character used in an import path.
+
+// EncodePath returns the safe encoding of the given module path.
+// It fails if the module path is invalid.
+func EncodePath(path string) (encoding string, err error) {
+	if err := CheckPath(path); err != nil {
+		return "", err
+	}
+
+	haveUpper := false
+	for _, r := range path {
+		if r == '!' || r >= utf8.RuneSelf {
+			// This should be disallowed by CheckPath, but diagnose anyway.
+			// The correctness of the encoding loop below depends on it.
+			return "", fmt.Errorf("internal error: inconsistency in EncodePath")
+		}
+		if 'A' <= r && r <= 'Z' {
+			haveUpper = true
+		}
+	}
+
+	if !haveUpper {
+		return path, nil
+	}
+
+	var buf []byte
+	for _, r := range path {
+		if 'A' <= r && r <= 'Z' {
+			buf = append(buf, '!', byte(r+'a'-'A'))
+		} else {
+			buf = append(buf, byte(r))
+		}
+	}
+	return string(buf), nil
+}
+
+// DecodePath returns the module path of the given safe encoding.
+// It fails if the encoding is invalid.
+func DecodePath(encoding string) (path string, err error) {
+	var buf []byte
+
+	bang := false
+	for _, r := range encoding {
+		if r >= utf8.RuneSelf {
+			goto BadEncoding
+		}
+		if bang {
+			bang = false
+			if r < 'a' || 'z' < r {
+				goto BadEncoding
+			}
+			buf = append(buf, byte(r+'A'-'a'))
+			continue
+		}
+		if r == '!' {
+			bang = true
+			continue
+		}
+		if 'A' <= r && r <= 'Z' {
+			goto BadEncoding
+		}
+		buf = append(buf, byte(r))
+	}
+	if bang {
+		goto BadEncoding
+	}
+	path = string(buf)
+	if err := CheckPath(path); err != nil {
+		return "", fmt.Errorf("invalid module path encoding %q: %v", encoding, err)
+	}
+	return path, nil
+
+BadEncoding:
+	return "", fmt.Errorf("invalid module path encoding %q", encoding)
+}
diff --git a/src/cmd/go/internal/module/module_test.go b/src/cmd/go/internal/module/module_test.go
index 6142a9e..972835f 100644
--- a/src/cmd/go/internal/module/module_test.go
+++ b/src/cmd/go/internal/module/module_test.go
@@ -55,92 +55,111 @@
 }
 
 var checkPathTests = []struct {
-	path string
-	ok   bool
+	path     string
+	ok       bool
+	importOK bool
 }{
-	{"x.y/z", true},
-	{"x.y", true},
+	{"x.y/z", true, true},
+	{"x.y", true, true},
 
-	{"", false},
-	{"x.y/\xFFz", false},
-	{"/x.y/z", false},
-	{"x./z", false},
-	{".x/z", false},
-	{"-x/z", false},
-	{"x..y/z", false},
-	{"x.y/z/../../w", false},
-	{"x.y//z", false},
-	{"x.y/z//w", false},
-	{"x.y/z/", false},
+	{"", false, false},
+	{"x.y/\xFFz", false, false},
+	{"/x.y/z", false, false},
+	{"x./z", false, false},
+	{".x/z", false, false},
+	{"-x/z", false, true},
+	{"x..y/z", false, false},
+	{"x.y/z/../../w", false, false},
+	{"x.y//z", false, false},
+	{"x.y/z//w", false, false},
+	{"x.y/z/", false, false},
 
-	{"x.y/z/v0", false},
-	{"x.y/z/v1", false},
-	{"x.y/z/v2", true},
-	{"x.y/z/v2.0", false},
+	{"x.y/z/v0", false, true},
+	{"x.y/z/v1", false, true},
+	{"x.y/z/v2", true, true},
+	{"x.y/z/v2.0", false, true},
+	{"X.y/z", false, true},
 
-	{"!x.y/z", false},
-	{"_x.y/z", false},
-	{"x.y!/z", false},
-	{"x.y\"/z", false},
-	{"x.y#/z", false},
-	{"x.y$/z", false},
-	{"x.y%/z", false},
-	{"x.y&/z", false},
-	{"x.y'/z", false},
-	{"x.y(/z", false},
-	{"x.y)/z", false},
-	{"x.y*/z", false},
-	{"x.y+/z", false},
-	{"x.y,/z", false},
-	{"x.y-/z", true},
-	{"x.y./zt", false},
-	{"x.y:/z", false},
-	{"x.y;/z", false},
-	{"x.y</z", false},
-	{"x.y=/z", false},
-	{"x.y>/z", false},
-	{"x.y?/z", false},
-	{"x.y@/z", false},
-	{"x.y[/z", false},
-	{"x.y\\/z", false},
-	{"x.y]/z", false},
-	{"x.y^/z", false},
-	{"x.y_/z", false},
-	{"x.y`/z", false},
-	{"x.y{/z", false},
-	{"x.y}/z", false},
-	{"x.y~/z", false},
-	{"x.y/z!", false},
-	{"x.y/z\"", false},
-	{"x.y/z#", false},
-	{"x.y/z$", false},
-	{"x.y/z%", false},
-	{"x.y/z&", false},
-	{"x.y/z'", false},
-	{"x.y/z(", false},
-	{"x.y/z)", false},
-	{"x.y/z*", false},
-	{"x.y/z+", true},
-	{"x.y/z,", true},
-	{"x.y/z-", true},
-	{"x.y/z.t", true},
-	{"x.y/z/t", true},
-	{"x.y/z:", false},
-	{"x.y/z;", false},
-	{"x.y/z<", false},
-	{"x.y/z=", false},
-	{"x.y/z>", false},
-	{"x.y/z?", false},
-	{"x.y/z@", false},
-	{"x.y/z[", false},
-	{"x.y/z\\", false},
-	{"x.y/z]", false},
-	{"x.y/z^", false},
-	{"x.y/z_", true},
-	{"x.y/z`", false},
-	{"x.y/z{", false},
-	{"x.y/z}", false},
-	{"x.y/z~", true},
+	{"!x.y/z", false, false},
+	{"_x.y/z", false, true},
+	{"x.y!/z", false, false},
+	{"x.y\"/z", false, false},
+	{"x.y#/z", false, false},
+	{"x.y$/z", false, false},
+	{"x.y%/z", false, false},
+	{"x.y&/z", false, false},
+	{"x.y'/z", false, false},
+	{"x.y(/z", false, false},
+	{"x.y)/z", false, false},
+	{"x.y*/z", false, false},
+	{"x.y+/z", false, true},
+	{"x.y,/z", false, false},
+	{"x.y-/z", true, true},
+	{"x.y./zt", false, false},
+	{"x.y:/z", false, false},
+	{"x.y;/z", false, false},
+	{"x.y</z", false, false},
+	{"x.y=/z", false, false},
+	{"x.y>/z", false, false},
+	{"x.y?/z", false, false},
+	{"x.y@/z", false, false},
+	{"x.y[/z", false, false},
+	{"x.y\\/z", false, false},
+	{"x.y]/z", false, false},
+	{"x.y^/z", false, false},
+	{"x.y_/z", false, true},
+	{"x.y`/z", false, false},
+	{"x.y{/z", false, false},
+	{"x.y}/z", false, false},
+	{"x.y~/z", false, true},
+	{"x.y/z!", false, false},
+	{"x.y/z\"", false, false},
+	{"x.y/z#", false, false},
+	{"x.y/z$", false, false},
+	{"x.y/z%", false, false},
+	{"x.y/z&", false, false},
+	{"x.y/z'", false, false},
+	{"x.y/z(", false, false},
+	{"x.y/z)", false, false},
+	{"x.y/z*", false, false},
+	{"x.y/z+", true, true},
+	{"x.y/z,", false, false},
+	{"x.y/z-", true, true},
+	{"x.y/z.t", true, true},
+	{"x.y/z/t", true, true},
+	{"x.y/z:", false, false},
+	{"x.y/z;", false, false},
+	{"x.y/z<", false, false},
+	{"x.y/z=", false, false},
+	{"x.y/z>", false, false},
+	{"x.y/z?", false, false},
+	{"x.y/z@", false, false},
+	{"x.y/z[", false, false},
+	{"x.y/z\\", false, false},
+	{"x.y/z]", false, false},
+	{"x.y/z^", false, false},
+	{"x.y/z_", true, true},
+	{"x.y/z`", false, false},
+	{"x.y/z{", false, false},
+	{"x.y/z}", false, false},
+	{"x.y/z~", true, true},
+	{"x.y/x.foo", true, true},
+	{"x.y/aux.foo", false, false},
+	{"x.y/prn", false, false},
+	{"x.y/prn2", true, true},
+	{"x.y/com", true, true},
+	{"x.y/com1", false, false},
+	{"x.y/com1.txt", false, false},
+	{"x.y/calm1", true, true},
+	{"github.com/!123/logrus", false, false},
+
+	// TODO: CL 41822 allowed Unicode letters in old "go get"
+	// without due consideration of the implications, and only on github.com (!).
+	// For now, we disallow non-ASCII characters in module mode,
+	// in both module paths and general import paths,
+	// until we can get the implications right.
+	// When we do, we'll enable them everywhere, not just for GitHub.
+	{"github.com/user/unicode/испытание", false, false},
 }
 
 func TestCheckPath(t *testing.T) {
@@ -151,6 +170,13 @@
 		} else if !tt.ok && err == nil {
 			t.Errorf("CheckPath(%q) succeeded, wanted error", tt.path)
 		}
+
+		err = CheckImportPath(tt.path)
+		if tt.importOK && err != nil {
+			t.Errorf("CheckImportPath(%q) = %v, wanted nil error", tt.path, err)
+		} else if !tt.importOK && err == nil {
+			t.Errorf("CheckImportPath(%q) succeeded, wanted error", tt.path)
+		}
 	}
 }
 
@@ -182,3 +208,84 @@
 		}
 	}
 }
+
+var encodeTests = []struct {
+	path string
+	enc  string // empty means same as path
+}{
+	{path: "ascii.com/abcdefghijklmnopqrstuvwxyz.-+/~_0123456789"},
+	{path: "github.com/GoogleCloudPlatform/omega", enc: "github.com/!google!cloud!platform/omega"},
+}
+
+func TestEncodePath(t *testing.T) {
+	// Check invalid paths.
+	for _, tt := range checkPathTests {
+		if !tt.ok {
+			_, err := EncodePath(tt.path)
+			if err == nil {
+				t.Errorf("EncodePath(%q): succeeded, want error (invalid path)", tt.path)
+			}
+		}
+	}
+
+	// Check encodings.
+	for _, tt := range encodeTests {
+		enc, err := EncodePath(tt.path)
+		if err != nil {
+			t.Errorf("EncodePath(%q): unexpected error: %v", tt.path, err)
+			continue
+		}
+		want := tt.enc
+		if want == "" {
+			want = tt.path
+		}
+		if enc != want {
+			t.Errorf("EncodePath(%q) = %q, want %q", tt.path, enc, want)
+		}
+	}
+}
+
+var badDecode = []string{
+	"github.com/GoogleCloudPlatform/omega",
+	"github.com/!google!cloud!platform!/omega",
+	"github.com/!0google!cloud!platform/omega",
+	"github.com/!_google!cloud!platform/omega",
+	"github.com/!!google!cloud!platform/omega",
+	"",
+}
+
+func TestDecodePath(t *testing.T) {
+	// Check invalid decodings.
+	for _, bad := range badDecode {
+		_, err := DecodePath(bad)
+		if err == nil {
+			t.Errorf("DecodePath(%q): succeeded, want error (invalid decoding)", bad)
+		}
+	}
+
+	// Check invalid paths (or maybe decodings).
+	for _, tt := range checkPathTests {
+		if !tt.ok {
+			path, err := DecodePath(tt.path)
+			if err == nil {
+				t.Errorf("DecodePath(%q) = %q, want error (invalid path)", tt.path, path)
+			}
+		}
+	}
+
+	// Check encodings.
+	for _, tt := range encodeTests {
+		enc := tt.enc
+		if enc == "" {
+			enc = tt.path
+		}
+		path, err := DecodePath(enc)
+		if err != nil {
+			t.Errorf("DecodePath(%q): unexpected error: %v", enc, err)
+			continue
+		}
+		if path != tt.path {
+			t.Errorf("DecodePath(%q) = %q, want %q", enc, path, tt.path)
+		}
+	}
+}