blob: d044a84f3a818425a173adc85cc4815db7881330 [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// FS-safe encoding of module paths and versions.
// Copied from cmd/go/internal/module and unexported.
package sumweb
import (
"fmt"
"unicode/utf8"
)
// Safe encodings
//
// Module paths appear as substrings of file system paths
// (in the download cache) and of web server URLs in the proxy protocol.
// In general we cannot rely on file systems to be case-sensitive,
// nor can we rely on web servers, since they read from file systems.
// That is, we cannot rely on the file system to keep rsc.io/QUOTE
// and rsc.io/quote separate. Windows and macOS don't.
// Instead, we must never require two different casings of a file path.
// Because we want the download cache to match the proxy protocol,
// and because we want the proxy protocol to be possible to serve
// from a tree of static files (which might be stored on a case-insensitive
// file system), the proxy protocol must never require two different casings
// of a URL path either.
//
// One possibility would be to make the safe encoding be the lowercase
// hexadecimal encoding of the actual path bytes. This would avoid ever
// needing different casings of a file path, but it would be fairly illegible
// to most programmers when those paths appeared in the file system
// (including in file paths in compiler errors and stack traces)
// in web server logs, and so on. Instead, we want a safe encoding that
// leaves most paths unaltered.
//
// The safe encoding is this:
// replace every uppercase letter with an exclamation mark
// followed by the letter's lowercase equivalent.
//
// For example,
// github.com/Azure/azure-sdk-for-go -> github.com/!azure/azure-sdk-for-go.
// github.com/GoogleCloudPlatform/cloudsql-proxy -> github.com/!google!cloud!platform/cloudsql-proxy
// github.com/Sirupsen/logrus -> github.com/!sirupsen/logrus.
//
// Import paths that avoid upper-case letters are left unchanged.
// Note that because import paths are ASCII-only and avoid various
// problematic punctuation (like : < and >), the safe encoding is also ASCII-only
// and avoids the same problematic punctuation.
//
// Import paths have never allowed exclamation marks, so there is no
// need to define how to encode a literal !.
//
// Although paths are disallowed from using Unicode (see pathOK above),
// the eventual plan is to allow Unicode letters as well, to assume that
// file systems and URLs are Unicode-safe (storing UTF-8), and apply
// the !-for-uppercase convention. Note however that not all runes that
// are different but case-fold equivalent are an upper/lower pair.
// For example, U+004B ('K'), U+006B ('k'), and U+212A ('K' for Kelvin)
// are considered to case-fold to each other. When we do add Unicode
// letters, we must not assume that upper/lower are the only case-equivalent pairs.
// Perhaps the Kelvin symbol would be disallowed entirely, for example.
// Or perhaps it would encode as "!!k", or perhaps as "(212A)".
//
// Also, it would be nice to allow Unicode marks as well as letters,
// but marks include combining marks, and then we must deal not
// only with case folding but also normalization: both U+00E9 ('é')
// and U+0065 U+0301 ('e' followed by combining acute accent)
// look the same on the page and are treated by some file systems
// as the same path. If we do allow Unicode marks in paths, there
// must be some kind of normalization to allow only one canonical
// encoding of any character used in an import path.
// encodePath returns the safe encoding of the given module path.
// It fails if the module path is invalid.
func encodePath(path string) (encoding string, err error) {
return encodeString(path)
}
// encodeVersion returns the safe encoding of the given module version.
// Versions are allowed to be in non-semver form but must be valid file names
// and not contain exclamation marks.
func encodeVersion(v string) (encoding string, err error) {
return encodeString(v)
}
func encodeString(s string) (encoding string, err error) {
haveUpper := false
for _, r := range s {
if r == '!' || r >= utf8.RuneSelf {
// This should be disallowed by CheckPath, but diagnose anyway.
// The correctness of the encoding loop below depends on it.
return "", fmt.Errorf("internal error: inconsistency in EncodePath")
}
if 'A' <= r && r <= 'Z' {
haveUpper = true
}
}
if !haveUpper {
return s, nil
}
var buf []byte
for _, r := range s {
if 'A' <= r && r <= 'Z' {
buf = append(buf, '!', byte(r+'a'-'A'))
} else {
buf = append(buf, byte(r))
}
}
return string(buf), nil
}
// decodePath returns the module path of the given safe encoding.
// It fails if the encoding is invalid or encodes an invalid path.
func decodePath(encoding string) (path string, err error) {
path, ok := decodeString(encoding)
if !ok {
return "", fmt.Errorf("invalid module path encoding %q", encoding)
}
return path, nil
}
// decodeVersion returns the version string for the given safe encoding.
// It fails if the encoding is invalid or encodes an invalid version.
// Versions are allowed to be in non-semver form but must be valid file names
// and not contain exclamation marks.
func decodeVersion(encoding string) (v string, err error) {
v, ok := decodeString(encoding)
if !ok {
return "", fmt.Errorf("invalid version encoding %q", encoding)
}
return v, nil
}
func decodeString(encoding string) (string, bool) {
var buf []byte
bang := false
for _, r := range encoding {
if r >= utf8.RuneSelf {
return "", false
}
if bang {
bang = false
if r < 'a' || 'z' < r {
return "", false
}
buf = append(buf, byte(r+'A'-'a'))
continue
}
if r == '!' {
bang = true
continue
}
if 'A' <= r && r <= 'Z' {
return "", false
}
buf = append(buf, byte(r))
}
if bang {
return "", false
}
return string(buf), true
}