blob: 5defe5046bb29225a0a394a2da93dd90e507001f [file] [log] [blame]
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ignore
package main
import (
"bytes"
_ "embed"
"fmt"
"go/format"
"io"
"log"
"maps"
"os"
"slices"
"strconv"
"strings"
)
// We embed this source file in the resulting code-generation program in order
// to extract the definitions of the encoding type and constants from it and
// include them in the generated file.
//
//go:embed gen_encoding_table.go
var genSource string
// filename is the name of the generated Go source file that main writes.
const filename = "encoding_table.go"
// main generates the file named by filename. It assembles the generated
// source in memory (header, package clause, the encoding definitions
// extracted from this program's own embedded source, and the lookup table),
// formats it with go/format, and writes the result to disk. Any failure is
// fatal.
func main() {
	var out bytes.Buffer

	// File header: generated-code notice, license, and package clause.
	fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.")
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.")
	fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style")
	fmt.Fprintln(&out, "// license that can be found in the LICENSE file.")
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, "package url")
	fmt.Fprintln(&out)

	// Body: the encoding type and constants, followed by the table itself.
	generateEnc(&out, genSource)
	generateTable(&out)

	formatted, err := format.Source(out.Bytes())
	if err != nil {
		// log.Fatal formats its operands like fmt.Print, which inserts no
		// space next to a string operand; Fatalf keeps the label and the
		// error visibly separated.
		log.Fatalf("format: %v", err)
	}
	if err := os.WriteFile(filename, formatted, 0644); err != nil {
		log.Fatalf("WriteFile: %v", err)
	}
}
func generateEnc(w io.Writer, src string) {
var writeLine bool
for line := range strings.Lines(src) {
if strings.HasPrefix(line, "// START encoding") {
writeLine = true
continue
}
if strings.HasPrefix(line, "// END encoding") {
return
}
if writeLine {
fmt.Fprint(w, line)
}
}
}
// generateTable writes to w the Go source of the table variable, a
// [256]encoding composite literal keyed by byte value. The entry for a byte
// is the bitwise OR of hexChar (when ishex reports true) and the name of
// every encoding mode in which shouldEscape reports false for that byte.
// Bytes with no bits set are left out of the literal altogether.
func generateTable(w io.Writer) {
	// Visit the modes from the largest constant to the smallest so that the
	// output does not depend on map iteration order.
	modes := slices.Sorted(maps.Keys(encNames))
	slices.Reverse(modes)

	fmt.Fprintln(w, "var table = [256]encoding{")
	for n := range 256 {
		ch := byte(n)

		var flags bytes.Buffer
		empty := true
		// addFlag appends one flag name, separated from any previous one
		// by '|'.
		addFlag := func(name string) {
			if !empty {
				flags.WriteByte('|')
			}
			flags.WriteString(name)
			empty = false
		}

		if ishex(ch) {
			addFlag("hexChar")
		}
		for _, mode := range modes {
			if shouldEscape(ch, mode) {
				continue
			}
			// A mode's bit is set when the byte may appear unescaped.
			addFlag(encNames[mode])
		}
		if empty {
			continue
		}

		fmt.Fprintf(w, "%s:%s,\n", strconv.QuoteRune(rune(ch)), flags.String())
	}
	fmt.Fprintln(w, "}")
}
// Everything between the two marker comments below is copied line by line
// into the generated file by generateEnc, so any change made in that region
// (comments included) also changes the generated output.
// START encoding (keep this marker comment in sync with generateEnc)
type encoding uint8
const (
encodePath encoding = 1 << iota
encodePathSegment
encodeHost
encodeZone
encodeUserPassword
encodeQueryComponent
encodeFragment
// hexChar is actually NOT an encoding mode, but there are only seven
// encoding modes. We might as well abuse the otherwise unused most
// significant bit in uint8 to indicate whether a character is
// hexadecimal.
hexChar
)
// END encoding (keep this marker comment in sync with generateEnc)
// encNames maps each encoding mode constant to the identifier that
// generateTable writes for it in the generated table. hexChar has no entry
// here because it is not an encoding mode; generateTable emits it separately,
// based on ishex.
//
// Keep this in sync with the definitions of encoding mode constants.
var encNames = map[encoding]string{
encodePath: "encodePath",
encodePathSegment: "encodePathSegment",
encodeHost: "encodeHost",
encodeZone: "encodeZone",
encodeUserPassword: "encodeUserPassword",
encodeQueryComponent: "encodeQueryComponent",
encodeFragment: "encodeFragment",
}
// shouldEscape reports whether the byte c must be percent-escaped when it
// appears in the part of a URL identified by mode, following RFC 3986.
//
// Note that shouldEscape does not yet treat every reserved character
// correctly; see golang.org/issue/5684.
func shouldEscape(c byte, mode encoding) bool {
	// §2.3 Unreserved characters (alphanum) are never escaped.
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	}

	switch mode {
	case encodeHost, encodeZone:
		// §3.2.2 Host allows
		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
		// as part of reg-name.
		// ':' is accepted because :port is treated as part of the host,
		// and '[' ']' because [ipv6]:port is as well.
		// '<' and '>' are accepted because they are the only characters
		// left that could possibly be allowed, and Parse would reject
		// them if they were escaped (hosts can't use %-encoding for
		// ASCII bytes).
		switch c {
		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
			return false
		}
	}

	switch c {
	case '-', '_', '.', '~':
		// §2.3 Unreserved characters (mark)
		return false

	case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@':
		// §2.2 Reserved characters (reserved). Each part of the URL lets
		// a different subset of them through unescaped. Modes without a
		// case here fall through to the checks at the bottom.
		switch mode {
		case encodePath: // §3.3
			// The RFC allows : @ & = + $ and keeps / ; , for giving
			// meaning to individual path segments. The path is only
			// ever handled as a whole here, so those three are let
			// through too, which leaves '?' as the only one to escape.
			return c == '?'

		case encodePathSegment: // §3.3
			// The RFC allows : @ & = + $ and keeps / ; , for giving
			// meaning to individual path segments.
			switch c {
			case '/', ';', ',', '?':
				return true
			}
			return false

		case encodeUserPassword: // §3.2.1
			// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
			// userinfo, so only '@', '/', and '?' need escaping by the
			// RFC. ':' is escaped as well because userinfo parsing
			// treats it as special.
			switch c {
			case '@', '/', '?', ':':
				return true
			}
			return false

		case encodeQueryComponent: // §3.4
			// The RFC reserves all of them, so all are escaped.
			return true

		case encodeFragment: // §4.1
			// The RFC text is silent but the grammar allows all of
			// them, so none is escaped.
			return false
		}
	}

	// RFC 3986 §2.2 allows sub-delims to go unescaped. Some of them are
	// already covered by the reserved set of RFC 2396 §2.2 handled above;
	// the rest do not need escaping. To limit potential breakage, two
	// restrictions apply: sub-delims are always escaped outside of the
	// fragment, and the single quote is always escaped so that callers
	// relying on that keep working. See issue #19917.
	fragmentSubDelim := c == '!' || c == '(' || c == ')' || c == '*'

	// Everything else must be escaped.
	return !(mode == encodeFragment && fragmentSubDelim)
}
// ishex reports whether c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f',
// or 'A'-'F'.
func ishex(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'f':
		return true
	case 'A' <= c && c <= 'F':
		return true
	}
	return false
}