src/net/url/gen_encoding_table.go - go - Git at Google

 // Copyright 2025 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 //go:build ignore

 package main

 import (
 	"bytes"
 	_ "embed"
 	"fmt"
 	"go/format"
 	"io"
 	"log"
 	"maps"
 	"os"
 	"slices"
 	"strconv"
 	"strings"
 )

 // We embed this source file in the resulting code-generation program in order
 // to extract the definitions of the encoding type and constants from it and
 // include them in the generated file.
 //
 //go:embed gen_encoding_table.go
 var genSource string

 const filename = "encoding_table.go"

 func main() {
 	var out bytes.Buffer
 	fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.")
 	fmt.Fprintln(&out)
 	fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.")
 	fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style")
 	fmt.Fprintln(&out, "// license that can be found in the LICENSE file.")
 	fmt.Fprintln(&out)
 	fmt.Fprintln(&out, "package url")
 	fmt.Fprintln(&out)
 	generateEnc(&out, genSource)
 	generateTable(&out)

 	formatted, err := format.Source(out.Bytes())
 	if err != nil {
 		log.Fatal("format:", err)
 	}

 	err = os.WriteFile(filename, formatted, 0644)
 	if err != nil {
 		log.Fatal("WriteFile:", err)
 	}
 }

 func generateEnc(w io.Writer, src string) {
 	var writeLine bool
 	for line := range strings.Lines(src) {
 		if strings.HasPrefix(line, "// START encoding") {
 			writeLine = true
 			continue
 		}
 		if strings.HasPrefix(line, "// END encoding") {
 			return
 		}
 		if writeLine {
 			fmt.Fprint(w, line)
 		}
 	}
 }

 func generateTable(w io.Writer) {
 	fmt.Fprintln(w, "var table = [256]encoding{")

 	// Sort the encodings (in decreasing order) to guarantee a stable output.
 	sortedEncs := slices.Sorted(maps.Keys(encNames))
 	slices.Reverse(sortedEncs)

 	for i := range 256 {
 		c := byte(i)
 		var lineBuf bytes.Buffer

 		// Write key to line buffer.
 		lineBuf.WriteString(strconv.QuoteRune(rune(c)))

 		lineBuf.WriteByte(':')

 		// Write value to line buffer.
 		blankVal := true
 		if ishex(c) {
 			// Set the hexChar bit if this char is hexadecimal.
 			lineBuf.WriteString("hexChar")
 			blankVal = false
 		}
 		for _, enc := range sortedEncs {
 			if !shouldEscape(c, enc) {
 				if !blankVal {
 					lineBuf.WriteByte('|')
 				}
 				// Set this encoding mode's bit if this char should NOT be
 				// escaped.
 				name := encNames[enc]
 				lineBuf.WriteString(name)
 				blankVal = false
 			}
 		}

 		if !blankVal {
 			lineBuf.WriteString(",\n")
 			w.Write(lineBuf.Bytes())
 		}
 	}
 	fmt.Fprintln(w, "}")
 }

 // START encoding (keep this marker comment in sync with genEnc)
 type encoding uint8

 const (
 	encodePath encoding = 1 << iota
 	encodePathSegment
 	encodeHost
 	encodeZone
 	encodeUserPassword
 	encodeQueryComponent
 	encodeFragment

 	// hexChar is actually NOT an encoding mode, but there are only seven
 	// encoding modes. We might as well abuse the otherwise unused most
 	// significant bit in uint8 to indicate whether a character is
 	// hexadecimal.
 	hexChar
 )

 // END encoding (keep this marker comment in sync with genEnc)

 // Keep this in sync with the definitions of encoding mode constants.
 var encNames = map[encoding]string{
 	encodePath:           "encodePath",
 	encodePathSegment:    "encodePathSegment",
 	encodeHost:           "encodeHost",
 	encodeZone:           "encodeZone",
 	encodeUserPassword:   "encodeUserPassword",
 	encodeQueryComponent: "encodeQueryComponent",
 	encodeFragment:       "encodeFragment",
 }

 // Return true if the specified character should be escaped when
 // appearing in a URL string, according to RFC 3986.
 //
 // Please be informed that for now shouldEscape does not check all
 // reserved characters correctly. See golang.org/issue/5684.
 func shouldEscape(c byte, mode encoding) bool {
 	// §2.3 Unreserved characters (alphanum)
 	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
 		return false
 	}

 	if mode == encodeHost || mode == encodeZone {
 		// §3.2.2 Host allows
 		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
 		// as part of reg-name.
 		// We add : because we include :port as part of host.
 		// We add [ ] because we include [ipv6]:port as part of host.
 		// We add < > because they're the only characters left that
 		// we could possibly allow, and Parse will reject them if we
 		// escape them (because hosts can't use %-encoding for
 		// ASCII bytes).
 		switch c {
 		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
 			return false
 		}
 	}

 	switch c {
 	case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
 		return false

 	case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
 		// Different sections of the URL allow a few of
 		// the reserved characters to appear unescaped.
 		switch mode {
 		case encodePath: // §3.3
 			// The RFC allows : @ & = + $ but saves / ; , for assigning
 			// meaning to individual path segments. This package
 			// only manipulates the path as a whole, so we allow those
 			// last three as well. That leaves only ? to escape.
 			return c == '?'

 		case encodePathSegment: // §3.3
 			// The RFC allows : @ & = + $ but saves / ; , for assigning
 			// meaning to individual path segments.
 			return c == '/' || c == ';' || c == ',' || c == '?'

 		case encodeUserPassword: // §3.2.1
 			// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
 			// userinfo, so we must escape only '@', '/', and '?'.
 			// The parsing of userinfo treats ':' as special so we must escape
 			// that too.
 			return c == '@' || c == '/' || c == '?' || c == ':'

 		case encodeQueryComponent: // §3.4
 			// The RFC reserves (so we must escape) everything.
 			return true

 		case encodeFragment: // §4.1
 			// The RFC text is silent but the grammar allows
 			// everything, so escape nothing.
 			return false
 		}
 	}

 	if mode == encodeFragment {
 		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
 		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
 		// need to be escaped. To minimize potential breakage, we apply two restrictions:
 		// (1) we always escape sub-delims outside of the fragment, and (2) we always
 		// escape single quote to avoid breaking callers that had previously assumed that
 		// single quotes would be escaped. See issue #19917.
 		switch c {
 		case '!', '(', ')', '*':
 			return false
 		}
 	}

 	// Everything else must be escaped.
 	return true
 }

 func ishex(c byte) bool {
 	return '0' <= c && c <= '9' ||
 		'a' <= c && c <= 'f' ||
 		'A' <= c && c <= 'F'
 }
	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	//go:build ignore

	package main

	import (
	"bytes"
	_ "embed"
	"fmt"
	"go/format"
	"io"
	"log"
	"maps"
	"os"
	"slices"
	"strconv"
	"strings"
	)

	// We embed this source file in the resulting code-generation program in order
	// to extract the definitions of the encoding type and constants from it and
	// include them in the generated file.
	//
	//go:embed gen_encoding_table.go
	var genSource string

	const filename = "encoding_table.go"

	func main() {
	var out bytes.Buffer
	fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.")
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.")
	fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style")
	fmt.Fprintln(&out, "// license that can be found in the LICENSE file.")
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, "package url")
	fmt.Fprintln(&out)
	generateEnc(&out, genSource)
	generateTable(&out)

	formatted, err := format.Source(out.Bytes())
	if err != nil {
	log.Fatal("format:", err)
	}

	err = os.WriteFile(filename, formatted, 0644)
	if err != nil {
	log.Fatal("WriteFile:", err)
	}
	}

	func generateEnc(w io.Writer, src string) {
	var writeLine bool
	for line := range strings.Lines(src) {
	if strings.HasPrefix(line, "// START encoding") {
	writeLine = true
	continue
	}
	if strings.HasPrefix(line, "// END encoding") {
	return
	}
	if writeLine {
	fmt.Fprint(w, line)
	}
	}
	}

	func generateTable(w io.Writer) {
	fmt.Fprintln(w, "var table = [256]encoding{")

	// Sort the encodings (in decreasing order) to guarantee a stable output.
	sortedEncs := slices.Sorted(maps.Keys(encNames))
	slices.Reverse(sortedEncs)

	for i := range 256 {
	c := byte(i)
	var lineBuf bytes.Buffer

	// Write key to line buffer.
	lineBuf.WriteString(strconv.QuoteRune(rune(c)))

	lineBuf.WriteByte(':')

	// Write value to line buffer.
	blankVal := true
	if ishex(c) {
	// Set the hexChar bit if this char is hexadecimal.
	lineBuf.WriteString("hexChar")
	blankVal = false
	}
	for _, enc := range sortedEncs {
	if !shouldEscape(c, enc) {
	if !blankVal {
	lineBuf.WriteByte('\|')
	}
	// Set this encoding mode's bit if this char should NOT be
	// escaped.
	name := encNames[enc]
	lineBuf.WriteString(name)
	blankVal = false
	}
	}

	if !blankVal {
	lineBuf.WriteString(",\n")
	w.Write(lineBuf.Bytes())
	}
	}
	fmt.Fprintln(w, "}")
	}

	// START encoding (keep this marker comment in sync with genEnc)
	type encoding uint8

	const (
	encodePath encoding = 1 << iota
	encodePathSegment
	encodeHost
	encodeZone
	encodeUserPassword
	encodeQueryComponent
	encodeFragment

	// hexChar is actually NOT an encoding mode, but there are only seven
	// encoding modes. We might as well abuse the otherwise unused most
	// significant bit in uint8 to indicate whether a character is
	// hexadecimal.
	hexChar
	)

	// END encoding (keep this marker comment in sync with genEnc)

	// Keep this in sync with the definitions of encoding mode constants.
	var encNames = map[encoding]string{
	encodePath: "encodePath",
	encodePathSegment: "encodePathSegment",
	encodeHost: "encodeHost",
	encodeZone: "encodeZone",
	encodeUserPassword: "encodeUserPassword",
	encodeQueryComponent: "encodeQueryComponent",
	encodeFragment: "encodeFragment",
	}

	// Return true if the specified character should be escaped when
	// appearing in a URL string, according to RFC 3986.
	//
	// Please be informed that for now shouldEscape does not check all
	// reserved characters correctly. See golang.org/issue/5684.
	func shouldEscape(c byte, mode encoding) bool {
	// §2.3 Unreserved characters (alphanum)
	if 'a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z' \|\| '0' <= c && c <= '9' {
	return false
	}

	if mode == encodeHost \|\| mode == encodeZone {
	// §3.2.2 Host allows
	// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
	// as part of reg-name.
	// We add : because we include :port as part of host.
	// We add [ ] because we include [ipv6]:port as part of host.
	// We add < > because they're the only characters left that
	// we could possibly allow, and Parse will reject them if we
	// escape them (because hosts can't use %-encoding for
	// ASCII bytes).
	switch c {
	case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
	return false
	}
	}

	switch c {
	case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
	return false

	case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
	// Different sections of the URL allow a few of
	// the reserved characters to appear unescaped.
	switch mode {
	case encodePath: // §3.3
	// The RFC allows : @ & = + $ but saves / ; , for assigning
	// meaning to individual path segments. This package
	// only manipulates the path as a whole, so we allow those
	// last three as well. That leaves only ? to escape.
	return c == '?'

	case encodePathSegment: // §3.3
	// The RFC allows : @ & = + $ but saves / ; , for assigning
	// meaning to individual path segments.
	return c == '/' \|\| c == ';' \|\| c == ',' \|\| c == '?'

	case encodeUserPassword: // §3.2.1
	// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
	// userinfo, so we must escape only '@', '/', and '?'.
	// The parsing of userinfo treats ':' as special so we must escape
	// that too.
	return c == '@' \|\| c == '/' \|\| c == '?' \|\| c == ':'

	case encodeQueryComponent: // §3.4
	// The RFC reserves (so we must escape) everything.
	return true

	case encodeFragment: // §4.1
	// The RFC text is silent but the grammar allows
	// everything, so escape nothing.
	return false
	}
	}

	if mode == encodeFragment {
	// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
	// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
	// need to be escaped. To minimize potential breakage, we apply two restrictions:
	// (1) we always escape sub-delims outside of the fragment, and (2) we always
	// escape single quote to avoid breaking callers that had previously assumed that
	// single quotes would be escaped. See issue #19917.
	switch c {
	case '!', '(', ')', '*':
	return false
	}
	}

	// Everything else must be escaped.
	return true
	}

	func ishex(c byte) bool {
	return '0' <= c && c <= '9' \|\|
	'a' <= c && c <= 'f' \|\|
	'A' <= c && c <= 'F'
	}