src/encoding/json/internal/jsonwire/wire.go - go - Git at Google

 // Copyright 2023 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 //go:build goexperiment.jsonv2

 // Package jsonwire implements stateless functionality for handling JSON text.
 package jsonwire

 import (
 	"cmp"
 	"errors"
 	"strconv"
 	"strings"
 	"unicode"
 	"unicode/utf16"
 	"unicode/utf8"
 )

 // TrimSuffixWhitespace returns b with all trailing JSON whitespace removed.
 // JSON whitespace is a space, horizontal tab, carriage return, or line feed.
 // The result is a prefix of b.
 func TrimSuffixWhitespace(b []byte) []byte {
 	// NOTE: The signature and body are kept trivial so this stays inlinable.
 	end := len(b)
 	for end > 0 {
 		c := b[end-1]
 		if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
 			break
 		}
 		end--
 	}
 	return b[:end]
 }

 // TrimSuffixString removes a valid JSON string from the end of b
 // and returns the prefix that precedes it.
 // If b does not end with a valid JSON string, the result is unspecified.
 func TrimSuffixString(b []byte) []byte {
 	// NOTE: The signature and body are kept trivial so this stays inlinable.
 	n := len(b)
 	if n > 0 && b[n-1] == '"' {
 		n-- // drop the closing quote
 	}
 	// Walk backwards until b[:n] ends with a quote that is not directly
 	// preceded by a backslash.
 	for n >= 2 && (b[n-1] != '"' || b[n-2] == '\\') {
 		n--
 	}
 	if n > 0 && b[n-1] == '"' {
 		n-- // drop the opening quote
 	}
 	return b[:n]
 }

 // HasSuffixByte reports whether the last byte of b is c.
 // It reports false when b is empty.
 func HasSuffixByte(b []byte, c byte) bool {
 	// NOTE: The signature and body are kept trivial so this stays inlinable.
 	if len(b) == 0 {
 		return false
 	}
 	return b[len(b)-1] == c
 }

 // TrimSuffixByte returns b without its last byte if that byte is c.
 // At most one byte is removed; otherwise b is returned unchanged.
 func TrimSuffixByte(b []byte, c byte) []byte {
 	// NOTE: The signature and body are kept trivial so this stays inlinable.
 	if last := len(b) - 1; last >= 0 && b[last] == c {
 		return b[:last]
 	}
 	return b
 }

 // QuoteRune returns a single-quoted representation of the first rune in b.
 // If b starts with a byte that is not valid UTF-8, that byte is rendered as
 // a '\x' hexadecimal escape; otherwise the rune is formatted
 // by strconv.QuoteRune.
 func QuoteRune[Bytes ~[]byte | ~string](b Bytes) string {
 	first, size := utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
 	if first != utf8.RuneError || size != 1 {
 		return strconv.QuoteRune(first)
 	}
 	// A size of 1 with RuneError means b[0] is an invalid encoding.
 	return `'\x` + strconv.FormatUint(uint64(b[0]), 16) + `'`
 }

 // CompareUTF16 lexicographically compares x to y according
 // to the UTF-16 codepoints of the UTF-8 encoded input strings.
 // This implements the ordering specified in RFC 8785, section 3.2.3.
 //
 // Every result is produced by cmp.Compare, so it is -1 if x sorts before y,
 // 0 if the inputs compare equal, and +1 if x sorts after y.
 func CompareUTF16[Bytes ~[]byte | ~string](x, y Bytes) int {
 	// NOTE: This is an optimized, mostly allocation-free implementation
 	// of CompareUTF16Simple in wire_test.go. FuzzCompareUTF16 verifies that the
 	// two implementations agree on the result of comparing any two strings.

 	// isUTF16Self reports whether r lies in U+0000..U+D7FF or U+E000..U+FFFF,
 	// the ranges where a rune is a single UTF-16 code unit of the same value.
 	isUTF16Self := func(r rune) bool {
 		return ('\u0000' <= r && r <= '\uD7FF') || ('\uE000' <= r && r <= '\uFFFF')
 	}

 	for {
 		// Once either input is exhausted, the shorter remainder sorts first.
 		if len(x) == 0 || len(y) == 0 {
 			return cmp.Compare(len(x), len(y))
 		}

 		// ASCII fast-path.
 		// If either leading byte is ASCII, the leading bytes are compared
 		// directly. When they are equal, both are ASCII, so each input
 		// advances by exactly one byte.
 		if x[0] < utf8.RuneSelf || y[0] < utf8.RuneSelf {
 			if x[0] != y[0] {
 				return cmp.Compare(x[0], y[0])
 			}
 			x, y = x[1:], y[1:]
 			continue
 		}

 		// Decode next pair of runes as UTF-8.
 		rx, nx := utf8.DecodeRuneInString(string(truncateMaxUTF8(x)))
 		ry, ny := utf8.DecodeRuneInString(string(truncateMaxUTF8(y)))

 		// When exactly one rune needs a surrogate pair, replace it with the
 		// first value returned by utf16.EncodeRune so that it is compared
 		// against the other rune's single code unit.
 		// When both or neither need a pair, the runes are compared as is.
 		selfx := isUTF16Self(rx)
 		selfy := isUTF16Self(ry)
 		switch {
 		// The x rune is a single UTF-16 codepoint, while
 		// the y rune is a surrogate pair of UTF-16 codepoints.
 		case selfx && !selfy:
 			ry, _ = utf16.EncodeRune(ry)
 		// The y rune is a single UTF-16 codepoint, while
 		// the x rune is a surrogate pair of UTF-16 codepoints.
 		case selfy && !selfx:
 			rx, _ = utf16.EncodeRune(rx)
 		}
 		if rx != ry {
 			return cmp.Compare(rx, ry)
 		}

 		// Check for invalid UTF-8, in which case,
 		// we just perform a byte-for-byte comparison.
 		// Reaching here means rx == ry. If the leading bytes are also equal,
 		// control falls through and both inputs advance as usual.
 		if isInvalidUTF8(rx, nx) || isInvalidUTF8(ry, ny) {
 			if x[0] != y[0] {
 				return cmp.Compare(x[0], y[0])
 			}
 		}
 		// Skip past the bytes consumed by each decoded rune.
 		x, y = x[nx:], y[ny:]
 	}
 }

 // truncateMaxUTF8 returns b limited to its first utf8.UTFMax bytes,
 // so that the result still contains at least the first rune of b.
 //
 // The utf8 package currently lacks generic variants, which complicates
 // generic functions that operate on either []byte or string.
 // As a workaround, we always call the utf8 function operating on strings,
 // but always truncate the input first such that the result is identical.
 //
 // Example usage:
 //
 //	utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
 //
 // Converting a []byte to a string is stack allocated since
 // truncateMaxUTF8 guarantees that the []byte is short.
 func truncateMaxUTF8[Bytes ~[]byte | ~string](b Bytes) Bytes {
 	// TODO(https://go.dev/issue/56948): Remove this function and
 	// instead directly call generic utf8 functions wherever used.
 	if len(b) <= utf8.UTFMax {
 		return b
 	}
 	return b[:utf8.UTFMax]
 }

 // ErrInvalidUTF8 is a sentinel error whose message is "invalid UTF-8".
 // NOTE(review): presumably reported by callers that encounter text which is
 // not valid UTF-8; no use is visible in this file, so confirm against callers.
 //
 // TODO(https://go.dev/issue/70547): Use utf8.ErrInvalid instead.
 var ErrInvalidUTF8 = errors.New("invalid UTF-8")

 // NewInvalidCharacterError returns an error reporting that the first rune
 // of prefix is an invalid character. The message has the form
 // "invalid character <rune> <where>", where the rune is formatted
 // by QuoteRune and where is appended verbatim.
 func NewInvalidCharacterError[Bytes ~[]byte | ~string](prefix Bytes, where string) error {
 	return errors.New("invalid character " + QuoteRune(prefix) + " " + where)
 }

 // NewInvalidEscapeSequenceError returns an error reporting that what is
 // an invalid escape sequence in a JSON string.
 //
 // Inputs longer than 6 bytes (a single \uXXXX escape is 6 bytes long)
 // are labeled "surrogate pair" instead of "escape sequence".
 // The offending text is shown verbatim between backticks, unless it contains
 // a backtick, invalid UTF-8 or U+FFFD, whitespace, or a non-printable rune,
 // in which case it is formatted with strconv.Quote.
 func NewInvalidEscapeSequenceError[Bytes ~[]byte | ~string](what Bytes) error {
 	s := string(what) // convert once since what may be a []byte
 	label := "escape sequence"
 	if len(s) > 6 {
 		label = "surrogate pair"
 	}
 	needEscape := strings.ContainsFunc(s, func(r rune) bool {
 		return r == '`' || r == utf8.RuneError || unicode.IsSpace(r) || !unicode.IsPrint(r)
 	})
 	if needEscape {
 		return errors.New("invalid " + label + " " + strconv.Quote(s) + " in string")
 	}
 	return errors.New("invalid " + label + " `" + s + "` in string")
 }

 // TruncatePointer shortens the JSON pointer s when it is longer than n bytes
 // by replacing its middle with an ellipsis, so that the length roughly
 // does not exceed n. A pointer of at most n bytes is returned unchanged.
 func TruncatePointer(s string, n int) string {
 	if len(s) <= n {
 		return s
 	}
 	// Keep about n/2 bytes from each end: s[:head] and s[tail:].
 	half := n / 2
 	head, tail := half, len(s)-half

 	// Prefer cutting at name boundaries if there are multiple names present:
 	// move head back to the preceding '/' (unless it is the leading one)
 	// and move tail forward to just past the following '/'.
 	if k := strings.LastIndexByte(s[:head], '/'); k > 0 {
 		head = k
 	}
 	if k := strings.IndexByte(s[tail:], '/'); k >= 0 {
 		tail += k + len("/")
 	}

 	// Never cut through the middle of a UTF-8 rune.
 	for head > 0 && isInvalidUTF8(utf8.DecodeLastRuneInString(s[:head])) {
 		head--
 	}
 	for tail < len(s) && isInvalidUTF8(utf8.DecodeRuneInString(s[tail:])) {
 		tail++
 	}

 	// Choose the replacement based on how many separators are elided.
 	elided := s[head:tail]
 	middle := "…/…/…"
 	switch strings.Count(elided, "/") {
 	case 0:
 		middle = "…"
 	case 1:
 		middle = "…/…"
 	}
 	if middle != "…" {
 		// A separator at either edge of the elided span means no name
 		// is cut on that side, so that side needs no ellipsis.
 		if strings.HasPrefix(elided, "/") {
 			middle = strings.TrimPrefix(middle, "…")
 		}
 		if strings.HasSuffix(elided, "/") {
 			middle = strings.TrimSuffix(middle, "…")
 		}
 	}
 	return s[:head] + middle + s[tail:]
 }

 // isInvalidUTF8 reports whether a rune r with size rn, as returned by a
 // utf8 decoding function, denotes an invalid encoding: utf8.RuneError
 // with a size of exactly 1.
 func isInvalidUTF8(r rune, rn int) bool {
 	return rn == 1 && r == utf8.RuneError
 }
	// Copyright 2023 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	//go:build goexperiment.jsonv2

	// Package jsonwire implements stateless functionality for handling JSON text.
	package jsonwire

	import (
	"cmp"
	"errors"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf16"
	"unicode/utf8"
	)

	// TrimSuffixWhitespace trims JSON whitespace from the end of b.
	// JSON whitespace is a space, horizontal tab, carriage return, or line feed.
	// The result is a prefix of b.
	func TrimSuffixWhitespace(b []byte) []byte {
		// NOTE: The arguments and logic are kept simple to keep this inlinable.
		n := len(b) - 1
		for n >= 0 && (b[n] == ' ' || b[n] == '\t' || b[n] == '\r' || b[n] == '\n') {
			n--
		}
		return b[:n+1]
	}

	// TrimSuffixString removes a valid JSON string from the end of b
	// and returns the prefix that precedes it.
	// If b does not end with a valid JSON string, the result is unspecified.
	func TrimSuffixString(b []byte) []byte {
		// NOTE: The signature and body are kept trivial so this stays inlinable.
		n := len(b)
		if n > 0 && b[n-1] == '"' {
			n-- // drop the closing quote
		}
		// Walk backwards until b[:n] ends with a quote that is not directly
		// preceded by a backslash.
		for n >= 2 && (b[n-1] != '"' || b[n-2] == '\\') {
			n--
		}
		if n > 0 && b[n-1] == '"' {
			n-- // drop the opening quote
		}
		return b[:n]
	}

	// HasSuffixByte reports whether the last byte of b is c.
	// It reports false when b is empty.
	func HasSuffixByte(b []byte, c byte) bool {
		// NOTE: The signature and body are kept trivial so this stays inlinable.
		if len(b) == 0 {
			return false
		}
		return b[len(b)-1] == c
	}

	// TrimSuffixByte returns b without its last byte if that byte is c.
	// At most one byte is removed; otherwise b is returned unchanged.
	func TrimSuffixByte(b []byte, c byte) []byte {
		// NOTE: The signature and body are kept trivial so this stays inlinable.
		if last := len(b) - 1; last >= 0 && b[last] == c {
			return b[:last]
		}
		return b
	}

	// QuoteRune quotes the first rune in the input.
	func QuoteRune[Bytes ~[]byte \| ~string](b Bytes) string {
	r, n := utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
	if r == utf8.RuneError && n == 1 {
	return `'\x` + strconv.FormatUint(uint64(b[0]), 16) + `'`
	}
	return strconv.QuoteRune(r)
	}

	// CompareUTF16 lexicographically compares x to y according
	// to the UTF-16 codepoints of the UTF-8 encoded input strings.
	// This implements the ordering specified in RFC 8785, section 3.2.3.
	func CompareUTF16[Bytes ~[]byte \| ~string](x, y Bytes) int {
	// NOTE: This is an optimized, mostly allocation-free implementation
	// of CompareUTF16Simple in wire_test.go. FuzzCompareUTF16 verifies that the
	// two implementations agree on the result of comparing any two strings.
	isUTF16Self := func(r rune) bool {
	return ('\u0000' <= r && r <= '\uD7FF') \|\| ('\uE000' <= r && r <= '\uFFFF')
	}

	for {
	if len(x) == 0 \|\| len(y) == 0 {
	return cmp.Compare(len(x), len(y))
	}

	// ASCII fast-path.
	if x[0] < utf8.RuneSelf \|\| y[0] < utf8.RuneSelf {
	if x[0] != y[0] {
	return cmp.Compare(x[0], y[0])
	}
	x, y = x[1:], y[1:]
	continue
	}

	// Decode next pair of runes as UTF-8.
	rx, nx := utf8.DecodeRuneInString(string(truncateMaxUTF8(x)))
	ry, ny := utf8.DecodeRuneInString(string(truncateMaxUTF8(y)))

	selfx := isUTF16Self(rx)
	selfy := isUTF16Self(ry)
	switch {
	// The x rune is a single UTF-16 codepoint, while
	// the y rune is a surrogate pair of UTF-16 codepoints.
	case selfx && !selfy:
	ry, _ = utf16.EncodeRune(ry)
	// The y rune is a single UTF-16 codepoint, while
	// the x rune is a surrogate pair of UTF-16 codepoints.
	case selfy && !selfx:
	rx, _ = utf16.EncodeRune(rx)
	}
	if rx != ry {
	return cmp.Compare(rx, ry)
	}

	// Check for invalid UTF-8, in which case,
	// we just perform a byte-for-byte comparison.
	if isInvalidUTF8(rx, nx) \|\| isInvalidUTF8(ry, ny) {
	if x[0] != y[0] {
	return cmp.Compare(x[0], y[0])
	}
	}
	x, y = x[nx:], y[ny:]
	}
	}

	// truncateMaxUTF8 truncates b to at most utf8.UTFMax bytes,
	// such that it still contains at least the first rune of b.
	//
	// The utf8 package currently lacks generic variants, which complicates
	// generic functions that operate on either []byte or string.
	// As a hack, we always call the utf8 function operating on strings,
	// but always truncate the input such that the result is identical.
	//
	// Example usage:
	//
	//	utf8.DecodeRuneInString(string(truncateMaxUTF8(b)))
	//
	// Converting a []byte to a string is stack allocated since
	// truncateMaxUTF8 guarantees that the []byte is short.
	func truncateMaxUTF8[Bytes ~[]byte | ~string](b Bytes) Bytes {
		// TODO(https://go.dev/issue/56948): Remove this function and
		// instead directly call generic utf8 functions wherever used.
		if len(b) > utf8.UTFMax {
			return b[:utf8.UTFMax]
		}
		return b
	}

	// ErrInvalidUTF8 is a sentinel error whose message is "invalid UTF-8".
	// NOTE(review): presumably reported by callers that encounter text which is
	// not valid UTF-8; no use is visible in this file, so confirm against callers.
	//
	// TODO(https://go.dev/issue/70547): Use utf8.ErrInvalid instead.
	var ErrInvalidUTF8 = errors.New("invalid UTF-8")

	func NewInvalidCharacterError[Bytes ~[]byte \| ~string](prefix Bytes, where string) error {
	what := QuoteRune(prefix)
	return errors.New("invalid character " + what + " " + where)
	}

	// NewInvalidEscapeSequenceError returns an error reporting that what is
	// an invalid escape sequence in a JSON string.
	//
	// Inputs longer than 6 bytes (a single \uXXXX escape is 6 bytes long)
	// are labeled "surrogate pair" instead of "escape sequence".
	// The offending text is shown verbatim between backticks, unless it contains
	// a backtick, invalid UTF-8 or U+FFFD, whitespace, or a non-printable rune,
	// in which case it is formatted with strconv.Quote.
	func NewInvalidEscapeSequenceError[Bytes ~[]byte | ~string](what Bytes) error {
		s := string(what) // convert once since what may be a []byte
		label := "escape sequence"
		if len(s) > 6 {
			label = "surrogate pair"
		}
		needEscape := strings.ContainsFunc(s, func(r rune) bool {
			return r == '`' || r == utf8.RuneError || unicode.IsSpace(r) || !unicode.IsPrint(r)
		})
		if needEscape {
			return errors.New("invalid " + label + " " + strconv.Quote(s) + " in string")
		}
		return errors.New("invalid " + label + " `" + s + "` in string")
	}

	// TruncatePointer shortens the JSON pointer s when it is longer than n bytes
	// by replacing its middle with an ellipsis, so that the length roughly
	// does not exceed n. A pointer of at most n bytes is returned unchanged.
	func TruncatePointer(s string, n int) string {
		if len(s) <= n {
			return s
		}
		// Keep about n/2 bytes from each end: s[:head] and s[tail:].
		half := n / 2
		head, tail := half, len(s)-half

		// Prefer cutting at name boundaries if there are multiple names present:
		// move head back to the preceding '/' (unless it is the leading one)
		// and move tail forward to just past the following '/'.
		if k := strings.LastIndexByte(s[:head], '/'); k > 0 {
			head = k
		}
		if k := strings.IndexByte(s[tail:], '/'); k >= 0 {
			tail += k + len("/")
		}

		// Never cut through the middle of a UTF-8 rune.
		for head > 0 && isInvalidUTF8(utf8.DecodeLastRuneInString(s[:head])) {
			head--
		}
		for tail < len(s) && isInvalidUTF8(utf8.DecodeRuneInString(s[tail:])) {
			tail++
		}

		// Choose the replacement based on how many separators are elided.
		elided := s[head:tail]
		middle := "…/…/…"
		switch strings.Count(elided, "/") {
		case 0:
			middle = "…"
		case 1:
			middle = "…/…"
		}
		if middle != "…" {
			// A separator at either edge of the elided span means no name
			// is cut on that side, so that side needs no ellipsis.
			if strings.HasPrefix(elided, "/") {
				middle = strings.TrimPrefix(middle, "…")
			}
			if strings.HasSuffix(elided, "/") {
				middle = strings.TrimSuffix(middle, "…")
			}
		}
		return s[:head] + middle + s[tail:]
	}

	// isInvalidUTF8 reports whether a rune r with size rn, as returned by a
	// utf8 decoding function, denotes an invalid encoding: utf8.RuneError
	// with a size of exactly 1.
	func isInvalidUTF8(r rune, rn int) bool {
		return rn == 1 && r == utf8.RuneError
	}