| // Copyright 2023 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| //go:build goexperiment.jsonv2 |
| |
| package jsonwire |
| |
| import ( |
| "io" |
| "math" |
| "slices" |
| "strconv" |
| "unicode/utf16" |
| "unicode/utf8" |
| ) |
| |
// ValueFlags is a bit set of properties noted while consuming a JSON value.
// The zero value has no flag set.
type ValueFlags uint

const (
	// stringNonVerbatim marks a string that cannot be naively treated
	// as valid UTF-8.
	stringNonVerbatim ValueFlags = 1 << iota
	// stringNonCanonical marks a string that is not formatted according
	// to RFC 8785, section 3.2.2.2.
	stringNonCanonical

	// TODO: Track whether a number is a non-integer?
)

// Join sets in f every flag that is set in f2.
func (f *ValueFlags) Join(f2 ValueFlags) {
	*f = *f | f2
}

// IsVerbatim reports whether the stringNonVerbatim flag is unset.
func (f ValueFlags) IsVerbatim() bool {
	return f&stringNonVerbatim != stringNonVerbatim
}

// IsCanonical reports whether the stringNonCanonical flag is unset.
func (f ValueFlags) IsCanonical() bool {
	return f&stringNonCanonical != stringNonCanonical
}
| |
// ConsumeWhitespace reports the length of the leading run of JSON whitespace
// (space, tab, carriage return, or newline) in b per RFC 7159, section 2.
func ConsumeWhitespace(b []byte) (n int) {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	for ; n < len(b); n++ {
		if c := b[n]; c != ' ' && c != '\t' && c != '\r' && c != '\n' {
			break
		}
	}
	return n
}
| |
// ConsumeNull consumes the next JSON null literal per RFC 7159, section 3.
// It returns the length of the literal if b starts with it, and 0 otherwise,
// in which case ConsumeLiteral should be used.
func ConsumeNull(b []byte) int {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	const literal = "null"
	if len(b) < len(literal) || string(b[:len(literal)]) != literal {
		return 0
	}
	return len(literal)
}
| |
// ConsumeFalse consumes the next JSON false literal per RFC 7159, section 3.
// It returns the length of the literal if b starts with it, and 0 otherwise,
// in which case ConsumeLiteral should be used.
func ConsumeFalse(b []byte) int {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	const literal = "false"
	if len(b) < len(literal) || string(b[:len(literal)]) != literal {
		return 0
	}
	return len(literal)
}
| |
// ConsumeTrue consumes the next JSON true literal per RFC 7159, section 3.
// It returns the length of the literal if b starts with it, and 0 otherwise,
// in which case ConsumeLiteral should be used.
func ConsumeTrue(b []byte) int {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	const literal = "true"
	if len(b) < len(literal) || string(b[:len(literal)]) != literal {
		return 0
	}
	return len(literal)
}
| |
| // ConsumeLiteral consumes the next JSON literal per RFC 7159, section 3. |
| // If the input appears truncated, it returns io.ErrUnexpectedEOF. |
| func ConsumeLiteral(b []byte, lit string) (n int, err error) { |
| for i := 0; i < len(b) && i < len(lit); i++ { |
| if b[i] != lit[i] { |
| return i, NewInvalidCharacterError(b[i:], "in literal "+lit+" (expecting "+strconv.QuoteRune(rune(lit[i]))+")") |
| } |
| } |
| if len(b) < len(lit) { |
| return len(b), io.ErrUnexpectedEOF |
| } |
| return len(lit), nil |
| } |
| |
| // ConsumeSimpleString consumes the next JSON string per RFC 7159, section 7 |
| // but is limited to the grammar for an ASCII string without escape sequences. |
| // It returns 0 if it is invalid or more complicated than a simple string, |
| // in which case consumeString should be called. |
| // |
| // It rejects '<', '>', and '&' for compatibility reasons since these were |
| // always escaped in the v1 implementation. Thus, if this function reports |
| // non-zero then we know that the string would be encoded the same way |
| // under both v1 or v2 escape semantics. |
| func ConsumeSimpleString(b []byte) (n int) { |
| // NOTE: The arguments and logic are kept simple to keep this inlinable. |
| if len(b) > 0 && b[0] == '"' { |
| n++ |
| for len(b) > n && b[n] < utf8.RuneSelf && escapeASCII[b[n]] == 0 { |
| n++ |
| } |
| if uint(len(b)) > uint(n) && b[n] == '"' { |
| n++ |
| return n |
| } |
| } |
| return 0 |
| } |
| |
| // ConsumeString consumes the next JSON string per RFC 7159, section 7. |
| // If validateUTF8 is false, then this allows the presence of invalid UTF-8 |
| // characters within the string itself. |
| // It reports the number of bytes consumed and whether an error was encountered. |
| // If the input appears truncated, it returns io.ErrUnexpectedEOF. |
| func ConsumeString(flags *ValueFlags, b []byte, validateUTF8 bool) (n int, err error) { |
| return ConsumeStringResumable(flags, b, 0, validateUTF8) |
| } |
| |
| // ConsumeStringResumable is identical to consumeString but supports resuming |
| // from a previous call that returned io.ErrUnexpectedEOF. |
| func ConsumeStringResumable(flags *ValueFlags, b []byte, resumeOffset int, validateUTF8 bool) (n int, err error) { |
| // Consume the leading double quote. |
| switch { |
| case resumeOffset > 0: |
| n = resumeOffset // already handled the leading quote |
| case uint(len(b)) == 0: |
| return n, io.ErrUnexpectedEOF |
| case b[0] == '"': |
| n++ |
| default: |
| return n, NewInvalidCharacterError(b[n:], `at start of string (expecting '"')`) |
| } |
| |
| // Consume every character in the string. |
| for uint(len(b)) > uint(n) { |
| // Optimize for long sequences of unescaped characters. |
| noEscape := func(c byte) bool { |
| return c < utf8.RuneSelf && ' ' <= c && c != '\\' && c != '"' |
| } |
| for uint(len(b)) > uint(n) && noEscape(b[n]) { |
| n++ |
| } |
| if uint(len(b)) <= uint(n) { |
| return n, io.ErrUnexpectedEOF |
| } |
| |
| // Check for terminating double quote. |
| if b[n] == '"' { |
| n++ |
| return n, nil |
| } |
| |
| switch r, rn := utf8.DecodeRune(b[n:]); { |
| // Handle UTF-8 encoded byte sequence. |
| // Due to specialized handling of ASCII above, we know that |
| // all normal sequences at this point must be 2 bytes or larger. |
| case rn > 1: |
| n += rn |
| // Handle escape sequence. |
| case r == '\\': |
| flags.Join(stringNonVerbatim) |
| resumeOffset = n |
| if uint(len(b)) < uint(n+2) { |
| return resumeOffset, io.ErrUnexpectedEOF |
| } |
| switch r := b[n+1]; r { |
| case '/': |
| // Forward slash is the only character with 3 representations. |
| // Per RFC 8785, section 3.2.2.2., this must not be escaped. |
| flags.Join(stringNonCanonical) |
| n += 2 |
| case '"', '\\', 'b', 'f', 'n', 'r', 't': |
| n += 2 |
| case 'u': |
| if uint(len(b)) < uint(n+6) { |
| if hasEscapedUTF16Prefix(b[n:], false) { |
| return resumeOffset, io.ErrUnexpectedEOF |
| } |
| flags.Join(stringNonCanonical) |
| return n, NewInvalidEscapeSequenceError(b[n:]) |
| } |
| v1, ok := parseHexUint16(b[n+2 : n+6]) |
| if !ok { |
| flags.Join(stringNonCanonical) |
| return n, NewInvalidEscapeSequenceError(b[n : n+6]) |
| } |
| // Only certain control characters can use the \uFFFF notation |
| // for canonical formatting (per RFC 8785, section 3.2.2.2.). |
| switch v1 { |
| // \uFFFF notation not permitted for these characters. |
| case '\b', '\f', '\n', '\r', '\t': |
| flags.Join(stringNonCanonical) |
| default: |
| // \uFFFF notation only permitted for control characters. |
| if v1 >= ' ' { |
| flags.Join(stringNonCanonical) |
| } else { |
| // \uFFFF notation must be lower case. |
| for _, c := range b[n+2 : n+6] { |
| if 'A' <= c && c <= 'F' { |
| flags.Join(stringNonCanonical) |
| } |
| } |
| } |
| } |
| n += 6 |
| |
| r := rune(v1) |
| if validateUTF8 && utf16.IsSurrogate(r) { |
| if uint(len(b)) < uint(n+6) { |
| if hasEscapedUTF16Prefix(b[n:], true) { |
| return resumeOffset, io.ErrUnexpectedEOF |
| } |
| flags.Join(stringNonCanonical) |
| return n - 6, NewInvalidEscapeSequenceError(b[n-6:]) |
| } else if v2, ok := parseHexUint16(b[n+2 : n+6]); b[n] != '\\' || b[n+1] != 'u' || !ok { |
| flags.Join(stringNonCanonical) |
| return n - 6, NewInvalidEscapeSequenceError(b[n-6 : n+6]) |
| } else if r = utf16.DecodeRune(rune(v1), rune(v2)); r == utf8.RuneError { |
| flags.Join(stringNonCanonical) |
| return n - 6, NewInvalidEscapeSequenceError(b[n-6 : n+6]) |
| } else { |
| n += 6 |
| } |
| } |
| default: |
| flags.Join(stringNonCanonical) |
| return n, NewInvalidEscapeSequenceError(b[n : n+2]) |
| } |
| // Handle invalid UTF-8. |
| case r == utf8.RuneError: |
| if !utf8.FullRune(b[n:]) { |
| return n, io.ErrUnexpectedEOF |
| } |
| flags.Join(stringNonVerbatim | stringNonCanonical) |
| if validateUTF8 { |
| return n, ErrInvalidUTF8 |
| } |
| n++ |
| // Handle invalid control characters. |
| case r < ' ': |
| flags.Join(stringNonVerbatim | stringNonCanonical) |
| return n, NewInvalidCharacterError(b[n:], "in string (expecting non-control character)") |
| default: |
| panic("BUG: unhandled character " + QuoteRune(b[n:])) |
| } |
| } |
| return n, io.ErrUnexpectedEOF |
| } |
| |
| // AppendUnquote appends the unescaped form of a JSON string in src to dst. |
| // Any invalid UTF-8 within the string will be replaced with utf8.RuneError, |
| // but the error will be specified as having encountered such an error. |
| // The input must be an entire JSON string with no surrounding whitespace. |
| func AppendUnquote[Bytes ~[]byte | ~string](dst []byte, src Bytes) (v []byte, err error) { |
| dst = slices.Grow(dst, len(src)) |
| |
| // Consume the leading double quote. |
| var i, n int |
| switch { |
| case uint(len(src)) == 0: |
| return dst, io.ErrUnexpectedEOF |
| case src[0] == '"': |
| i, n = 1, 1 |
| default: |
| return dst, NewInvalidCharacterError(src, `at start of string (expecting '"')`) |
| } |
| |
| // Consume every character in the string. |
| for uint(len(src)) > uint(n) { |
| // Optimize for long sequences of unescaped characters. |
| noEscape := func(c byte) bool { |
| return c < utf8.RuneSelf && ' ' <= c && c != '\\' && c != '"' |
| } |
| for uint(len(src)) > uint(n) && noEscape(src[n]) { |
| n++ |
| } |
| if uint(len(src)) <= uint(n) { |
| dst = append(dst, src[i:n]...) |
| return dst, io.ErrUnexpectedEOF |
| } |
| |
| // Check for terminating double quote. |
| if src[n] == '"' { |
| dst = append(dst, src[i:n]...) |
| n++ |
| if n < len(src) { |
| err = NewInvalidCharacterError(src[n:], "after string value") |
| } |
| return dst, err |
| } |
| |
| switch r, rn := utf8.DecodeRuneInString(string(truncateMaxUTF8(src[n:]))); { |
| // Handle UTF-8 encoded byte sequence. |
| // Due to specialized handling of ASCII above, we know that |
| // all normal sequences at this point must be 2 bytes or larger. |
| case rn > 1: |
| n += rn |
| // Handle escape sequence. |
| case r == '\\': |
| dst = append(dst, src[i:n]...) |
| |
| // Handle escape sequence. |
| if uint(len(src)) < uint(n+2) { |
| return dst, io.ErrUnexpectedEOF |
| } |
| switch r := src[n+1]; r { |
| case '"', '\\', '/': |
| dst = append(dst, r) |
| n += 2 |
| case 'b': |
| dst = append(dst, '\b') |
| n += 2 |
| case 'f': |
| dst = append(dst, '\f') |
| n += 2 |
| case 'n': |
| dst = append(dst, '\n') |
| n += 2 |
| case 'r': |
| dst = append(dst, '\r') |
| n += 2 |
| case 't': |
| dst = append(dst, '\t') |
| n += 2 |
| case 'u': |
| if uint(len(src)) < uint(n+6) { |
| if hasEscapedUTF16Prefix(src[n:], false) { |
| return dst, io.ErrUnexpectedEOF |
| } |
| return dst, NewInvalidEscapeSequenceError(src[n:]) |
| } |
| v1, ok := parseHexUint16(src[n+2 : n+6]) |
| if !ok { |
| return dst, NewInvalidEscapeSequenceError(src[n : n+6]) |
| } |
| n += 6 |
| |
| // Check whether this is a surrogate half. |
| r := rune(v1) |
| if utf16.IsSurrogate(r) { |
| r = utf8.RuneError // assume failure unless the following succeeds |
| if uint(len(src)) < uint(n+6) { |
| if hasEscapedUTF16Prefix(src[n:], true) { |
| return utf8.AppendRune(dst, r), io.ErrUnexpectedEOF |
| } |
| err = NewInvalidEscapeSequenceError(src[n-6:]) |
| } else if v2, ok := parseHexUint16(src[n+2 : n+6]); src[n] != '\\' || src[n+1] != 'u' || !ok { |
| err = NewInvalidEscapeSequenceError(src[n-6 : n+6]) |
| } else if r = utf16.DecodeRune(rune(v1), rune(v2)); r == utf8.RuneError { |
| err = NewInvalidEscapeSequenceError(src[n-6 : n+6]) |
| } else { |
| n += 6 |
| } |
| } |
| |
| dst = utf8.AppendRune(dst, r) |
| default: |
| return dst, NewInvalidEscapeSequenceError(src[n : n+2]) |
| } |
| i = n |
| // Handle invalid UTF-8. |
| case r == utf8.RuneError: |
| dst = append(dst, src[i:n]...) |
| if !utf8.FullRuneInString(string(truncateMaxUTF8(src[n:]))) { |
| return dst, io.ErrUnexpectedEOF |
| } |
| // NOTE: An unescaped string may be longer than the escaped string |
| // because invalid UTF-8 bytes are being replaced. |
| dst = append(dst, "\uFFFD"...) |
| n += rn |
| i = n |
| err = ErrInvalidUTF8 |
| // Handle invalid control characters. |
| case r < ' ': |
| dst = append(dst, src[i:n]...) |
| return dst, NewInvalidCharacterError(src[n:], "in string (expecting non-control character)") |
| default: |
| panic("BUG: unhandled character " + QuoteRune(src[n:])) |
| } |
| } |
| dst = append(dst, src[i:n]...) |
| return dst, io.ErrUnexpectedEOF |
| } |
| |
// hasEscapedUTF16Prefix reports whether b is possibly
// the truncated prefix of a \uFFFF escape sequence.
// If lowerSurrogateHalf is set, the first two hex digits must additionally
// be consistent with a value within ['\uDC00':'\uDFFF'].
// Only the first six bytes of b are examined; an empty b reports true.
func hasEscapedUTF16Prefix[Bytes ~[]byte | ~string](b Bytes, lowerSurrogateHalf bool) bool {
	isHex := func(c byte) bool {
		return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
	}
	for i := 0; i < len(b) && i < 6; i++ {
		c := b[i]
		var ok bool
		switch i {
		case 0:
			ok = c == '\\'
		case 1:
			ok = c == 'u'
		case 2:
			if lowerSurrogateHalf {
				ok = c == 'd' || c == 'D' // within ['\uDC00':'\uDFFF']
			} else {
				ok = isHex(c)
			}
		case 3:
			if lowerSurrogateHalf {
				ok = ('c' <= c && c <= 'f') || ('C' <= c && c <= 'F') // within ['\uDC00':'\uDFFF']
			} else {
				ok = isHex(c)
			}
		default:
			ok = isHex(c)
		}
		if !ok {
			return false
		}
	}
	return true
}
| |
| // UnquoteMayCopy returns the unescaped form of b. |
| // If there are no escaped characters, the output is simply a subslice of |
| // the input with the surrounding quotes removed. |
| // Otherwise, a new buffer is allocated for the output. |
| // It assumes the input is valid. |
| func UnquoteMayCopy(b []byte, isVerbatim bool) []byte { |
| // NOTE: The arguments and logic are kept simple to keep this inlinable. |
| if isVerbatim { |
| return b[len(`"`) : len(b)-len(`"`)] |
| } |
| b, _ = AppendUnquote(nil, b) |
| return b |
| } |
| |
// ConsumeSimpleNumber consumes the next JSON number per RFC 7159, section 6
// but is limited to the grammar for an integer without a sign:
// either a single '0' or a digit run that starts with '1' through '9'.
// It returns 0 if the input is invalid or more complicated than that
// (for example, when a fraction or exponent follows),
// in which case ConsumeNumber should be called.
func ConsumeSimpleNumber(b []byte) (n int) {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	if len(b) == 0 {
		return 0
	}
	switch first := b[0]; {
	case first == '0':
		n = 1
	case '1' <= first && first <= '9':
		n = 1
		for n < len(b) && '0' <= b[n] && b[n] <= '9' {
			n++
		}
	default:
		return 0
	}
	// A trailing fraction or exponent means this is not a simple number.
	if n < len(b) && (b[n] == '.' || b[n] == 'e' || b[n] == 'E') {
		return 0
	}
	return n
}
| |
// ConsumeNumberState identifies a position within the JSON number grammar.
// ConsumeNumberResumable reports one so that a later call can pick up
// parsing from that position.
type ConsumeNumberState uint

// The order of these constants matters: ConsumeNumberResumable increments a
// "withinX" state to obtain the "beforeY" state of the component after X.
const (
	consumeNumberInit      = ConsumeNumberState(iota) // nothing consumed yet
	beforeIntegerDigits                               // integer digits are expected next
	withinIntegerDigits                               // inside the integer digits
	beforeFractionalDigits                            // a fractional component may follow
	withinFractionalDigits                            // inside the fractional digits
	beforeExponentDigits                              // an exponent component may follow
	withinExponentDigits                              // inside the exponent digits
)
| |
| // ConsumeNumber consumes the next JSON number per RFC 7159, section 6. |
| // It reports the number of bytes consumed and whether an error was encountered. |
| // If the input appears truncated, it returns io.ErrUnexpectedEOF. |
| // |
| // Note that JSON numbers are not self-terminating. |
| // If the entire input is consumed, then the caller needs to consider whether |
| // there may be subsequent unread data that may still be part of this number. |
| func ConsumeNumber(b []byte) (n int, err error) { |
| n, _, err = ConsumeNumberResumable(b, 0, consumeNumberInit) |
| return n, err |
| } |
| |
| // ConsumeNumberResumable is identical to consumeNumber but supports resuming |
| // from a previous call that returned io.ErrUnexpectedEOF. |
| func ConsumeNumberResumable(b []byte, resumeOffset int, state ConsumeNumberState) (n int, _ ConsumeNumberState, err error) { |
| // Jump to the right state when resuming from a partial consumption. |
| n = resumeOffset |
| if state > consumeNumberInit { |
| switch state { |
| case withinIntegerDigits, withinFractionalDigits, withinExponentDigits: |
| // Consume leading digits. |
| for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') { |
| n++ |
| } |
| if uint(len(b)) <= uint(n) { |
| return n, state, nil // still within the same state |
| } |
| state++ // switches "withinX" to "beforeY" where Y is the state after X |
| } |
| switch state { |
| case beforeIntegerDigits: |
| goto beforeInteger |
| case beforeFractionalDigits: |
| goto beforeFractional |
| case beforeExponentDigits: |
| goto beforeExponent |
| default: |
| return n, state, nil |
| } |
| } |
| |
| // Consume required integer component (with optional minus sign). |
| beforeInteger: |
| resumeOffset = n |
| if uint(len(b)) > 0 && b[0] == '-' { |
| n++ |
| } |
| switch { |
| case uint(len(b)) <= uint(n): |
| return resumeOffset, beforeIntegerDigits, io.ErrUnexpectedEOF |
| case b[n] == '0': |
| n++ |
| state = beforeFractionalDigits |
| case '1' <= b[n] && b[n] <= '9': |
| n++ |
| for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') { |
| n++ |
| } |
| state = withinIntegerDigits |
| default: |
| return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)") |
| } |
| |
| // Consume optional fractional component. |
| beforeFractional: |
| if uint(len(b)) > uint(n) && b[n] == '.' { |
| resumeOffset = n |
| n++ |
| switch { |
| case uint(len(b)) <= uint(n): |
| return resumeOffset, beforeFractionalDigits, io.ErrUnexpectedEOF |
| case '0' <= b[n] && b[n] <= '9': |
| n++ |
| default: |
| return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)") |
| } |
| for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') { |
| n++ |
| } |
| state = withinFractionalDigits |
| } |
| |
| // Consume optional exponent component. |
| beforeExponent: |
| if uint(len(b)) > uint(n) && (b[n] == 'e' || b[n] == 'E') { |
| resumeOffset = n |
| n++ |
| if uint(len(b)) > uint(n) && (b[n] == '-' || b[n] == '+') { |
| n++ |
| } |
| switch { |
| case uint(len(b)) <= uint(n): |
| return resumeOffset, beforeExponentDigits, io.ErrUnexpectedEOF |
| case '0' <= b[n] && b[n] <= '9': |
| n++ |
| default: |
| return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)") |
| } |
| for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') { |
| n++ |
| } |
| state = withinExponentDigits |
| } |
| |
| return n, state, nil |
| } |
| |
// parseHexUint16 is similar to strconv.ParseUint,
// but operates directly on []byte (or string) and is optimized for base-16.
// The input must be exactly 4 hexadecimal digits of either case;
// otherwise it reports (0, false).
// See https://go.dev/issue/42429.
func parseHexUint16[Bytes ~[]byte | ~string](b Bytes) (v uint16, ok bool) {
	if len(b) != 4 {
		return 0, false
	}
	for i := 0; i < 4; i++ {
		var nibble byte
		switch c := b[i]; {
		case c >= '0' && c <= '9':
			nibble = c - '0'
		case c >= 'a' && c <= 'f':
			nibble = c - 'a' + 10
		case c >= 'A' && c <= 'F':
			nibble = c - 'A' + 10
		default:
			return 0, false
		}
		// Each digit contributes the next 4 bits.
		v = v<<4 | uint16(nibble)
	}
	return v, true
}
| |
// ParseUint parses b as a decimal unsigned integer according to
// a strict subset of the JSON number grammar, returning the value if valid.
// Only digits are accepted, and a leading zero is only valid for "0" itself.
// It returns (0, false) if there is a syntax error and
// returns (math.MaxUint64, false) if there is an overflow.
func ParseUint(b []byte) (v uint64, ok bool) {
	const unsafeWidth = 20 // len(fmt.Sprint(uint64(math.MaxUint64)))

	// Accumulate the leading digits; v may wrap around for long inputs,
	// which is detected below.
	n := 0
	for n < len(b) && b[n] >= '0' && b[n] <= '9' {
		v = v*10 + uint64(b[n]-'0')
		n++
	}

	// Reject empty input, trailing non-digits, and superfluous leading zeros.
	if n == 0 || n != len(b) {
		return 0, false
	}
	if b[0] == '0' && n != 1 {
		return 0, false
	}

	// A value of unsafeWidth digits only fits if it starts with '1' and
	// did not wrap around to below 1e19; anything longer never fits.
	if n >= unsafeWidth {
		if n > unsafeWidth || b[0] != '1' || v < 1e19 {
			return math.MaxUint64, false
		}
	}
	return v, true
}
| |
// ParseFloat parses a floating point number according to the Go float grammar.
// Note that the JSON number grammar is a strict subset.
// The bits argument is passed to strconv.ParseFloat as the bit size, and
// ok reports whether strconv.ParseFloat succeeded without error.
//
// If the number overflows the finite representation of a float,
// then we return MaxFloat (with the sign of the input, for a bit size of
// 32 or 64) since any finite value will always be infinitely
// more accurate at representing another finite value than an infinite value.
func ParseFloat(b []byte, bits int) (v float64, ok bool) {
	fv, err := strconv.ParseFloat(string(b), bits)
	switch {
	case !math.IsInf(fv, 0):
		// Not an infinity; nothing to clamp.
	case bits == 32:
		fv = math.Copysign(math.MaxFloat32, fv)
	case bits == 64:
		fv = math.Copysign(math.MaxFloat64, fv)
	}
	return fv, err == nil
}