blob: c0513a8b241339b67dfa7c6d0cf7712a65bbc813 [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package text
import (
"bytes"
"io"
"regexp"
"strconv"
"unicode/utf8"
"google.golang.org/protobuf/internal/errors"
"google.golang.org/protobuf/reflect/protoreflect"
)
// syntaxError wraps an error so that Unmarshal can recognize it by type and
// annotate it with the line and column at which parsing stopped.
type syntaxError struct{ error }
func newSyntaxError(f string, x ...interface{}) error {
return syntaxError{errors.New(f, x...)}
}
// Unmarshal parses b as the proto text format.
// It returns a Value, which is always of the Message type.
//
// A syntaxError raised while parsing is rewritten to include the 1-based line
// and column at which the decoder stopped. Any other error is returned as is.
// Input left over after the top-level message is reported as an error.
func Unmarshal(b []byte) (Value, error) {
	d := decoder{in: b}
	d.consume(0) // skip leading whitespace and comments

	msg, err := d.unmarshalMessage(false)
	if err == nil {
		if rest := len(d.in); rest > 0 {
			return Value{}, errors.New("%d bytes of unconsumed input", rest)
		}
		return msg, nil
	}

	serr, isSyntax := err.(syntaxError)
	if !isSyntax {
		return Value{}, err
	}

	// Everything in front of the decoder's current position was consumed
	// before the error was raised; locate that position as line:column.
	consumed := b[:len(b)-len(d.in)]
	line := 1 + bytes.Count(consumed, []byte("\n"))
	lastLine := consumed
	if nl := bytes.LastIndexByte(consumed, '\n'); nl >= 0 {
		lastLine = consumed[nl+1:]
	}
	// Columns count runes, so a character built from several runes spans
	// several columns.
	column := 1 + utf8.RuneCount(lastLine)
	return Value{}, errors.New("syntax error (line %d:%d): %v", line, column, serr.error)
}
// decoder carries the parsing state used by Unmarshal and its helper methods.
type decoder struct {
// in is the input that has not been consumed yet; consume advances it.
in []byte
}
// unmarshalList parses a list of the form `[` value { `,` value } `]`, or the
// empty list `[]`. A trailing comma before `]` is not accepted because a value
// is required after every comma.
//
// The element slice is passed to rawValueOf together with the raw input that
// the list spanned, which includes any whitespace or comments consumed after
// the closing bracket.
func (p *decoder) unmarshalList() (Value, error) {
	start := p.in
	if err := p.consumeChar('[', "at start of list"); err != nil {
		return Value{}, err
	}

	var elems []Value
	// An immediate ']' means the list is empty; otherwise keep reading values
	// for as long as each one is followed by a comma.
	more := len(p.in) > 0 && p.in[0] != ']'
	for more && len(p.in) > 0 {
		elem, err := p.unmarshalValue()
		if err != nil {
			return Value{}, err
		}
		elems = append(elems, elem)
		more = p.tryConsumeChar(',')
	}

	if err := p.consumeChar(']', "at end of list"); err != nil {
		return Value{}, err
	}
	n := len(start) - len(p.in)
	return rawValueOf(elems, start[:n:n]), nil
}
// unmarshalMessage parses a sequence of key/value fields.
//
// When checkDelims is true the fields must be enclosed in `{`...`}` or
// `<`...`>`; the closing delimiter has to pair with the opening one. When it
// is false no delimiters are consumed, and parsing stops at the end of input
// or in front of a `}` or `>` byte.
//
// A ':' between key and value is mandatory unless the value begins with '{'
// or '<'. Each field may be followed by a single optional ';' or ','.
// The collected pairs are passed to rawValueOf together with the raw input
// the message spanned.
func (p *decoder) unmarshalMessage(checkDelims bool) (Value, error) {
	start := p.in

	opening, closing := byte('{'), byte('}')
	if len(p.in) > 0 && p.in[0] == '<' {
		opening, closing = '<', '>'
	}
	if checkDelims {
		if err := p.consumeChar(opening, "at start of message"); err != nil {
			return Value{}, err
		}
	}

	var items [][2]Value
	for len(p.in) > 0 && p.in[0] != '}' && p.in[0] != '>' {
		key, err := p.unmarshalKey()
		if err != nil {
			return Value{}, err
		}

		// The colon may only be left out in front of a message value.
		if !p.tryConsumeChar(':') && len(p.in) > 0 {
			if c := p.in[0]; c != '{' && c != '<' {
				return Value{}, newSyntaxError("expected ':' after message key")
			}
		}

		val, err := p.unmarshalValue()
		if err != nil {
			return Value{}, err
		}

		// At most one field separator is consumed, and it is always optional.
		if !p.tryConsumeChar(';') {
			p.tryConsumeChar(',')
		}
		items = append(items, [2]Value{key, val})
	}

	if checkDelims {
		if err := p.consumeChar(closing, "at end of message"); err != nil {
			return Value{}, err
		}
	}
	n := len(start) - len(p.in)
	return rawValueOf(items, start[:n:n]), nil
}
// unmarshalKey parses a field key. A key is either a bracketed name
// (`[` ... `]`, holding a URL-like identifier or a quoted string), a plain
// identifier, or a field number.
func (p *decoder) unmarshalKey() (v Value, err error) {
	if !p.tryConsumeChar('[') {
		// Unbracketed key: try an identifier first, then a field number.
		// If both fail, the field-number error is the one reported.
		if v, err = p.unmarshalName(); err == nil {
			return v, nil
		}
		if v, err = p.unmarshalNumberKey(); err == nil {
			return v, nil
		}
		return Value{}, err
	}

	if len(p.in) == 0 {
		return Value{}, io.ErrUnexpectedEOF
	}
	switch p.in[0] {
	case '\'', '"':
		// Historically, Go's parser allowed a string for the Any type URL.
		// This is specific to Go and contrary to the C++ implementation,
		// which does not support strings for the Any type URL.
		v, err = p.unmarshalString()
	default:
		v, err = p.unmarshalURL()
	}
	if err != nil {
		return Value{}, err
	}

	err = p.consumeChar(']', "at end of extension name")
	if err != nil {
		return Value{}, err
	}
	return v, nil
}
// unmarshalURL parses an Any type URL string. The C++ parser does not handle
// many legal URL strings. This implementation is more liberal and allows for
// the pattern `^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`.
func (p *decoder) unmarshalURL() (Value, error) {
	size := 0
	for size < len(p.in) {
		c := p.in[size]
		isWord := c == '-' || c == '_' ||
			('0' <= c && c <= '9') ||
			('a' <= c && c <= 'z') ||
			('A' <= c && c <= 'Z')
		if !isWord {
			break
		}
		size++
		// At most one '.' or '/' separator may follow each word character.
		if size < len(p.in) && (p.in[size] == '/' || p.in[size] == '.') {
			size++
		}
	}

	// The URL must be non-empty and must not end in a separator, and whatever
	// follows it has to be a delimiter (or the end of the input).
	rest := p.in[size:]
	if size == 0 || p.in[size-1] == '.' || p.in[size-1] == '/' ||
		(len(rest) > 0 && !isDelim(rest[0])) {
		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
	}

	v := rawValueOf(string(p.in[:size]), p.in[:size:size])
	p.consume(size)
	return v, nil
}
// unmarshalNumberKey parses a field number used as a key. Field numbers are
// non-negative integers: negative numbers, floats, and anything that
// strconv.ParseUint (base 0, 64 bits) rejects produce a syntax error.
func (p *decoder) unmarshalNumberKey() (Value, error) {
	// invalid builds the error shared by every rejection path; it quotes the
	// leading token of the remaining input.
	invalid := func() (Value, error) {
		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
	}

	num, ok := parseNumber(p.in)
	if !ok || num.neg || num.typ == numFloat {
		return invalid()
	}
	fieldNum, err := strconv.ParseUint(string(num.value), 0, 64)
	if err != nil {
		return invalid()
	}
	p.consume(num.size)
	return rawValueOf(fieldNum, num.value), nil
}
// unmarshalValue parses one value, dispatching on its first byte: a quote
// starts a string, '[' a list, and '{' or '<' a delimited message. Otherwise,
// a bare identifier with no non-nil entry in the literals table (defined
// elsewhere in this package) is returned as a protoreflect.Name, and
// everything else is handed to unmarshalNumber.
// It returns io.ErrUnexpectedEOF when no input is left.
func (p *decoder) unmarshalValue() (Value, error) {
	if len(p.in) == 0 {
		return Value{}, io.ErrUnexpectedEOF
	}

	switch c := p.in[0]; {
	case c == '"' || c == '\'':
		return p.unmarshalStrings()
	case c == '[':
		return p.unmarshalList()
	case c == '{' || c == '<':
		return p.unmarshalMessage(true)
	}

	if n, ok := consumeName(p.in); ok && literals[string(p.in[:n])] == nil {
		v := rawValueOf(protoreflect.Name(p.in[:n]), p.in[:n:n])
		p.consume(n)
		return v, nil
	}
	return p.unmarshalNumber()
}
// unmarshalName unmarshals an unquoted proto identifier.
// Regular expression that matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
//
// E.g., `field_name` => rawValueOf(protoreflect.Name("field_name"), ...)
func (p *decoder) unmarshalName() (Value, error) {
	size, ok := consumeName(p.in)
	if !ok {
		return Value{}, newSyntaxError("invalid %q as identifier", errRegexp.Find(p.in))
	}
	// Cap the raw slice at its own length before the decoder advances.
	ident := p.in[:size:size]
	v := rawValueOf(protoreflect.Name(ident), ident)
	p.consume(size)
	return v, nil
}
// consumeName reports the length of the identifier at the start of input.
// An identifier matches `^[_a-zA-Z][_a-zA-Z0-9]*` and must be followed by a
// delimiter byte or the end of the input; otherwise (0, false) is returned.
func consumeName(input []byte) (int, bool) {
	isLetter := func(c byte) bool {
		return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
	}
	isDigit := func(c byte) bool {
		return '0' <= c && c <= '9'
	}

	if len(input) == 0 || !isLetter(input[0]) {
		return 0, false
	}
	n := 1
	for n < len(input) && (isLetter(input[n]) || isDigit(input[n])) {
		n++
	}
	// Reject identifiers that run straight into a non-delimiter such as '.'.
	if n < len(input) && !isDelim(input[n]) {
		return 0, false
	}
	return n, true
}
// consumeChar consumes the byte c, which must be next in the input, along
// with any whitespace or comments that follow it. If the input is exhausted
// it returns io.ErrUnexpectedEOF; if a different byte is found it returns a
// syntax error whose text ends with msg.
func (p *decoder) consumeChar(c byte, msg string) error {
	switch {
	case p.tryConsumeChar(c):
		return nil
	case len(p.in) == 0:
		return io.ErrUnexpectedEOF
	default:
		return newSyntaxError("invalid character %q, expected %q %s", p.in[0], c, msg)
	}
}
// tryConsumeChar consumes c, plus any whitespace or comments after it, when c
// is the next byte of input. It reports whether anything was consumed.
func (p *decoder) tryConsumeChar(c byte) bool {
	if len(p.in) == 0 || p.in[0] != c {
		return false
	}
	p.consume(1)
	return true
}
// consume drops the first n bytes of input and then skips everything that is
// insignificant: spaces, tabs, carriage returns, newlines, and '#' comments
// running to the end of their line.
func (p *decoder) consume(n int) {
	rest := p.in[n:]
	for len(rest) > 0 {
		c := rest[0]
		if c == '#' {
			nl := bytes.IndexByte(rest, '\n')
			if nl < 0 {
				// The comment runs to the end of the input.
				rest = nil
				break
			}
			rest = rest[nl+len("\n"):]
			continue
		}
		if c != ' ' && c != '\n' && c != '\r' && c != '\t' {
			break
		}
		rest = rest[1:]
	}
	p.in = rest
}
// errRegexp matches, at the start of the input, either a run of characters
// that look like a non-delimiter (letters, digits, '-', '+', '.', '_', '/')
// or, failing that, one single character. It is used to extract the offending
// token quoted in error messages.
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)
// isDelim reports whether c is a delimiter character, i.e. any byte other
// than an ASCII letter, an ASCII digit, or one of '-', '+', '.', '_'.
func isDelim(c byte) bool {
	switch {
	case c == '-', c == '+', c == '.', c == '_':
		return false
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	}
	return true
}