// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package filter

import (
"fmt"
"strings"
"unicode/utf8"
)
// tokenKind is the kind of a lexical token.
type tokenKind int
const (
tokenInvalidInput tokenKind = iota
tokenEOF // end of tokens
tokenWSC // whitespace or comment
tokenDot // .
tokenHas // :
tokenOr // OR
tokenAnd // AND
tokenNot // NOT
tokenLparen // (
tokenRparen // )
tokenComma // ,
tokenLessThan // <
tokenGreaterThan // >
tokenGreaterThanEquals // >=
tokenLessThanEquals // <=
tokenNotEquals // !=
tokenMatchesRegexp // =~
tokenNotMatchesRegexp // !~
tokenEquals // =
tokenMinus // -
tokenPlus // +
tokenTilde // ~
tokenBackslash // \
tokenString
tokenText
)
var tokenKindStrings = [...]string{
tokenInvalidInput: "invalid",
tokenEOF: "EOF",
tokenWSC: "whitespace",
tokenDot: ".",
tokenHas: ":",
tokenOr: "OR",
tokenAnd: "AND",
tokenNot: "NOT",
tokenLparen: "(",
tokenRparen: ")",
tokenComma: ",",
tokenLessThan: "<",
tokenGreaterThan: ">",
tokenGreaterThanEquals: ">=",
tokenLessThanEquals: "<=",
tokenNotEquals: "!=",
tokenMatchesRegexp: "=~",
tokenNotMatchesRegexp: "!~",
tokenEquals: "=",
tokenMinus: "-",
tokenPlus: "+",
tokenTilde: "~",
tokenBackslash: `\`,
tokenString: "string",
tokenText: "text",
}
// String returns the string representation of a tokenKind.
func (tk tokenKind) String() string {
return tokenKindStrings[tk]
}
// token is a lexical token read from the filter string.
type token struct {
kind tokenKind
pos position
val string // only set for tokenString and tokenText
}
// position describes a position in the filter string.
type position struct {
line, col int
}
// String returns a position formatted for an error message.
func (pos position) String() string {
return fmt.Sprintf("%d:%d", pos.line, pos.col)
}
// lexer is used to convert a string into a sequence of tokens.
// The exact separation into names and other tokens,
// and the handling of escape sequences,
// is pretty loosely defined.
type lexer struct {
input string
pushed bool
pushedToken token // valid if pushed
prevDot bool // previous token was tokenDot
prevText bool // previous token was tokenText
nextPos position // position of next token
}
// nextToken returns the next token from the string.
// At the end of the input this returns tokenEOF.
func (lex *lexer) nextToken() token {
if lex.pushed {
lex.pushed = false
return lex.pushedToken
}
pos := lex.nextPos
prevDot := lex.prevDot
prevText := lex.prevText
lex.prevDot = false
lex.prevText = false
if len(lex.input) == 0 {
return token{kind: tokenEOF, pos: pos}
}
r, size := utf8.DecodeRuneInString(lex.input)
lex.advance(r, size)
if r == utf8.RuneError {
return token{kind: tokenInvalidInput, pos: pos}
}
switch {
case isWhite(r):
lex.skipWhite()
return token{kind: tokenWSC, pos: pos}
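	// '-' may introduce a comment (--), start a negative number
	// such as -1 or -.5, or stand alone as the minus operator.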
case r == '-':
if len(lex.input) == 0 {
return token{kind: tokenMinus, pos: pos}
}
rn, size := utf8.DecodeRuneInString(lex.input)
if rn == '-' {
// Comment.
lex.advance(rn, size)
lex.skipComment()
lex.skipWhite()
return token{kind: tokenWSC, pos: pos}
}
if isDigit(rn) {
return lex.text(r, prevDot)
}
if !prevText && rn == '.' && len(lex.input) > size && isDigit(rune(lex.input[size])) {
return lex.text(r, prevDot)
}
return token{kind: tokenMinus, pos: pos}
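	// '.' may start a number such as .5 when the previous token
	// was not text; otherwise it is the dot operator.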
case r == '.':
if len(lex.input) == 0 {
lex.prevDot = true
return token{kind: tokenDot, pos: pos}
}
if !prevText {
rn, _ := utf8.DecodeRuneInString(lex.input)
if isDigit(rn) {
return lex.text(r, prevDot)
}
}
lex.prevDot = true
return token{kind: tokenDot, pos: pos}
case r == ':':
return token{kind: tokenHas, pos: pos}
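	// OR, AND, and NOT are keywords only when followed by whitespace
	// or the end of the input; otherwise they lex as ordinary text.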
case r == 'O':
if len(lex.input) > 0 && lex.input[0] == 'R' && (len(lex.input) < 2 || isWhite(rune(lex.input[1]))) {
lex.advance('R', 1)
return token{kind: tokenOr, pos: pos}
}
return lex.text(r, prevDot)
case r == 'A':
if len(lex.input) > 1 && lex.input[:2] == "ND" && (len(lex.input) < 3 || isWhite(rune(lex.input[2]))) {
lex.advance('N', 1)
lex.advance('D', 1)
return token{kind: tokenAnd, pos: pos}
}
return lex.text(r, prevDot)
case r == 'N':
if len(lex.input) > 1 && lex.input[:2] == "OT" && (len(lex.input) < 3 || isWhite(rune(lex.input[2]))) {
lex.advance('O', 1)
lex.advance('T', 1)
return token{kind: tokenNot, pos: pos}
}
return lex.text(r, prevDot)
case r == '(':
return token{kind: tokenLparen, pos: pos}
case r == ')':
return token{kind: tokenRparen, pos: pos}
case r == ',':
return token{kind: tokenComma, pos: pos}
case r == '<':
if len(lex.input) > 0 && lex.input[0] == '=' {
lex.advance('=', 1)
return token{kind: tokenLessThanEquals, pos: pos}
}
return token{kind: tokenLessThan, pos: pos}
case r == '>':
if len(lex.input) > 0 && lex.input[0] == '=' {
lex.advance('=', 1)
return token{kind: tokenGreaterThanEquals, pos: pos}
}
return token{kind: tokenGreaterThan, pos: pos}
case r == '!':
if len(lex.input) > 0 {
if lex.input[0] == '=' {
lex.advance('=', 1)
return token{kind: tokenNotEquals, pos: pos}
}
if lex.input[0] == '~' {
lex.advance('~', 1)
return token{kind: tokenNotMatchesRegexp, pos: pos}
}
}
// A bare ! is part of a name.
return lex.text(r, prevDot)
case r == '=':
if len(lex.input) > 0 && lex.input[0] == '~' {
lex.advance('~', 1)
return token{kind: tokenMatchesRegexp, pos: pos}
}
return token{kind: tokenEquals, pos: pos}
case r == '+':
return token{kind: tokenPlus, pos: pos}
case r == '~':
return token{kind: tokenTilde, pos: pos}
case r == '"':
return lex.collectString()
case isTextStart(r):
return lex.text(r, prevDot)
case isDigit(r):
return lex.text(r, prevDot)
default:
return token{kind: tokenInvalidInput, pos: pos}
}
}
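
// lexAll is an illustrative sketch, not part of the original lexer API:
// it shows one way a caller might drive nextToken, dropping whitespace and
// comment tokens and stopping at end of input or invalid input.
// (A hypothetical helper; a real parser may also initialize nextPos rather
// than relying on the zero value, which numbers positions from 0:0.)
func lexAll(input string) []token {
	lex := &lexer{input: input}
	var toks []token
	for {
		tok := lex.nextToken()
		if tok.kind == tokenWSC {
			// Whitespace and comments carry no value; drop them.
			continue
		}
		toks = append(toks, tok)
		if tok.kind == tokenEOF || tok.kind == tokenInvalidInput {
			return toks
		}
	}
}
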
// skipWhite skips whitespace.
func (lex *lexer) skipWhite() {
for len(lex.input) > 0 {
r, size := utf8.DecodeRuneInString(lex.input)
switch {
case isWhite(r):
lex.advance(r, size)
case r == '-':
if len(lex.input) <= size {
return
}
rn, sizen := utf8.DecodeRuneInString(lex.input[size:])
if rn != '-' {
return
}
lex.advance(r, size)
lex.advance(rn, sizen)
lex.skipComment()
default:
return
}
}
}
// skipComment skips a comment.
// We have already skipped the initial two '-' characters.
func (lex *lexer) skipComment() {
for len(lex.input) > 0 {
r, size := utf8.DecodeRuneInString(lex.input)
switch {
case r == '\t',
r >= ' ' && r <= '~',
isa1OrHigher(r):
lex.advance(r, size)
default:
return
}
}
}
// advance advances the input past r, which is size bytes long.
func (lex *lexer) advance(r rune, size int) {
lex.input = lex.input[size:]
if r == '\n' {
lex.nextPos.line++
lex.nextPos.col = 0
} else {
lex.nextPos.col++
}
}
// text collects a name.
// r is the first rune in the name; we have already advanced past it.
// prevDot records whether the previous token was tokenDot;
// in that case we don't include a dot after a number prefix.
func (lex *lexer) text(r rune, prevDot bool) token {
pos := lex.nextPos
var sb strings.Builder
if r != '\\' {
sb.WriteRune(r)
} else {
if !lex.textEsc(&sb) {
return token{kind: tokenInvalidInput, pos: pos}
}
}
// Add a number prefix.
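	// The prefix is a run of digits that may include one '.' (for example
	// 1.5 or -.5), unless the previous token was a dot; any remaining name
	// characters are collected by the loop below.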
if r == '-' || r == '.' || isDigit(r) {
for len(lex.input) > 0 {
rn, size := utf8.DecodeRuneInString(lex.input)
if isDigit(rn) {
sb.WriteRune(rn)
lex.advance(rn, size)
if r == '.' {
break
}
} else if !prevDot && r != '.' && rn == '.' {
sb.WriteRune(rn)
lex.advance(rn, size)
break
} else {
break
}
if r == '-' {
r = rn
}
}
}
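	// Collect the rest of the name: text runes, digits, '+', '-',
	// backslash escapes, and a bare '!' that does not start != or !~.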
for len(lex.input) > 0 {
rn, size := utf8.DecodeRuneInString(lex.input)
if rn == '\\' {
lex.advance(rn, size)
if !lex.textEsc(&sb) {
return token{kind: tokenInvalidInput, pos: pos}
}
} else if isTextStart(rn) || isDigit(rn) || rn == '+' || rn == '-' {
sb.WriteRune(rn)
lex.advance(rn, size)
} else if rn == '!' {
// A bare '!' is part of a name for some reason,
// but not != or !~.
if len(lex.input) > 1 && (lex.input[1] == '=' || lex.input[1] == '~') {
break
}
sb.WriteRune(rn)
lex.advance(rn, size)
} else {
break
}
}
lex.prevText = true
return token{kind: tokenText, pos: pos, val: sb.String()}
}
// quotedSpecials is a map from a character that can follow a
// backslash to the byte value that it represents.
var quotedSpecials = map[rune]byte{
'a': '\a',
'b': '\b',
'f': '\f',
'n': '\n',
'r': '\r',
't': '\t',
'v': '\v',
}
// collectString collects the characters from a string.
func (lex *lexer) collectString() token {
pos := lex.nextPos
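	// check records whether we saw any backslash escape; escapes may emit
	// raw bytes, so the collected string must be re-validated as UTF-8.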
check := false
var sb strings.Builder
for len(lex.input) > 0 {
r, size := utf8.DecodeRuneInString(lex.input)
lex.advance(r, size)
switch {
case isWhite(r),
r == '!',
r >= '#' && r <= '[', // does not include '\\'
r >= ']' && r <= '~',
isa1OrHigher(r):
sb.WriteRune(r)
case r == '"':
s := sb.String()
if check && !utf8.ValidString(s) {
return token{kind: tokenInvalidInput, pos: pos}
}
return token{kind: tokenString, pos: pos, val: s}
case r == '\\':
check = true
if len(lex.input) == 0 {
return token{kind: tokenInvalidInput, pos: pos}
}
r, size = utf8.DecodeRuneInString(lex.input)
if !lex.handleTextEsc(r, &sb) {
if s, ok := quotedSpecials[r]; ok {
lex.advance(r, size)
sb.WriteByte(s)
} else {
// Don't advance past r,
// just treat it as the next rune.
sb.WriteByte('\\')
}
}
}
}
return token{kind: tokenInvalidInput, pos: pos}
}
// textEsc is called with the input pointing after a backslash.
// If we see an escape sequence that can appear in a string or a name,
// this adds a rune to sb and advances input.
// It reports whether it added a rune.
func (lex *lexer) textEsc(sb *strings.Builder) bool {
if len(lex.input) == 0 {
return false
}
r, _ := utf8.DecodeRuneInString(lex.input)
return lex.handleTextEsc(r, sb)
}
// handleTextEsc is called with a character following a backslash.
// The character is still in lex.input.
// If this is an escape sequence that can appear in a string or a name,
// this adds a rune to sb and advances input.
// It reports whether it added a rune.
func (lex *lexer) handleTextEsc(r rune, sb *strings.Builder) bool {
switch r {
case ',', ':', '=', '<', '>', '+', '~', '"', '\\', '.', '*':
lex.advance(r, 1)
sb.WriteRune(r)
return true
case 'u':
return lex.handleHex(r, sb)
case '0', '1', '2', '3', '4', '5', '6', '7':
// A backslash sequence of 1-3 octal digits is an octal escape.
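		// A three-digit escape must fit in a byte, so its first digit
		// can be at most '3' (\377 == 0xff).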
if len(lex.input) >= 3 && r <= '3' && isOctalDigit(rune(lex.input[1])) && isOctalDigit(rune(lex.input[2])) {
n1 := rune(lex.input[1])
n2 := rune(lex.input[2])
r = (r - '0') << 6
r += (n1 - '0') << 3
r += n2 - '0'
lex.advance(r, 1)
lex.advance(n1, 1)
lex.advance(n2, 1)
sb.WriteRune(r)
return true
}
		if len(lex.input) >= 2 && isOctalDigit(rune(lex.input[1])) {
			n1 := rune(lex.input[1])
			r = (r - '0') << 3
			r += n1 - '0'
			lex.advance(r, 1)
			lex.advance(n1, 1)
			sb.WriteRune(r)
			return true
		}
lex.advance(r, 1)
r -= '0'
sb.WriteRune(r)
return true
case 'x':
return lex.handleHex(r, sb)
default:
return false
}
}
// handleHex is a subroutine of handleTextEsc.
// r is the escape sequence starter, either 'u' for a \uDDDD escape
// or 'x' for a \xHH escape.
func (lex *lexer) handleHex(r rune, sb *strings.Builder) bool {
var ln int
if r == 'u' {
ln = 4
} else {
ln = 2
}
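	// The starter rune is still at the front of lex.input,
	// so we need it plus ln hex digits.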
if len(lex.input) < ln+1 {
return false
}
for i := range ln {
if !isHexDigit(rune(lex.input[i+1])) {
return false
}
}
	// Skip the 'u' or 'x' starter.
lex.advance(r, 1)
r = 0
for range ln {
r1 := rune(lex.input[0])
lex.advance(r1, 1)
r <<= 4
if r1 >= 'a' && r1 <= 'f' {
r += 10 + r1 - 'a'
} else if r1 >= 'A' && r1 <= 'F' {
r += 10 + r1 - 'A'
} else {
r += r1 - '0'
}
}
	if ln == 4 {
		// \uDDDD is the rune DDDD.
		sb.WriteRune(r)
	} else {
		// \xHH is the byte HH.
		sb.WriteByte(byte(r))
}
return true
}
// pushToken pushes a token so that it is the next one returned.
func (lex *lexer) pushToken(tok token) {
if lex.pushed {
panic("double pushToken")
}
lex.pushed = true
lex.pushedToken = tok
}
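
// peekKind is an illustrative sketch, not part of the original file, showing
// how pushToken is meant to be used: read the next token, note its kind, and
// push it back so that the following call to nextToken returns it again.
func (lex *lexer) peekKind() tokenKind {
	tok := lex.nextToken()
	lex.pushToken(tok)
	return tok.kind
}
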
// isWhite reports whether r is a whitespace character.
// These are the whitespace characters recognized by the C++ lexer.
func isWhite(r rune) bool {
switch r {
case ' ', '\t', '\f', '\u00a0', '\r', '\n':
return true
default:
return false
}
}
// isDigit reports whether r is a digit.
func isDigit(r rune) bool {
return r >= '0' && r <= '9'
}
// isOctalDigit reports whether r is an octal digit.
func isOctalDigit(r rune) bool {
return r >= '0' && r <= '7'
}
// isHexDigit reports whether r is a hex digit.
func isHexDigit(r rune) bool {
return isDigit(r) || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
}
// isTextStart reports whether r can be the start of a name.
func isTextStart(r rune) bool {
switch {
case r >= '#' && r <= '\'':
case r == '*':
case r == '/':
case r == ';':
case r == '?':
case r == '@':
case r >= 'A' && r <= 'Z':
case r == '[':
case r == ']':
case r >= '^' && r <= '}': // includes lower-case ASCII letters
case isa1OrHigher(r):
case r == '\\':
default:
return false
}
return true
}
// isa1OrHigher reports whether r is a rune in the range \u00a1 through \U000effff.
func isa1OrHigher(r rune) bool {
return r >= '\u00a1' && r <= '\U000effff'
}