blob: a540ebefac5b1026ee31b42987e82f4bd3d3f477 [file] [log] [blame]
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package glob implements an LSP-compliant glob pattern matcher for testing.
package glob
import (
"errors"
"fmt"
"strings"
"unicode/utf8"
)
// A Glob is an LSP-compliant glob pattern, as defined by the spec:
// https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#documentFilter
//
// NOTE: this implementation is currently only intended for testing. In order
// to make it production ready, we'd need to:
// - verify it against the VS Code implementation
// - add more tests
// - microbenchmark, likely avoiding the element interface
// - resolve the question of what is meant by "character". If it's a UTF-16
// code (as we suspect) it'll be a bit more work.
//
// Quoting from the spec:
// Glob patterns can have the following syntax:
// - `*` to match one or more characters in a path segment
// - `?` to match on one character in a path segment
// - `**` to match any number of path segments, including none
// - `{}` to group sub patterns into an OR expression. (e.g. `**/*.{ts,js}`
// matches all TypeScript and JavaScript files)
// - `[]` to declare a range of characters to match in a path segment
// (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …)
// - `[!...]` to negate a range of characters to match in a path segment
// (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but
// not `example.0`)
//
// Expanding on this:
// - '/' matches one or more literal slashes.
// - any other character matches itself literally.
type Glob struct {
elems []element // pattern elements
}
// Parse builds a Glob for the given pattern, returning an error if the pattern
// is invalid.
func Parse(pattern string) (*Glob, error) {
g, _, err := parse(pattern, false)
return g, err
}
func parse(pattern string, nested bool) (*Glob, string, error) {
g := new(Glob)
for len(pattern) > 0 {
switch pattern[0] {
case '/':
pattern = pattern[1:]
g.elems = append(g.elems, slash{})
case '*':
if len(pattern) > 1 && pattern[1] == '*' {
if (len(g.elems) > 0 && g.elems[len(g.elems)-1] != slash{}) || (len(pattern) > 2 && pattern[2] != '/') {
return nil, "", errors.New("** may only be adjacent to '/'")
}
pattern = pattern[2:]
g.elems = append(g.elems, starStar{})
break
}
pattern = pattern[1:]
g.elems = append(g.elems, star{})
case '?':
pattern = pattern[1:]
g.elems = append(g.elems, anyChar{})
case '{':
var gs group
for pattern[0] != '}' {
pattern = pattern[1:]
g, pat, err := parse(pattern, true)
if err != nil {
return nil, "", err
}
if len(pat) == 0 {
return nil, "", errors.New("unmatched '{'")
}
pattern = pat
gs = append(gs, g)
}
pattern = pattern[1:]
g.elems = append(g.elems, gs)
case '}', ',':
if nested {
return g, pattern, nil
}
pattern = g.parseLiteral(pattern, false)
case '[':
pattern = pattern[1:]
if len(pattern) == 0 {
return nil, "", errBadRange
}
negate := false
if pattern[0] == '!' {
pattern = pattern[1:]
negate = true
}
low, sz, err := readRangeRune(pattern)
if err != nil {
return nil, "", err
}
pattern = pattern[sz:]
if len(pattern) == 0 || pattern[0] != '-' {
return nil, "", errBadRange
}
pattern = pattern[1:]
high, sz, err := readRangeRune(pattern)
if err != nil {
return nil, "", err
}
pattern = pattern[sz:]
if len(pattern) == 0 || pattern[0] != ']' {
return nil, "", errBadRange
}
pattern = pattern[1:]
g.elems = append(g.elems, charRange{negate, low, high})
default:
pattern = g.parseLiteral(pattern, nested)
}
}
return g, "", nil
}
// helper for decoding a rune in range elements, e.g. [a-z]
func readRangeRune(input string) (rune, int, error) {
r, sz := utf8.DecodeRuneInString(input)
var err error
if r == utf8.RuneError {
// See the documentation for DecodeRuneInString.
switch sz {
case 0:
err = errBadRange
case 1:
err = errInvalidUTF8
}
}
return r, sz, err
}
var (
errBadRange = errors.New("'[' patterns must be of the form [x-y]")
errInvalidUTF8 = errors.New("invalid UTF-8 encoding")
)
func (g *Glob) parseLiteral(pattern string, nested bool) string {
var specialChars string
if nested {
specialChars = "*?{[/},"
} else {
specialChars = "*?{[/"
}
end := strings.IndexAny(pattern, specialChars)
if end == -1 {
end = len(pattern)
}
g.elems = append(g.elems, literal(pattern[:end]))
return pattern[end:]
}
func (g *Glob) String() string {
var b strings.Builder
for _, e := range g.elems {
fmt.Fprint(&b, e)
}
return b.String()
}
// element holds a glob pattern element, as defined below.
type element fmt.Stringer
// element types.
type (
slash struct{} // One or more '/' separators
literal string // string literal, not containing /, *, ?, {}, or []
star struct{} // *
anyChar struct{} // ?
starStar struct{} // **
group []*Glob // {foo, bar, ...} grouping
charRange struct { // [a-z] character range
negate bool
low, high rune
}
)
func (s slash) String() string { return "/" }
func (l literal) String() string { return string(l) }
func (s star) String() string { return "*" }
func (a anyChar) String() string { return "?" }
func (s starStar) String() string { return "**" }
func (g group) String() string {
var parts []string
for _, g := range g {
parts = append(parts, g.String())
}
return "{" + strings.Join(parts, ",") + "}"
}
func (r charRange) String() string {
return "[" + string(r.low) + "-" + string(r.high) + "]"
}
// Match reports whether the input string matches the glob pattern.
func (g *Glob) Match(input string) bool {
return match(g.elems, input)
}
func match(elems []element, input string) (ok bool) {
var elem interface{}
for len(elems) > 0 {
elem, elems = elems[0], elems[1:]
switch elem := elem.(type) {
case slash:
if len(input) == 0 || input[0] != '/' {
return false
}
for input[0] == '/' {
input = input[1:]
}
case starStar:
// Special cases:
// - **/a matches "a"
// - **/ matches everything
//
// Note that if ** is followed by anything, it must be '/' (this is
// enforced by Parse).
if len(elems) > 0 {
elems = elems[1:]
}
// A trailing ** matches anything.
if len(elems) == 0 {
return true
}
// Backtracking: advance pattern segments until the remaining pattern
// elements match.
for len(input) != 0 {
if match(elems, input) {
return true
}
_, input = split(input)
}
return false
case literal:
if !strings.HasPrefix(input, string(elem)) {
return false
}
input = input[len(elem):]
case star:
var segInput string
segInput, input = split(input)
elemEnd := len(elems)
for i, e := range elems {
if e == (slash{}) {
elemEnd = i
break
}
}
segElems := elems[:elemEnd]
elems = elems[elemEnd:]
// A trailing * matches the entire segment.
if len(segElems) == 0 {
break
}
// Backtracking: advance characters until remaining subpattern elements
// match.
matched := false
for i := range segInput {
if match(segElems, segInput[i:]) {
matched = true
break
}
}
if !matched {
return false
}
case anyChar:
if len(input) == 0 || input[0] == '/' {
return false
}
input = input[1:]
case group:
// Append remaining pattern elements to each group member looking for a
// match.
var branch []element
for _, m := range elem {
branch = branch[:0]
branch = append(branch, m.elems...)
branch = append(branch, elems...)
if match(branch, input) {
return true
}
}
return false
case charRange:
if len(input) == 0 || input[0] == '/' {
return false
}
c, sz := utf8.DecodeRuneInString(input)
if c < elem.low || c > elem.high {
return false
}
input = input[sz:]
default:
panic(fmt.Sprintf("segment type %T not implemented", elem))
}
}
return len(input) == 0
}
// split returns the portion before and after the first slash
// (or sequence of consecutive slashes). If there is no slash
// it returns (input, nil).
func split(input string) (first, rest string) {
i := strings.IndexByte(input, '/')
if i < 0 {
return input, ""
}
first = input[:i]
for j := i; j < len(input); j++ {
if input[j] != '/' {
return first, input[j:]
}
}
return first, ""
}