internal/testing/htmlcheck: replace cascadia for htmlcheck.NotIn

This change reimplements a subset of cascadia's query functionality so
we can replace it for htmlcheck.NotIn. It adds code that parses a
subset of the CSS selector syntax (currently more than we need for
NotIn but not enough for In) and adds a query function that finds the
first node matching the selector in the given HTML node. Unlike
cascadia, our query function can match the node itself.

Future CLs will add support for more selector syntax so we can support
everything htmlcheck.In is used for, but we intend to support only
what we need. I also limit the selector syntax to ASCII for
simplicity.

For #61399

Change-Id: Ia03cc8a9ab42ae11d445a650e1ca0a07ee8a391f
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/541437
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
kokoro-CI: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
diff --git a/internal/testing/htmlcheck/htmlcheck.go b/internal/testing/htmlcheck/htmlcheck.go
index 764ec2e..46cdf87 100644
--- a/internal/testing/htmlcheck/htmlcheck.go
+++ b/internal/testing/htmlcheck/htmlcheck.go
@@ -38,7 +38,7 @@
 //
 // A nil Checker is valid and always succeeds.
 func In(selector string, checkers ...Checker) Checker {
-	sel := mustParseSelector(selector)
+	sel := mustParseCascadiaSelector(selector)
 	return func(n *html.Node) error {
 		var m *html.Node
 		// cascadia.Query does not test against its argument node.
@@ -64,7 +64,7 @@
 func NotIn(selector string) Checker {
 	sel := mustParseSelector(selector)
 	return func(n *html.Node) error {
-		if sel.Match(n) || cascadia.Query(n, sel) != nil {
+		if query(n, sel) != nil {
 			return fmt.Errorf("%q matched one or more elements", selector)
 		}
 		return nil
@@ -84,9 +84,9 @@
 	return nil
 }
 
-// mustParseSelector parses the given CSS selector. An empty string
+// mustParseCascadiaSelector parses the given CSS selector. An empty string
 // is treated as "*" (match everything).
-func mustParseSelector(s string) cascadia.Sel {
+func mustParseCascadiaSelector(s string) cascadia.Sel {
 	if s == "" {
 		s = "*"
 	}
@@ -97,6 +97,16 @@
 	return sel
 }
 
+// mustParseSelector parses the given CSS selector. An empty string
+// is treated as matching everything.
+func mustParseSelector(s string) *selector {
+	sel, err := parse(s)
+	if err != nil {
+		panic(fmt.Sprintf("parsing %q: %v", s, err))
+	}
+	return sel
+}
+
 // HasText returns a Checker that checks whether the given regexp matches the node's text.
 // The text of a node n is the concatenated contents of all text nodes in n's subtree.
 // HasText panics if the argument doesn't compile.
diff --git a/internal/testing/htmlcheck/query.go b/internal/testing/htmlcheck/query.go
new file mode 100644
index 0000000..1e5a671
--- /dev/null
+++ b/internal/testing/htmlcheck/query.go
@@ -0,0 +1,270 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package htmlcheck
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// A selector represents a parsed css selector that can be used in a query.
+// The atoms all match against a given element and next matches against
+// descendants of that element. So, for example "div#id a" parses into a selector that
+// has atoms for matching the div and the id and a next that points to another
+// selector that has an atom for "a".
+type selector struct {
+	atoms []selectorAtom
+	next  *selector
+}
+
+// String returns a string used for debugging test failures.
+func (s *selector) String() string {
+	if s == nil {
+		return "nil"
+	}
+	str := "["
+	for i, atom := range s.atoms {
+		str += fmt.Sprintf("%#v", atom)
+		if i != len(s.atoms)-1 {
+			str += ","
+		}
+	}
+	str += "]->" + s.next.String()
+	return str
+}
+
+// selectorAtom represents a part of a selector that individually
+// matches a single element name, id, class, or attribute value.
+type selectorAtom interface {
+	match(n *html.Node) bool
+}
+
+// query returns the first node in n that matches the given selector,
+// or nil if there are no nodes matching the selector.
+func query(n *html.Node, selector *selector) *html.Node {
+	allMatch := true
+	for _, atom := range selector.atoms {
+		if !atom.match(n) {
+			allMatch = false
+			break
+		}
+	}
+	if allMatch {
+		if selector.next != nil {
+			if result := queryChildren(n, selector.next); result != nil {
+				return result
+			}
+		} else {
+			return n
+		}
+	}
+	return queryChildren(n, selector)
+}
+
+func queryChildren(n *html.Node, selector *selector) *html.Node {
+	child := n.FirstChild
+	for child != nil {
+		if result := query(child, selector); result != nil {
+			return result
+		}
+		child = child.NextSibling
+	}
+	return nil
+}
+
+// parse parses the string into a selector. It matches the following
+// atoms: element, #id, .class, [attribute="value"]. It allows the atoms
+// to be combined where they all need to match (for example, a#id) and
+// for nested selectors to be combined with a space.
+// For simplicity, the selector must not have any non-ASCII bytes.
+func parse(s string) (*selector, error) {
+	sel := &selector{}
+	if !isAscii(s) {
+		return nil, errors.New("non ascii byte found in selector string")
+	}
+	for len(s) > 0 {
+		switch {
+		case isLetter(s[0]):
+			ident, rest := consumeIdentifier(s)
+			sel.atoms = append(sel.atoms, &elementSelector{ident})
+			s = rest
+		case s[0] == '.':
+			ident, rest := consumeIdentifier(s[1:])
+			if len(ident) == 0 {
+				return nil, errors.New("no class name after '.'")
+			}
+			sel.atoms = append(sel.atoms, &classSelector{ident})
+			s = rest
+		case s[0] == '#':
+			ident, rest := consumeIdentifier(s[1:])
+			if len(ident) == 0 {
+				return nil, errors.New("no id name after '#'")
+			}
+			sel.atoms = append(sel.atoms, &idSelector{ident})
+			s = rest
+		case s[0] == '[':
+			attributeSelector, rest, err := parseAttributeSelector(s)
+			if err != nil {
+				return nil, err
+			}
+			sel.atoms = append(sel.atoms, attributeSelector)
+			s = rest
+		case s[0] == ' ':
+			s = strings.TrimLeft(s, " ")
+			next, err := parse(s)
+			if err != nil {
+				return nil, err
+			}
+			sel.next = next
+			return sel, nil
+		default:
+			return nil, fmt.Errorf("unexpected character %q in input", s[0])
+		}
+	}
+	return sel, nil
+}
+
+// parseAttributeSelector parses an attribute selector of the form [attribute-name="attribute-value"].
+func parseAttributeSelector(s string) (*attributeSelector, string, error) {
+	if s[0] != '[' {
+		return nil, "", errors.New("expected '[' at beginning of attribute selector")
+	}
+	ident, rest := consumeIdentifier(s[1:])
+	if len(ident) == 0 {
+		return nil, "", errors.New("expected attribute name after '[' in attribute selector")
+	}
+	attributeName := ident
+	s = rest
+	if len(s) == 0 || s[0] != '=' {
+		return nil, "", errors.New("expected '=' after attribute name in attribute selector")
+	}
+	s = s[1:]
+	if len(s) == 0 || s[0] != '"' {
+		return nil, "", errors.New("expected '\"' after = in attribute selector")
+	}
+	s = s[1:]
+	i := 0
+	for ; i < len(s) && s[i] != '"'; i++ {
+	}
+	attributeValue, s := s[:i], s[i:]
+	if len(s) == 0 || s[0] != '"' {
+		return nil, "", errors.New("expected '\"' after attribute value")
+	}
+	s = s[1:]
+	if len(s) == 0 || s[0] != ']' {
+		return nil, "", errors.New("expected ']' at end of attribute selector")
+	}
+	s = s[1:]
+	return &attributeSelector{attribute: attributeName, value: attributeValue}, s, nil
+}
+
+func isLetter(b byte) bool {
+	return ('a' <= b && b <= 'z') || ('A' <= b && b <= 'Z')
+}
+
+func isNumber(b byte) bool {
+	return ('0' <= b && b <= '9')
+}
+
+// consumeIdentifier consumes and returns an identifier at the beginning
+// of the given string, and the rest of the string.
+func consumeIdentifier(s string) (letters, rest string) {
+	i := 0
+	for ; i < len(s); i++ {
+		// must start with letter or hyphen or underscore
+		if i == 0 {
+			if !(isLetter(s[i]) || s[i] == '-' || s[i] == '_') {
+				break
+			}
+		} else {
+			if !(isLetter(s[i]) || isNumber(s[i]) || s[i] == '-' || s[i] == '_') {
+				break
+			}
+		}
+		// CSS doesn't allow identifiers to start with two hyphens or a hyphen
+		// followed by a digit, but we'll allow it.
+	}
+	return s[:i], s[i:]
+}
+
+func isAscii(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] > 127 {
+			return false
+		}
+	}
+	return true
+}
+
+// elementSelector matches a node that has the given element name.
+type elementSelector struct {
+	name string
+}
+
+func (s *elementSelector) match(n *html.Node) bool {
+	if n.Type != html.ElementNode {
+		return false
+	}
+	return n.Data == s.name
+}
+
+type attributeSelector struct {
+	attribute, value string
+}
+
+// attributeSelector matches a node with an attribute that has a given value.
+func (s *attributeSelector) match(n *html.Node) bool {
+	if n.Type != html.ElementNode {
+		return false
+	}
+	for _, attr := range n.Attr {
+		if attr.Key == s.attribute {
+			return attr.Val == s.value
+		}
+	}
+	return false
+}
+
+// idSelector matches an element that has the given id.
+type idSelector struct {
+	id string
+}
+
+func (s *idSelector) match(n *html.Node) bool {
+	if n.Type != html.ElementNode {
+		return false
+	}
+	for _, attr := range n.Attr {
+		if attr.Key == "id" {
+			return attr.Val == s.id
+		}
+	}
+	return false
+}
+
+// classSelector matches an element that has the given class set on it.
+type classSelector struct {
+	class string
+}
+
+func (s *classSelector) match(n *html.Node) bool {
+	if n.Type != html.ElementNode {
+		return false
+	}
+	for _, attr := range n.Attr {
+		if attr.Key == "class" {
+			for _, f := range strings.Fields(attr.Val) {
+				if f == s.class {
+					return true
+				}
+			}
+			break
+		}
+	}
+	return false
+}
diff --git a/internal/testing/htmlcheck/query_test.go b/internal/testing/htmlcheck/query_test.go
new file mode 100644
index 0000000..f128289
--- /dev/null
+++ b/internal/testing/htmlcheck/query_test.go
@@ -0,0 +1,185 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package htmlcheck
+
+import (
+	"bytes"
+	"errors"
+	"reflect"
+	"strings"
+	"testing"
+
+	"golang.org/x/net/html"
+)
+
+func TestParse(t *testing.T) {
+	var testCases = []struct {
+		text         string
+		wantSelector *selector
+		wantErr      error
+	}{
+		{
+			"a#id",
+			&selector{
+				atoms: []selectorAtom{&elementSelector{"a"}, &idSelector{"id"}},
+			},
+			nil,
+		},
+		{
+			"a",
+			&selector{atoms: []selectorAtom{&elementSelector{"a"}}},
+			nil,
+		},
+		{
+			"#id",
+			&selector{atoms: []selectorAtom{&idSelector{"id"}}},
+			nil,
+		},
+		{
+			".class a",
+			&selector{
+				atoms: []selectorAtom{&classSelector{"class"}},
+				next:  &selector{atoms: []selectorAtom{&elementSelector{"a"}}},
+			},
+			nil,
+		},
+		{
+			`[attribute-name="value"] a`,
+			&selector{
+				atoms: []selectorAtom{&attributeSelector{attribute: "attribute-name", value: "value"}},
+				next:  &selector{atoms: []selectorAtom{&elementSelector{"a"}}},
+			},
+			nil,
+		},
+		{
+			`a[attribute-name="value"] a`,
+			&selector{
+				atoms: []selectorAtom{&elementSelector{"a"}, &attributeSelector{attribute: "attribute-name", value: "value"}},
+				next:  &selector{atoms: []selectorAtom{&elementSelector{"a"}}},
+			},
+			nil,
+		},
+		{
+			`.class1.class2`,
+			&selector{
+				atoms: []selectorAtom{&classSelector{"class1"}, &classSelector{"class2"}},
+			},
+			nil,
+		},
+		{
+			`a.class1.class2`,
+			&selector{
+				atoms: []selectorAtom{&elementSelector{"a"}, &classSelector{"class1"}, &classSelector{"class2"}},
+			},
+			nil,
+		},
+		{
+			".",
+			nil,
+			errors.New("no class name after '.'"),
+		},
+		{
+			"#.",
+			nil,
+			errors.New("no id name after '#'"),
+		},
+		{
+			"[]",
+			nil,
+			errors.New("expected attribute name after '[' in attribute selector"),
+		},
+		{
+			"[attribute-name]",
+			nil,
+			errors.New("expected '=' after attribute name in attribute selector"),
+		},
+		{
+			"[attribute-name=]",
+			nil,
+			errors.New("expected '\"' after = in attribute selector"),
+		},
+		{
+			`[attribute-name=""]`,
+			&selector{atoms: []selectorAtom{&attributeSelector{attribute: "attribute-name", value: ""}}},
+			nil,
+		},
+		{
+			`[attribute-name="]`,
+			nil,
+			errors.New("expected '\"' after attribute value"),
+		},
+		{
+			`[attribute-name="value`,
+			nil,
+			errors.New("expected '\"' after attribute value"),
+		},
+		{
+			`[attribute-name="value"`,
+			nil,
+			errors.New("expected ']' at end of attribute selector"),
+		},
+	}
+	for _, tc := range testCases {
+		sel, err := parse(tc.text)
+		if tc.wantErr != nil {
+			if err == nil {
+				t.Fatalf("parse(%q): got nil err, want err %q", tc.text, tc.wantErr)
+			}
+			if tc.wantErr.Error() != err.Error() {
+				t.Fatalf("parse(%q): got err %q, want err %q", tc.text, err, tc.wantErr)
+			}
+		} else if err != nil {
+			t.Fatalf("parse(%q): got err %q, want nil error", tc.text, err)
+		}
+		if !reflect.DeepEqual(sel, tc.wantSelector) {
+			t.Fatalf("parse(%q): got %v; want %v", tc.text, sel, tc.wantSelector)
+		}
+	}
+}
+
+func TestQuery(t *testing.T) {
+	var testCases = []struct {
+		queriedText string
+		selector    string
+		want        string
+	}{
+		{"<a></a>", "a", "<a></a>"},
+		{`<a></a><a id="id">text</a>`, "a#id", `<a id="id">text</a>`},
+		{`<a></a><a class="class1"></a><a class="class2"></a><a class="class1 class2">text</a>`, ".class1.class2", `<a class="class1 class2">text</a>`},
+		{`<a></a><a class="class1">first</a><a class="class2"></a><a class="class1 class2">text</a>`, ".class1", `<a class="class1">first</a>`},
+		{`<div><div></div><div my-attr="my-val">text</div></div>`, `[my-attr="my-val"]`, `<div my-attr="my-val">text</div>`},
+		{`<div><div></div><div my-attr="my-val">text</div></div>`, `[myattr="my-val"]`, ""},
+		{`<html><head></head><body><div><div></div><div my-attr="my-val">text</div></div></body></html>`, ``, `<html><head></head><body><div><div></div><div my-attr="my-val">text</div></div></body></html>`},
+		{`<div></div><div><div>match me</div></div>`, "div div", `<div>match me</div>`},
+		{`<div></div><div><div>wrong</div></div><div id="wrong-id"><div class="my-class">also wrong</div></div><div id="my-id"><div class="wrong-class">still wrong</div></div><div id="my-id"><div class="my-class">match</div></div>`, "div#my-id div.my-class", `<div class="my-class">match</div>`},
+		{`<a></a><div class="UnitMeta-repo"><a href="foo" title="">link body</a></div>`, ".UnitMeta-repo a", `<a href="foo" title="">link body</a>`},
+		{`<ul class="UnitFiles-fileList"><li><a href="foo">a.go</a></li></ul>`, ".UnitFiles-fileList a", `<a href="foo">a.go</a>`},
+	}
+	for _, tc := range testCases {
+		n, err := html.Parse(strings.NewReader(tc.queriedText))
+		if err != nil {
+			t.Fatalf("parsing queried text %q: %v", tc.queriedText, err)
+		}
+		sel, err := parse(tc.selector)
+		if err != nil {
+			t.Fatalf("parsing selector %q: %v", tc.selector, err)
+		}
+		got := query(n, sel)
+		if got == nil {
+			if tc.want == "" {
+				continue
+			}
+			t.Fatalf("query(%q, %q): got nil; want %q", tc.queriedText, tc.selector, tc.want)
+		}
+		var buf bytes.Buffer
+		err = html.Render(&buf, got)
+		if err != nil {
+			t.Fatalf("rendering result of query: %v", err)
+		}
+		if buf.String() != tc.want {
+			t.Fatalf("query(%q, %q): got %q; want %q", tc.queriedText, tc.selector, buf.String(), tc.want)
+		}
+	}
+}