html: add Node.{Ancestors,ChildNodes,Descendants}()
Adds iterators for the parents, immediate children, and all children of a Node respectively.
Fixes golang/go#62113
Change-Id: Iab015872cc3a20fe5e7cae3bc90b89cba68cc3f8
GitHub-Last-Rev: d99de580ab8c30bb04c9fee401d6bfc5dd55d1ec
GitHub-Pull-Request: golang/net#215
Reviewed-on: https://go-review.googlesource.com/c/net/+/594195
Reviewed-by: Ian Lance Taylor <iant@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
Reviewed-by: Damien Neil <dneil@google.com>
diff --git a/html/doc.go b/html/doc.go
index 3a7e5ab..885c4c5 100644
--- a/html/doc.go
+++ b/html/doc.go
@@ -78,16 +78,11 @@
if err != nil {
// ...
}
- var f func(*html.Node)
- f = func(n *html.Node) {
+ for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.Data == "a" {
// Do something with n...
}
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- f(c)
- }
}
- f(doc)
The relevant specifications include:
https://html.spec.whatwg.org/multipage/syntax.html and
diff --git a/html/example_test.go b/html/example_test.go
index 0b06ed7..830f0b2 100644
--- a/html/example_test.go
+++ b/html/example_test.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build go1.23
+
// This example demonstrates parsing HTML data and walking the resulting tree.
package html_test
@@ -11,6 +13,7 @@
"strings"
"golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
)
func ExampleParse() {
@@ -19,9 +22,8 @@
if err != nil {
log.Fatal(err)
}
- var f func(*html.Node)
- f = func(n *html.Node) {
- if n.Type == html.ElementNode && n.Data == "a" {
+ for n := range doc.Descendants() {
+ if n.Type == html.ElementNode && n.DataAtom == atom.A {
for _, a := range n.Attr {
if a.Key == "href" {
fmt.Println(a.Val)
@@ -29,11 +31,8 @@
}
}
}
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- f(c)
- }
}
- f(doc)
+
// Output:
// foo
// /bar/baz
diff --git a/html/iter.go b/html/iter.go
new file mode 100644
index 0000000..54be8fd
--- /dev/null
+++ b/html/iter.go
@@ -0,0 +1,56 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.23
+
+package html
+
+import "iter"
+
+// Ancestors returns an iterator over the ancestors of n, starting with n.Parent.
+//
+// Mutating a Node or its parents while iterating may have unexpected results.
+func (n *Node) Ancestors() iter.Seq[*Node] {
+ _ = n.Parent // eager nil check
+
+ return func(yield func(*Node) bool) {
+ for p := n.Parent; p != nil && yield(p); p = p.Parent {
+ }
+ }
+}
+
+// ChildNodes returns an iterator over the immediate children of n,
+// starting with n.FirstChild.
+//
+// Mutating a Node or its children while iterating may have unexpected results.
+func (n *Node) ChildNodes() iter.Seq[*Node] {
+ _ = n.FirstChild // eager nil check
+
+ return func(yield func(*Node) bool) {
+ for c := n.FirstChild; c != nil && yield(c); c = c.NextSibling {
+ }
+ }
+
+}
+
+// Descendants returns an iterator over all nodes recursively beneath
+// n, excluding n itself. Nodes are visited in depth-first preorder.
+//
+// Mutating a Node or its descendants while iterating may have unexpected results.
+func (n *Node) Descendants() iter.Seq[*Node] {
+ _ = n.FirstChild // eager nil check
+
+ return func(yield func(*Node) bool) {
+ n.descendants(yield)
+ }
+}
+
+func (n *Node) descendants(yield func(*Node) bool) bool {
+ for c := range n.ChildNodes() {
+ if !yield(c) || !c.descendants(yield) {
+ return false
+ }
+ }
+ return true
+}
diff --git a/html/iter_test.go b/html/iter_test.go
new file mode 100644
index 0000000..cca7f82
--- /dev/null
+++ b/html/iter_test.go
@@ -0,0 +1,96 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build go1.23
+
+package html
+
+import (
+ "strings"
+ "testing"
+)
+
+func TestNode_ChildNodes(t *testing.T) {
+ tests := []struct {
+ in string
+ want string
+ }{
+ {"", ""},
+ {"<a></a>", "a"},
+ {"a", "a"},
+ {"<a></a><!--b-->", "a b"},
+ {"a<b></b>c", "a b c"},
+ {"a<b><!--c--></b>d", "a b d"},
+ {"<a><b>c<!--d-->e</b></a>f<!--g--><h>i</h>", "a f g h"},
+ }
+ for _, test := range tests {
+ doc, err := Parse(strings.NewReader(test.in))
+ if err != nil {
+ t.Fatal(err)
+ }
+ // Drill to <html><head></head><body>
+ n := doc.FirstChild.FirstChild.NextSibling
+ var results []string
+ for c := range n.ChildNodes() {
+ results = append(results, c.Data)
+ }
+ if got := strings.Join(results, " "); got != test.want {
+ t.Errorf("ChildNodes = %q, want %q", got, test.want)
+ }
+ }
+}
+
+func TestNode_Descendants(t *testing.T) {
+ tests := []struct {
+ in string
+ want string
+ }{
+ {"", ""},
+ {"<a></a>", "a"},
+ {"<a><b></b></a>", "a b"},
+ {"<a>b</a>", "a b"},
+ {"<a><!--b--></a>", "a b"},
+ {"<a>b<c></c>d</a>", "a b c d"},
+ {"<a>b<c><!--d--></c>e</a>", "a b c d e"},
+ {"<a><b><c>d<!--e-->f</c></b>g<!--h--><i>j</i></a>", "a b c d e f g h i j"},
+ }
+ for _, test := range tests {
+ doc, err := Parse(strings.NewReader(test.in))
+ if err != nil {
+ t.Fatal(err)
+ }
+ // Drill to <html><head></head><body>
+ n := doc.FirstChild.FirstChild.NextSibling
+ var results []string
+ for c := range n.Descendants() {
+ results = append(results, c.Data)
+ }
+ if got := strings.Join(results, " "); got != test.want {
+ t.Errorf("Descendants = %q; want: %q", got, test.want)
+ }
+ }
+}
+
+func TestNode_Ancestors(t *testing.T) {
+ for _, size := range []int{0, 1, 2, 10, 100, 10_000} {
+ n := buildChain(size)
+ nParents := 0
+ for _ = range n.Ancestors() {
+ nParents++
+ }
+ if nParents != size {
+ t.Errorf("number of Ancestors = %d; want: %d", nParents, size)
+ }
+ }
+}
+
+func buildChain(size int) *Node {
+ child := new(Node)
+ for range size {
+ parent := child
+ child = new(Node)
+ parent.AppendChild(child)
+ }
+ return child
+}
diff --git a/html/node.go b/html/node.go
index 1350eef..77741a1 100644
--- a/html/node.go
+++ b/html/node.go
@@ -38,6 +38,10 @@
// that it looks like "a<b" rather than "a<b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
//
+// Node trees may be navigated using the link fields (Parent,
+// FirstChild, and so on) or a range loop over iterators such as
+// [Node.Descendants].
+//
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
// "svg" is short for "http://www.w3.org/2000/svg".