blob: 5f39c77a811a4a90613d1b50a32826405693781d [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crawl
import (
"context"
"errors"
"net/http"
"net/url"
"slices"
"strings"
"testing"
"golang.org/x/net/html"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
)
// TestCrawl runs the Crawler end to end against a canned HTTP transcript
// (testdata/godev.txt) and checks which URLs are visited, which have HTML
// recorded, which are skipped, and how the recrawl interval is honored.
func TestCrawl(t *testing.T) {
	lg := testutil.Slogger(t)
	db := storage.MemDB()
	// newCrawl builds a crawler over the shared db, so crawlers created
	// later in the test observe pages recorded by earlier runs.
	newCrawl := func(tc *http.Client) *Crawler {
		c := New(lg, db, tc)
		c.Allow(allow...)
		c.Deny(deny...)
		c.Clean(clean)
		c.Add("https://go.dev/")
		c.Add("https://go.dev/") // same URL added twice; presumably deduplicated — verify second Add is harmless
		return c
	}
	tc := readTestClient(t, "testdata/godev.txt")
	c := newCrawl(tc)
	// Add must panic on a URL containing a fragment; StopPanic swallows the
	// expected panic, so the t.Errorf below only runs if no panic happened.
	testutil.StopPanic(func() {
		c.Add("https://go.dev/#foo")
		t.Errorf("Add with URL fragment did not panic")
	})
	c.Run(context.Background())
	// needVisited pages must have been fetched, but with no HTML stored.
	for _, u := range needVisited {
		if p, ok := c.Get(u); !ok {
			t.Errorf("Crawl %s: should have visited, did not", u)
		} else if len(p.HTML) > 0 {
			t.Errorf("Crawl %s: should have visited but not recorded HTML; found HTML", u)
		}
	}
	// needHTML pages must have been fetched with their HTML stored.
	for _, u := range needHTML {
		if p, ok := c.Get(u); !ok {
			t.Errorf("Crawl %s: should have recorded HTML, did not visit", u)
		} else if len(p.HTML) == 0 {
			t.Errorf("Crawl %s: should have recorded HTML, visited but no HTML", u)
		}
	}
	// needSkipped pages must be entirely absent from the crawl database.
	for _, u := range needSkipped {
		if _, ok := c.Get(u); ok {
			t.Errorf("Crawl %s: should have skipped, found queued page", u)
		}
	}
	// Check for various errors: every recorded /err/ page must carry a
	// non-empty Error describing what went wrong.
	for p := range c.PageWatcher("test1").Recent() {
		if strings.Contains(p.URL, "/err/") && p.Error == "" {
			t.Errorf("crawl %s: no error", p.URL)
		}
	}
	// Check that default recrawl does not recrawl: only the not-yet-seen
	// /root2 may reach the transport; any other request is premature.
	didRoot2 := false
	c = newCrawl(&http.Client{
		Transport: transportFunc(func(req *http.Request) (*http.Response, error) {
			if req.URL.Path == "/root2" {
				didRoot2 = true
				return tc.Transport.RoundTrip(req)
			}
			t.Fatalf("crawler recrawled too soon")
			panic("unreachable") // t.Fatalf stops the goroutine; satisfies the return type
		}),
	})
	c.Add("https://go.dev/root2") // not seen yet
	c.Run(context.Background())
	if !didRoot2 {
		t.Errorf("did not crawl /root2")
	}
	// Check that Recrawl(0) does recrawl, but also terminates.
	n := 0
	c = newCrawl(&http.Client{
		Transport: transportFunc(func(req *http.Request) (*http.Response, error) {
			n++
			return tc.Transport.RoundTrip(req)
		}),
	})
	c.SetRecrawl(0)
	c.Run(context.Background())
	if n < 3 {
		t.Fatalf("Run after Recrawl(0) crawled %d pages, want ≥ 3", n)
	}
}
// allow is the crawl allowlist: only URLs under these prefixes are
// eligible for crawling.
var allow = []string{
	"https://go.dev/",
}

// deny lists URL prefixes that must not be crawled even though they fall
// under the allowlist.
var deny = []string{
	"https://go.dev/api/",
	"https://go.dev/change/",
	"https://go.dev/cl/",
	"https://go.dev/design/",
	"https://go.dev/dl/",
	"https://go.dev/issue/",
	"https://go.dev/lib/",
	"https://go.dev/misc/",
	"https://go.dev/play",
	"https://go.dev/s/",
	"https://go.dev/src/",
	"https://go.dev/test/",
}

// needVisited lists pages TestCrawl expects the crawler to fetch without
// recording HTML (the /err/ pages exercise various failure paths).
var needVisited = []string{
	"https://go.dev/doc/faq/",
	"https://go.dev/err/bad-status",
	"https://go.dev/err/bad-content-type",
	"https://go.dev/err/redirect-no-location",
	"https://go.dev/err/redirect-bad-url",
	"https://go.dev/err/body-too-large",
	"https://go.dev/err/body-read-error",
}

// needHTML lists pages TestCrawl expects to be fetched with their HTML
// recorded.
var needHTML = []string{
	"https://go.dev/",
	"https://go.dev/doc/faq",
	"https://go.dev/pkg/math/?m=old",
	"https://go.dev/pkg/strings/?m=old",
	"https://go.dev/player/okay",
}

// needSkipped lists pages TestCrawl expects to be absent from the crawl
// database entirely (denied, off-site, rewritten by clean, or never added).
var needSkipped = []string{
	"https://go.dev/play/p/asdf",
	"https://go.dev/s/short",
	"https://www.google.com/",
	"https://go.dev/root2",
	"https://go.dev/err/clean-error",
	"https://go.dev/err/disallow-after-clean",
}
func clean(u *url.URL) error {
if u.Host == "go.dev" {
u.RawQuery = ""
u.ForceQuery = false
if strings.HasPrefix(u.Path, "/pkg") || strings.HasPrefix(u.Path, "/cmd") {
u.RawQuery = "m=old"
}
}
if strings.HasPrefix(u.Path, "/err/disallow-after-clean") {
u.Path = "/s/disallow"
}
if strings.HasPrefix(u.Path, "/err/clean-error") {
return errors.New("clean error!")
}
return nil
}
// TestLinks verifies that links yields each hyperlink of a parsed document,
// resolved against the base URL and in document order, and that abandoning
// the iterator early (breaking out of the range) works correctly.
func TestLinks(t *testing.T) {
	check := testutil.Checker(t)
	base, err := url.Parse("https://go.dev/")
	check(err)
	doc, err := html.Parse(strings.NewReader(`
<a href="a1"></a><a href="https://www.google.com/a2"></a><a href="a3"></a><a href="a4"></a>
`))
	check(err)
	want := []string{
		"https://go.dev/a1",
		"https://www.google.com/a2",
		"https://go.dev/a3",
		"https://go.dev/a4",
	}
	// collect drains the links iterator, stopping after limit results when
	// limit > 0; limit <= 0 means iterate to exhaustion.
	collect := func(limit int) []string {
		var got []string
		for link := range links(testutil.Slogger(t), base, doc) {
			got = append(got, link.String())
			if limit > 0 && len(got) == limit {
				break
			}
		}
		return got
	}
	if have := collect(0); !slices.Equal(have, want) {
		t.Errorf("links:\nhave %q\nwant %q", have, want)
	}
	want = want[:2]
	if have := collect(2); !slices.Equal(have, want) {
		t.Errorf("links with early break:\nhave %q\nwant %q", have, want)
	}
}